435 files changed, 12912 insertions, 8458 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index bed48fa96521..9ac4ffe9ac7d 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -29,10 +29,10 @@
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/pagemap.h>
 #include <linux/idr.h>
+#include <linux/sched.h>
 
 #include "debug.h"
 #include "v9fs.h"
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index ddffd8aa902d..d93960429c09 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -30,10 +30,10 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/namei.h>
 #include <linux/idr.h>
+#include <linux/sched.h>
 
 #include "debug.h"
 #include "v9fs.h"
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 3129688143ea..1dd86ee90bc5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -29,7 +29,6 @@
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c7b677253843..6e7678e4852f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -30,7 +30,6 @@
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/list.h>
 #include <asm/uaccess.h>
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b01b0a457932..c76cd8fa3f6c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -30,10 +30,10 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/namei.h>
 #include <linux/idr.h>
+#include <linux/sched.h>
 
 #include "debug.h"
 #include "v9fs.h"
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 0ec42f665457..7bdf8b326841 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -31,12 +31,12 @@
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/idr.h>
+#include <linux/sched.h>
 
 #include "debug.h"
 #include "v9fs.h"
diff --git a/fs/Kconfig b/fs/Kconfig
index e33c08924572..0fa0c1193e81 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -314,7 +314,7 @@ config REISERFS_CHECK
 
 config REISERFS_PROC_INFO
 	bool "Stats in /proc/fs/reiserfs"
-	depends on REISERFS_FS
+	depends on REISERFS_FS && PROC_FS
 	help
 	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
 	  various ReiserFS statistics and internal data at the expense of
@@ -724,10 +724,6 @@ config FAT_FS
 	  file system and use GNU tar's M option. GNU tar is a program
 	  available for Unix and DOS ("man tar" or "info tar").
 
-	  It is now also becoming possible to read and write compressed FAT
-	  file systems; read <file:Documentation/filesystems/fat_cvf.txt> for
-	  details.
-
 	  The FAT support will enlarge your kernel by about 37 KB. If unsure,
 	  say Y.
 
@@ -1734,6 +1730,18 @@ config SUNRPC
 config SUNRPC_GSS
 	tristate
 
+config SUNRPC_BIND34
+	bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	help
+	  Provides kernel support for querying rpcbind servers via versions 3
+	  and 4 of the rpcbind protocol.  The kernel automatically falls back
+	  to version 2 if a remote rpcbind service does not support versions
+	  3 or 4.
+
+	  If unsure, say N to get traditional behavior (version 2 rpcbind
+	  requests only).
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f3d3d81eb7e9..d4fc6095466d 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -26,7 +26,7 @@ config BINFMT_ELF
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
-	depends on FRV
+	depends on (FRV || BLACKFIN)
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
@@ -38,7 +38,7 @@ config BINFMT_ELF_FDPIC
 
 config BINFMT_FLAT
 	tristate "Kernel support for flat binaries"
-	depends on !MMU || SUPERH
+	depends on !MMU
 	help
 	  Support uClinux FLAT format binaries.
 
diff --git a/fs/Makefile b/fs/Makefile
index 9edf4112bee0..720c29d57a62 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -22,6 +22,10 @@ endif
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
+obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
+obj-$(CONFIG_SIGNALFD)		+= signalfd.o
+obj-$(CONFIG_TIMERFD)		+= timerfd.o
+obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2e5f2c8371ee..de2ed5ca3351 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -232,9 +232,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 4aa8079e71be..c8796906f584 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,11 +628,7 @@ static int affs_prepare_write_ofs(struct file *file, struct page *page, unsigned
 			return err;
 	}
 	if (to < PAGE_CACHE_SIZE) {
-		char *kaddr = kmap_atomic(page, KM_USER0);
-
-		memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, to, PAGE_CACHE_SIZE - to, KM_USER0);
 		if (size > offset + to) {
 			if (size < offset + PAGE_CACHE_SIZE)
 				tmp = size & ~PAGE_CACHE_MASK;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c5b9d73c084a..4609a6c13fe9 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -9,7 +9,7 @@
  *
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
-
+#include <linux/sched.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c3986a1911b0..6d0ebc321530 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -15,6 +15,7 @@
 #include <linux/statfs.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/sched.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
@@ -87,12 +88,9 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		init_MUTEX(&ei->i_link_lock);
-		init_MUTEX(&ei->i_ext_lock);
-		inode_init_once(&ei->vfs_inode);
-	}
+	init_MUTEX(&ei->i_link_lock);
+	init_MUTEX(&ei->i_ext_lock);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int init_inodecache(void)
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 01545eb1d872..73ce561f3ea0 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -18,10 +18,11 @@ kafs-objs := \
 	security.o \
 	server.o \
 	super.o \
-	use-rtnetlink.o \
+	netdevices.o \
 	vlclient.o \
 	vlocation.o \
 	vnode.o \
-	volume.o
+	volume.o \
+	write.o
 
 obj-$(CONFIG_AFS_FS)  := kafs.o
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 52d0752265b8..245257948140 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -16,6 +16,9 @@
 
 #define AFS_MAXCELLNAME	64		/* maximum length of a cell name */
 #define AFS_MAXVOLNAME	64		/* maximum length of a volume name */
+#define AFSNAMEMAX	256		/* maximum length of a filename plus NUL */
+#define AFSPATHMAX	1024		/* maximum length of a pathname plus NUL */
+#define AFSOPAQUEMAX	1024		/* maximum length of an opaque field */
 
 typedef unsigned			afs_volid_t;
 typedef unsigned			afs_vnodeid_t;
@@ -143,4 +146,24 @@ struct afs_volsync {
 	time_t			creation;	/* volume creation time */
 };
 
+/*
+ * AFS volume status record
+ */
+struct afs_volume_status {
+	u32			vid;		/* volume ID */
+	u32			parent_id;	/* parent volume ID */
+	u8			online;		/* true if volume currently online and available */
+	u8			in_service;	/* true if volume currently in service */
+	u8			blessed;	/* same as in_service */
+	u8			needs_salvage;	/* true if consistency checking required */
+	u32			type;		/* volume type (afs_voltype_t) */
+	u32			min_quota;	/* minimum space set aside (blocks) */
+	u32			max_quota;	/* maximum space this volume may occupy (blocks) */
+	u32			blocks_in_use;	/* space this volume currently occupies (blocks) */
+	u32			part_blocks_avail; /* space available in volume's partition */
+	u32			part_max_blocks; /* size of volume's partition */
+};
+
+#define AFS_BLOCK_SIZE	1024
+
 #endif /* AFS_H */
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index 89e0d1650a72..a18c374ebe08 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -18,6 +18,8 @@
 enum AFS_FS_Operations {
 	FSFETCHDATA		= 130,	/* AFS Fetch file data */
 	FSFETCHSTATUS		= 132,	/* AFS Fetch file status */
+	FSSTOREDATA		= 133,	/* AFS Store file data */
+	FSSTORESTATUS		= 135,	/* AFS Store file status */
 	FSREMOVEFILE		= 136,	/* AFS Remove a file */
 	FSCREATEFILE		= 137,	/* AFS Create a file */
 	FSRENAME		= 138,	/* AFS Rename or move a file or directory */
@@ -26,9 +28,12 @@ enum AFS_FS_Operations {
 	FSMAKEDIR		= 141,	/* AFS Create a directory */
 	FSREMOVEDIR		= 142,	/* AFS Remove a directory */
 	FSGIVEUPCALLBACKS	= 147,	/* AFS Discard callback promises */
-	FSGETVOLUMEINFO		= 148,	/* AFS Get root volume information */
+	FSGETVOLUMEINFO		= 148,	/* AFS Get information about a volume */
+	FSGETVOLUMESTATUS	= 149,	/* AFS Get volume status information */
 	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
+	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
+	FSSTOREDATA64		= 65538, /* AFS Store file data */
 };
 
 enum AFS_FS_Errors {
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 639399f0ab6f..bacf518c6fa8 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/circ_buf.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 unsigned afs_vnode_update_timeout = 10;
@@ -44,7 +45,7 @@ void afs_init_callback_state(struct afs_server *server)
 	while (!RB_EMPTY_ROOT(&server->cb_promises)) {
 		vnode = rb_entry(server->cb_promises.rb_node,
 				 struct afs_vnode, cb_promise);
-		_debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+		_debug("UNPROMISE { vid=%x:%u uq=%u}",
 		       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 		rb_erase(&vnode->cb_promise, &server->cb_promises);
 		vnode->cb_promised = false;
@@ -84,11 +85,8 @@ void afs_broken_callback_work(struct work_struct *work)
 
 		/* if the vnode's data version number changed then its contents
 		 * are different */
-		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
-			_debug("zap data {%x:%u}",
-			       vnode->fid.vid, vnode->fid.vnode);
-			invalidate_remote_inode(&vnode->vfs_inode);
-		}
+		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+			afs_zap_data(vnode);
 	}
 
 out:
@@ -468,7 +466,7 @@ int __init afs_callback_update_init(void)
 /*
  * shut down the callback update process
  */
-void __exit afs_callback_update_kill(void)
+void afs_callback_update_kill(void)
 {
 	destroy_workqueue(afs_callback_update_worker);
 }
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 9b1311a1df51..175a567db78c 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/key.h>
 #include <linux/ctype.h>
+#include <linux/sched.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 6685f4cbccb3..d5b2ad6575bc 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -443,6 +443,7 @@ static void SRXAFSCB_GetCapabilities(struct work_struct *work)
 			reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
 			reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
 		}
+		kfree(ifs);
 	}
 
 	reply.cap.capcount = htonl(1);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index dac5b990c0cd..546c59522eb1 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -16,6 +16,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -55,7 +56,8 @@ const struct inode_operations afs_dir_inode_operations = {
 	.rmdir		= afs_rmdir,
 	.rename		= afs_rename,
 	.permission	= afs_permission,
-	.getattr	= afs_inode_getattr,
+	.getattr	= afs_getattr,
+	.setattr	= afs_setattr,
 };
 
 static struct dentry_operations afs_fs_dentry_operations = {
@@ -194,10 +196,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 
 	page = read_mapping_page(dir->i_mapping, index, &file);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			afs_dir_check_page(dir, page);
 		if (PageError(page))
@@ -494,12 +493,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 
 	vnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},%p{%s},",
+	_enter("{%x:%u},%p{%s},",
 	       vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
 
 	ASSERTCMP(dentry->d_inode, ==, NULL);
 
-	if (dentry->d_name.len > 255) {
+	if (dentry->d_name.len >= AFSNAMEMAX) {
 		_leave(" = -ENAMETOOLONG");
 		return ERR_PTR(-ENAMETOOLONG);
 	}
@@ -734,11 +733,11 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},{%s},%o",
+	_enter("{%x:%u},{%s},%o",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -799,11 +798,11 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},{%s}",
+	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -845,11 +844,11 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},{%s}",
+	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -919,11 +918,11 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
 
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},{%s},%o,",
+	_enter("{%x:%u},{%s},%o,",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -986,13 +985,13 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	vnode = AFS_FS_I(from->d_inode);
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},{%x:%d},{%s}",
+	_enter("{%x:%u},{%x:%u},{%s}",
 	       vnode->fid.vid, vnode->fid.vnode,
 	       dvnode->fid.vid, dvnode->fid.vnode,
 	       dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -1035,16 +1034,16 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%d},{%s},%s",
+	_enter("{%x:%u},{%s},%s",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
 	       content);
 
 	ret = -ENAMETOOLONG;
-	if (dentry->d_name.len > 255)
+	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	ret = -EINVAL;
-	if (strlen(content) > 1023)
+	if (strlen(content) >= AFSPATHMAX)
 		goto error;
 
 	key = afs_request_key(dvnode->volume->cell);
@@ -1107,14 +1106,14 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
 
-	_enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
+	_enter("{%x:%u},{%x:%u},{%x:%u},{%s}",
 	       orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
 	       vnode->fid.vid, vnode->fid.vnode,
 	       new_dvnode->fid.vid, new_dvnode->fid.vnode,
 	       new_dentry->d_name.name);
 
 	ret = -ENAMETOOLONG;
-	if (new_dentry->d_name.len > 255)
+	if (new_dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
 
 	key = afs_request_key(orig_dvnode->volume->cell);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index ae256498f4f7..9c0e721d9fc2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -15,32 +15,43 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/writeback.h>
 #include "internal.h"
 
-static int afs_file_readpage(struct file *file, struct page *page);
-static void afs_file_invalidatepage(struct page *page, unsigned long offset);
-static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
+static int afs_readpage(struct file *file, struct page *page);
+static void afs_invalidatepage(struct page *page, unsigned long offset);
+static int afs_releasepage(struct page *page, gfp_t gfp_flags);
+static int afs_launder_page(struct page *page);
 
 const struct file_operations afs_file_operations = {
 	.open		= afs_open,
 	.release	= afs_release,
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
+	.aio_write	= afs_file_write,
 	.mmap		= generic_file_readonly_mmap,
 	.sendfile	= generic_file_sendfile,
+	.fsync		= afs_fsync,
 };
 
 const struct inode_operations afs_file_inode_operations = {
-	.getattr	= afs_inode_getattr,
+	.getattr	= afs_getattr,
+	.setattr	= afs_setattr,
 	.permission	= afs_permission,
 };
 
 const struct address_space_operations afs_fs_aops = {
-	.readpage	= afs_file_readpage,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
-	.releasepage	= afs_file_releasepage,
-	.invalidatepage	= afs_file_invalidatepage,
+	.readpage	= afs_readpage,
+	.set_page_dirty	= afs_set_page_dirty,
+	.launder_page	= afs_launder_page,
+	.releasepage	= afs_releasepage,
+	.invalidatepage	= afs_invalidatepage,
+	.prepare_write	= afs_prepare_write,
+	.commit_write	= afs_commit_write,
+	.writepage	= afs_writepage,
+	.writepages	= afs_writepages,
 };
 
 /*
@@ -52,7 +63,7 @@ int afs_open(struct inode *inode, struct file *file)
 	struct key *key;
 	int ret;
 
-	_enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
 
 	key = afs_request_key(vnode->volume->cell);
 	if (IS_ERR(key)) {
@@ -78,7 +89,7 @@ int afs_release(struct inode *inode, struct file *file)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	_enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
 
 	key_put(file->private_data);
 	_leave(" = 0");
@@ -89,10 +100,10 @@ int afs_release(struct inode *inode, struct file *file)
  * deal with notification that a page was read from the cache
  */
 #ifdef AFS_CACHING_SUPPORT
-static void afs_file_readpage_read_complete(void *cookie_data,
-					    struct page *page,
-					    void *data,
-					    int error)
+static void afs_readpage_read_complete(void *cookie_data,
+				       struct page *page,
+				       void *data,
+				       int error)
 {
 	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
 
@@ -109,10 +120,10 @@ static void afs_file_readpage_read_complete(void *cookie_data,
  * deal with notification that a page was written to the cache
  */
 #ifdef AFS_CACHING_SUPPORT
-static void afs_file_readpage_write_complete(void *cookie_data,
-					     struct page *page,
-					     void *data,
-					     int error)
+static void afs_readpage_write_complete(void *cookie_data,
+					struct page *page,
+					void *data,
+					int error)
 {
 	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
 
@@ -121,9 +132,9 @@ static void afs_file_readpage_write_complete(void *cookie_data,
 #endif
 
 /*
- * AFS read page from file (or symlink)
+ * AFS read page from file, directory or symlink
  */
-static int afs_file_readpage(struct file *file, struct page *page)
+static int afs_readpage(struct file *file, struct page *page)
 {
 	struct afs_vnode *vnode;
 	struct inode *inode;
@@ -219,26 +230,9 @@ error:
 }
 
 /*
- * get a page cookie for the specified page
- */
-#ifdef AFS_CACHING_SUPPORT
-int afs_cache_get_page_cookie(struct page *page,
-			      struct cachefs_page **_page_cookie)
-{
-	int ret;
-
-	_enter("");
-	ret = cachefs_page_get_private(page,_page_cookie, GFP_NOIO);
-
-	_leave(" = %d", ret);
-	return ret;
-}
-#endif
-
-/*
  * invalidate part or all of a page
  */
-static void afs_file_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned long offset)
 {
 	int ret = 1;
 
@@ -247,11 +241,6 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
 	BUG_ON(!PageLocked(page));
 
 	if (PagePrivate(page)) {
-#ifdef AFS_CACHING_SUPPORT
-		struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-		cachefs_uncache_page(vnode->cache,page);
-#endif
-
 		/* We release buffers only if the entire page is being
 		 * invalidated.
 		 * The get_block cached value has been unconditionally
@@ -272,25 +261,33 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
 }
 
 /*
+ * write back a dirty page
+ */
+static int afs_launder_page(struct page *page)
+{
+	_enter("{%lu}", page->index);
+
+	return 0;
+}
+
+/*
  * release a page and cleanup its private data
  */
-static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
+static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
-	struct cachefs_page *pageio;
+	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+	struct afs_writeback *wb;
 
-	_enter("{%lu},%x", page->index, gfp_flags);
+	_enter("{{%x:%u}[%lu],%lx},%x",
+	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
+	       gfp_flags);
 
 	if (PagePrivate(page)) {
-#ifdef AFS_CACHING_SUPPORT
-		struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-		cachefs_uncache_page(vnode->cache, page);
-#endif
-
-		pageio = (struct cachefs_page *) page_private(page);
+		wb = (struct afs_writeback *) page_private(page);
+		ASSERT(wb != NULL);
 		set_page_private(page, 0);
 		ClearPagePrivate(page);
-
-		kfree(pageio);
+		afs_put_writeback(wb);
 	}
 
 	_leave(" = 0");
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 2393d2a08d79..5dff1308b6f0 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -33,8 +33,10 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
  */
 static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 				      struct afs_file_status *status,
-				      struct afs_vnode *vnode)
+				      struct afs_vnode *vnode,
+				      afs_dataversion_t *store_version)
 {
+	afs_dataversion_t expected_version;
 	const __be32 *bp = *_bp;
 	umode_t mode;
 	u64 data_version, size;
@@ -101,7 +103,11 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 		vnode->vfs_inode.i_atime	= vnode->vfs_inode.i_ctime;
 	}
 
-	if (status->data_version != data_version) {
+	expected_version = status->data_version;
+	if (store_version)
+		expected_version = *store_version;
+
+	if (expected_version != data_version) {
 		status->data_version = data_version;
 		if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
 			_debug("vnode modified %llx on {%x:%u}",
@@ -110,6 +116,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 		}
+	} else if (store_version) {
+		status->data_version = data_version;
 	}
 }
 
@@ -156,6 +164,67 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp,
 }
 
 /*
+ * encode the requested attributes into an AFSStoreStatus block
+ */
+static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
+{
+	__be32 *bp = *_bp;
+	u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0;
+
+	mask = 0;
+	if (attr->ia_valid & ATTR_MTIME) {
+		mask |= AFS_SET_MTIME;
+		mtime = attr->ia_mtime.tv_sec;
+	}
+
+	if (attr->ia_valid & ATTR_UID) {
+		mask |= AFS_SET_OWNER;
+		owner = attr->ia_uid;
+	}
+
+	if (attr->ia_valid & ATTR_GID) {
+		mask |= AFS_SET_GROUP;
+		group = attr->ia_gid;
+	}
+
+	if (attr->ia_valid & ATTR_MODE) {
+		mask |= AFS_SET_MODE;
+		mode = attr->ia_mode & S_IALLUGO;
+	}
+
+	*bp++ = htonl(mask);
+	*bp++ = htonl(mtime);
+	*bp++ = htonl(owner);
+	*bp++ = htonl(group);
+	*bp++ = htonl(mode);
+	*bp++ = 0;		/* segment size */
+	*_bp = bp;
+}
+
+/*
+ * decode an AFSFetchVolumeStatus block
+ */
+static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
+					    struct afs_volume_status *vs)
+{
+	const __be32 *bp = *_bp;
+
+	vs->vid			= ntohl(*bp++);
+	vs->parent_id		= ntohl(*bp++);
+	vs->online		= ntohl(*bp++);
+	vs->in_service		= ntohl(*bp++);
+	vs->blessed		= ntohl(*bp++);
+	vs->needs_salvage	= ntohl(*bp++);
+	vs->type		= ntohl(*bp++);
+	vs->min_quota		= ntohl(*bp++);
+	vs->max_quota		= ntohl(*bp++);
+	vs->blocks_in_use	= ntohl(*bp++);
+	vs->part_blocks_avail	= ntohl(*bp++);
+	vs->part_max_blocks	= ntohl(*bp++);
+	*_bp = bp;
+}
+
+/*
  * deliver reply data to an FS.FetchStatus
  */
 static int afs_deliver_fs_fetch_status(struct afs_call *call,
@@ -175,7 +244,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 	xdr_decode_AFSCallBack(&bp, vnode);
 	if (call->reply2)
 		xdr_decode_AFSVolSync(&bp, call->reply2);
@@ -206,7 +275,7 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 	struct afs_call *call;
 	__be32 *bp;
 
-	_enter(",%x,{%x:%d},,",
+	_enter(",%x,{%x:%u},,",
 	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
 
 	call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
@@ -247,10 +316,16 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 	case 0:
 		call->offset = 0;
 		call->unmarshall++;
+		if (call->operation_ID != FSFETCHDATA64) {
+			call->unmarshall++;
+			goto no_msw;
+		}
 
-		/* extract the returned data length */
+		/* extract the upper part of the returned data length of an
+		 * FSFETCHDATA64 op (which should always be 0 using this
+		 * client) */
 	case 1:
-		_debug("extract data length");
+		_debug("extract data length (MSW)");
 		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
 		switch (ret) {
 		case 0:		break;
@@ -259,37 +334,51 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		}
 
 		call->count = ntohl(call->tmp);
-		_debug("DATA length: %u", call->count);
-		if (call->count > PAGE_SIZE)
+		_debug("DATA length MSW: %u", call->count);
+		if (call->count > 0)
 			return -EBADMSG;
 		call->offset = 0;
 		call->unmarshall++;
 
-		if (call->count < PAGE_SIZE) {
-			buffer = kmap_atomic(call->reply3, KM_USER0);
-			memset(buffer + PAGE_SIZE - call->count, 0,
-			       call->count);
-			kunmap_atomic(buffer, KM_USER0);
-		}
-
-		/* extract the returned data */
+	no_msw:
+		/* extract the returned data length */
 	case 2:
-		_debug("extract data");
-		page = call->reply3;
-		buffer = kmap_atomic(page, KM_USER0);
-		ret = afs_extract_data(call, skb, last, buffer, call->count);
-		kunmap_atomic(buffer, KM_USER0);
+		_debug("extract data length");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
 		switch (ret) {
 		case 0:		break;
 		case -EAGAIN:	return 0;
 		default:	return ret;
 		}
 
+		call->count = ntohl(call->tmp);
+		_debug("DATA length: %u", call->count);
+		if (call->count > PAGE_SIZE)
+			return -EBADMSG;
 		call->offset = 0;
 		call->unmarshall++;
 
-		/* extract the metadata */
+		/* extract the returned data */
 	case 3:
+		_debug("extract data");
+		if (call->count > 0) {
+			page = call->reply3;
+			buffer = kmap_atomic(page, KM_USER0);
+			ret = afs_extract_data(call, skb, last, buffer,
+					       call->count);
+			kunmap_atomic(buffer, KM_USER0);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the metadata */
+	case 4:
 		ret = afs_extract_data(call, skb, last, call->buffer,
 				       (21 + 3 + 6) * 4);
 		switch (ret) {
@@ -299,7 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		}
 
 		bp = call->buffer;
-		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 		xdr_decode_AFSCallBack(&bp, vnode);
 		if (call->reply2)
 			xdr_decode_AFSVolSync(&bp, call->reply2);
@@ -307,7 +396,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 		call->offset = 0;
 		call->unmarshall++;
 
-	case 4:
+	case 5:
 		_debug("trailer");
 		if (skb->len != 0)
 			return -EBADMSG;
@@ -317,6 +406,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
 	if (!last)
 		return 0;
 
+	if (call->count < PAGE_SIZE) {
+		_debug("clear");
+		page = call->reply3;
+		buffer = kmap_atomic(page, KM_USER0);
+		memset(buffer + call->count, 0, PAGE_SIZE - call->count);
+		kunmap_atomic(buffer, KM_USER0);
+	}
+
 	_leave(" = 0 [done]");
 	return 0;
 }
@@ -331,6 +428,56 @@ static const struct afs_call_type afs_RXFSFetchData = {
 	.destructor	= afs_flat_call_destructor,
 };
 
+static const struct afs_call_type afs_RXFSFetchData64 = {
+	.name		= "FS.FetchData64",
+	.deliver	= afs_deliver_fs_fetch_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch data from a very large file
+ */
+static int afs_fs_fetch_data64(struct afs_server *server,
+			       struct key *key,
+			       struct afs_vnode *vnode,
+			       off_t offset, size_t length,
+			       struct page *buffer,
+			       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	ASSERTCMP(length, <, ULONG_MAX);
+
+	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = NULL; /* volsync */
+	call->reply3 = buffer;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->operation_ID = FSFETCHDATA64;
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHDATA64);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(upper_32_bits(offset));
+	bp[5] = htonl((u32) offset);
+	bp[6] = 0;
+	bp[7] = htonl((u32) length);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
 /*
  * fetch data from a file
  */
@@ -344,6 +491,10 @@ int afs_fs_fetch_data(struct afs_server *server,
 	struct afs_call *call;
 	__be32 *bp;
 
+	if (upper_32_bits(offset) || upper_32_bits(offset + length))
+		return afs_fs_fetch_data64(server, key, vnode, offset, length,
+					   buffer, wait_mode);
+
 	_enter("");
 
 	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
@@ -356,6 +507,7 @@ int afs_fs_fetch_data(struct afs_server *server,
 	call->reply3 = buffer;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
+	call->operation_ID = FSFETCHDATA;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -475,8 +627,8 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFid(&bp, call->reply2);
-	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
-	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
 	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
 
@@ -573,7 +725,7 @@ static int afs_deliver_fs_remove(struct afs_call *call,
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
 
 	_leave(" = 0 [done]");
@@ -656,8 +808,8 @@ static int afs_deliver_fs_link(struct afs_call *call,
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
-	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
 	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
 
 	_leave(" = 0 [done]");
@@ -745,8 +897,8 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFid(&bp, call->reply2);
-	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
-	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
 
 	_leave(" = 0 [done]");
@@ -851,9 +1003,10 @@ static int afs_deliver_fs_rename(struct afs_call *call,
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
 	if (new_dvnode != orig_dvnode)
-		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
+					  NULL);
 	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
 
 	_leave(" = 0 [done]");
@@ -935,3 +1088,663 @@ int afs_fs_rename(struct afs_server *server,
 
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
+
+/*
+ * deliver reply data to an FS.StoreData
+ */
+static int afs_deliver_fs_store_data(struct afs_call *call,
+				     struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter(",,%u", last);
+
+	afs_transfer_reply(call, skb);
+	if (!last) {
+		_leave(" = 0 [more]");
+		return 0;
+	}
+
+	if (call->reply_size != call->reply_max) {
+		_leave(" = -EBADMSG [%u != %u]",
+		       call->reply_size, call->reply_max);
+		return -EBADMSG;
+	}
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
+				  &call->store_version);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	afs_pages_written_back(vnode, call);
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.StoreData operation type
+ */
+static const struct afs_call_type afs_RXFSStoreData = {
+	.name		= "FS.StoreData",
+	.deliver	= afs_deliver_fs_store_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData64 = {
+	.name		= "FS.StoreData64",
+	.deliver	= afs_deliver_fs_store_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * store a set of pages to a very large file
+ */
+static int afs_fs_store_data64(struct afs_server *server,
+			       struct afs_writeback *wb,
+			       pgoff_t first, pgoff_t last,
+			       unsigned offset, unsigned to,
+			       loff_t size, loff_t pos, loff_t i_size,
+			       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_vnode *vnode = wb->vnode;
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData64,
+				   (4 + 6 + 3 * 2) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->wb = wb;
+	call->key = wb->key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->mapping = vnode->vfs_inode.i_mapping;
+	call->first = first;
+	call->last = last;
+	call->first_offset = offset;
+	call->last_to = to;
+	call->send_pages = true;
+	call->store_version = vnode->status.data_version + 1;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA64);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	*bp++ = 0; /* mask */
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = 0; /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	*bp++ = htonl(pos >> 32);
+	*bp++ = htonl((u32) pos);
+	*bp++ = htonl(size >> 32);
+	*bp++ = htonl((u32) size);
+	*bp++ = htonl(i_size >> 32);
+	*bp++ = htonl((u32) i_size);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * store a set of pages
+ */
+int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
+		      pgoff_t first, pgoff_t last,
+		      unsigned offset, unsigned to,
+		      const struct afs_wait_mode *wait_mode)
+{
+	struct afs_vnode *vnode = wb->vnode;
+	struct afs_call *call;
+	loff_t size, pos, i_size;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+
+	size = to - offset;
+	if (first != last)
+		size += (loff_t)(last - first) << PAGE_SHIFT;
+	pos = (loff_t)first << PAGE_SHIFT;
+	pos += offset;
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (pos + size > i_size)
+		i_size = size + pos;
+
+	_debug("size %llx, at %llx, i_size %llx",
+	       (unsigned long long) size, (unsigned long long) pos,
+	       (unsigned long long) i_size);
+
+	if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
+		return afs_fs_store_data64(server, wb, first, last, offset, to,
+					   size, pos, i_size, wait_mode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData,
+				   (4 + 6 + 3) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->wb = wb;
+	call->key = wb->key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->mapping = vnode->vfs_inode.i_mapping;
+	call->first = first;
+	call->last = last;
+	call->first_offset = offset;
+	call->last_to = to;
+	call->send_pages = true;
+	call->store_version = vnode->status.data_version + 1;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	*bp++ = 0; /* mask */
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = 0; /* unix mode */
+	*bp++ = 0; /* segment size */
+
+	*bp++ = htonl(pos);
+	*bp++ = htonl(size);
+	*bp++ = htonl(i_size);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.StoreStatus
+ */
+static int afs_deliver_fs_store_status(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	afs_dataversion_t *store_version;
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter(",,%u", last);
+
+	afs_transfer_reply(call, skb);
+	if (!last) {
+		_leave(" = 0 [more]");
+		return 0;
+	}
+
+	if (call->reply_size != call->reply_max) {
+		_leave(" = -EBADMSG [%u != %u]",
+		       call->reply_size, call->reply_max);
+		return -EBADMSG;
+	}
+
+	/* unmarshall the reply once we've received all of it */
+	store_version = NULL;
+	if (call->operation_ID == FSSTOREDATA)
+		store_version = &call->store_version;
+
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.StoreStatus operation type
+ */
+static const struct afs_call_type afs_RXFSStoreStatus = {
+	.name		= "FS.StoreStatus",
+	.deliver	= afs_deliver_fs_store_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData_as_Status = {
+	.name		= "FS.StoreData",
+	.deliver	= afs_deliver_fs_store_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
+	.name		= "FS.StoreData64",
+	.deliver	= afs_deliver_fs_store_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * set the attributes on a very large file, using FS.StoreData rather than
+ * FS.StoreStatus so as to alter the file size also
+ */
+static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
+				 struct afs_vnode *vnode, struct iattr *attr,
+				 const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	ASSERT(attr->ia_valid & ATTR_SIZE);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status,
+				   (4 + 6 + 3 * 2) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->store_version = vnode->status.data_version + 1;
+	call->operation_ID = FSSTOREDATA;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA64);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	xdr_encode_AFS_StoreStatus(&bp, attr);
+
+	*bp++ = 0;				/* position of start of write */
+	*bp++ = 0;
+	*bp++ = 0;				/* size of write */
+	*bp++ = 0;
+	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
+	*bp++ = htonl((u32) attr->ia_size);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
+ * so as to alter the file size also
+ */
+static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
+			       struct afs_vnode *vnode, struct iattr *attr,
+			       const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	ASSERT(attr->ia_valid & ATTR_SIZE);
+	if (attr->ia_size >> 32)
+		return afs_fs_setattr_size64(server, key, vnode, attr,
+					     wait_mode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
+				   (4 + 6 + 3) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->store_version = vnode->status.data_version + 1;
+	call->operation_ID = FSSTOREDATA;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTOREDATA);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	xdr_encode_AFS_StoreStatus(&bp, attr);
+
+	*bp++ = 0;				/* position of start of write */
+	*bp++ = 0;				/* size of write */
+	*bp++ = htonl(attr->ia_size);		/* new file length */
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * set the attributes on a file, using FS.StoreData if there's a change in file
+ * size, and FS.StoreStatus otherwise
+ */
+int afs_fs_setattr(struct afs_server *server, struct key *key,
+		   struct afs_vnode *vnode, struct iattr *attr,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		return afs_fs_setattr_size(server, key, vnode, attr,
+					   wait_mode);
+
+	_enter(",%x,{%x:%u},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+	call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
+				   (4 + 6) * 4,
+				   (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+	call->operation_ID = FSSTORESTATUS;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSSTORESTATUS);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	xdr_encode_AFS_StoreStatus(&bp, attr);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.GetVolumeStatus
+ */
+static int afs_deliver_fs_get_volume_status(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	const __be32 *bp;
+	char *p;
+	int ret;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the returned status record */
+	case 1:
+		_debug("extract status");
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       12 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		bp = call->buffer;
+		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name length */
+	case 2:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("volname length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name */
+	case 3:
+		_debug("extract volname");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("volname '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the volume name padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_volname_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 4:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_volname_padding:
+
+		/* extract the offline message length */
+	case 5:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("offline msg length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the offline message */
+	case 6:
+		_debug("extract offline");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("offline '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the offline message padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_offline_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 7:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_offline_padding:
+
+		/* extract the message of the day length */
+	case 8:
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->count = ntohl(call->tmp);
+		_debug("motd length: %u", call->count);
+		if (call->count >= AFSNAMEMAX)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the message of the day */
+	case 9:
+		_debug("extract motd");
+		if (call->count > 0) {
+			ret = afs_extract_data(call, skb, last, call->reply3,
+					       call->count);
+			switch (ret) {
+			case 0:		break;
+			case -EAGAIN:	return 0;
+			default:	return ret;
+			}
+		}
+
+		p = call->reply3;
+		p[call->count] = 0;
+		_debug("motd '%s'", p);
+
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the message of the day padding */
+		if ((call->count & 3) == 0) {
+			call->unmarshall++;
+			goto no_motd_padding;
+		}
+		call->count = 4 - (call->count & 3);
+
+	case 10:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
+
+		call->offset = 0;
+		call->unmarshall++;
+	no_motd_padding:
+
+	case 11:
+		_debug("trailer %d", skb->len);
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
+
+	if (!last)
+		return 0;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * destroy an FS.GetVolumeStatus call
+ */
+static void afs_get_volume_status_call_destructor(struct afs_call *call)
+{
+	kfree(call->reply3);
+	call->reply3 = NULL;
+	afs_flat_call_destructor(call);
+}
+
+/*
+ * FS.GetVolumeStatus operation type
+ */
+static const struct afs_call_type afs_RXFSGetVolumeStatus = {
+	.name		= "FS.GetVolumeStatus",
+	.deliver	= afs_deliver_fs_get_volume_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_get_volume_status_call_destructor,
+};
+
+/*
+ * fetch the status of a volume
+ */
+int afs_fs_get_volume_status(struct afs_server *server,
+			     struct key *key,
+			     struct afs_vnode *vnode,
+			     struct afs_volume_status *vs,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
+	void *tmpbuf;
+
+	_enter("");
+
+	tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
+	if (!tmpbuf)
+		return -ENOMEM;
+
+	call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
+	if (!call) {
+		kfree(tmpbuf);
+		return -ENOMEM;
+	}
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = vs;
+	call->reply3 = tmpbuf;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSGETVOLUMESTATUS);
+	bp[1] = htonl(vnode->fid.vid);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c184a4ee5995..d196840127c6 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 struct afs_iget_data {
@@ -125,7 +126,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 	struct inode *inode;
 	int ret;
 
-	_enter(",{%u,%u,%u},,", fid->vid, fid->vnode, fid->unique);
+	_enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
 
 	as = sb->s_fs_info;
 	data.volume = as->volume;
@@ -204,6 +205,23 @@ bad_inode:
 }
 
 /*
+ * mark the data attached to an inode as obsolete due to a write on the server
+ * - might also want to ditch all the outstanding writes and dirty pages
+ */
+void afs_zap_data(struct afs_vnode *vnode)
+{
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+	/* nuke all the non-dirty pages that aren't locked, mapped or being
+	 * written back in a regular file and completely discard the pages in a
+	 * directory or symlink */
+	if (S_ISREG(vnode->vfs_inode.i_mode))
+		invalidate_remote_inode(&vnode->vfs_inode);
+	else
+		invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
+}
+
+/*
  * validate a vnode/inode
  * - there are several things we need to check
  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -258,10 +276,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 
 	/* if the vnode's data version number changed then its contents are
 	 * different */
-	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
-		_debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
-		invalidate_remote_inode(&vnode->vfs_inode);
-	}
+	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+		afs_zap_data(vnode);
 
 	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
 	mutex_unlock(&vnode->validate_lock);
@@ -278,7 +294,7 @@ error_unlock:
 /*
  * read the attributes of an inode
  */
-int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		      struct kstat *stat)
 {
 	struct inode *inode;
@@ -301,7 +317,7 @@ void afs_clear_inode(struct inode *inode)
 
 	vnode = AFS_FS_I(inode);
 
-	_enter("{%x:%d.%d} v=%u x=%u t=%u }",
+	_enter("{%x:%u.%d} v=%u x=%u t=%u }",
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
@@ -323,6 +339,7 @@ void afs_clear_inode(struct inode *inode)
 		vnode->server = NULL;
 	}
 
+	ASSERT(list_empty(&vnode->writebacks));
 	ASSERT(!vnode->cb_promised);
 
 #ifdef AFS_CACHING_SUPPORT
@@ -339,3 +356,47 @@ void afs_clear_inode(struct inode *inode)
 
 	_leave("");
 }
+
+/*
+ * set the attributes of an inode
+ */
+int afs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%u},{n=%s},%x",
+	       vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+	       attr->ia_valid);
+
+	if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
+				ATTR_MTIME))) {
+		_leave(" = 0 [unsupported]");
+		return 0;
+	}
+
+	/* flush any dirty data outstanding on a regular file */
+	if (S_ISREG(vnode->vfs_inode.i_mode)) {
+		filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+		afs_writeback_all(vnode);
+	}
+
+	if (attr->ia_valid & ATTR_FILE) {
+		key = attr->ia_file->private_data;
+	} else {
+		key = afs_request_key(vnode->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+			goto error;
+		}
+	}
+
+	ret = afs_vnode_setattr(vnode, key, attr);
+	if (!(attr->ia_valid & ATTR_FILE))
+		key_put(key);
+
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6dd3197d1d8d..2c55dd94a1de 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -16,11 +16,15 @@
 #include <linux/skbuff.h>
 #include <linux/rxrpc.h>
 #include <linux/key.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
 #include "afs.h"
 #include "afs_vl.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
+struct pagevec;
 struct afs_call;
 
 typedef enum {
@@ -75,12 +79,15 @@ struct afs_call {
 	struct key		*key;		/* security for this call */
 	struct afs_server	*server;	/* server affected by incoming CM call */
 	void			*request;	/* request data (first part) */
-	void			*request2;	/* request data (second part) */
+	struct address_space	*mapping;	/* page set */
+	struct afs_writeback	*wb;		/* writeback being performed */
 	void			*buffer;	/* reply receive buffer */
 	void			*reply;		/* reply buffer (first part) */
 	void			*reply2;	/* reply buffer (second part) */
 	void			*reply3;	/* reply buffer (third part) */
 	void			*reply4;	/* reply buffer (fourth part) */
+	pgoff_t			first;		/* first page in mapping to deal with */
+	pgoff_t			last;		/* last page in mapping to deal with */
 	enum {					/* call state */
 		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
 		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
@@ -97,14 +104,18 @@ struct afs_call {
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
 	unsigned		reply_size;	/* current size of reply */
+	unsigned		first_offset;	/* offset into mapping[first] */
+	unsigned		last_to;	/* amount of mapping[last] */
 	unsigned short		offset;		/* offset into received data store */
 	unsigned char		unmarshall;	/* unmarshalling phase */
 	bool			incoming;	/* T if incoming call */
+	bool			send_pages;	/* T if data from mapping should be sent */
 	u16			service_id;	/* RxRPC service ID to call */
 	__be16			port;		/* target UDP port */
 	__be32			operation_ID;	/* operation ID for an incoming call */
 	u32			count;		/* count for use in unmarshalling */
 	__be32			tmp;		/* place to extract temporary data */
+	afs_dataversion_t	store_version;	/* updated version expected from store */
 };
 
 struct afs_call_type {
@@ -124,6 +135,32 @@ struct afs_call_type {
 };
 
 /*
+ * record of an outstanding writeback on a vnode
+ */
+struct afs_writeback {
+	struct list_head	link;		/* link in vnode->writebacks */
+	struct work_struct	writer;		/* work item to perform the writeback */
+	struct afs_vnode	*vnode;		/* vnode to which this write applies */
+	struct key		*key;		/* owner of this write */
+	wait_queue_head_t	waitq;		/* completion and ready wait queue */
+	pgoff_t			first;		/* first page in batch */
+	pgoff_t			point;		/* last page in current store op */
+	pgoff_t			last;		/* last page in batch (inclusive) */
+	unsigned		offset_first;	/* offset into first page of start of write */
+	unsigned		to_last;	/* offset into last page of end of write */
+	int			num_conflicts;	/* count of conflicting writes in list */
+	int			usage;
+	bool			conflicts;	/* T if has dependent conflicts */
+	enum {
+		AFS_WBACK_SYNCING,		/* synchronisation being performed */
+		AFS_WBACK_PENDING,		/* write pending */
+		AFS_WBACK_CONFLICTING,		/* conflicting writes posted */
+		AFS_WBACK_WRITING,		/* writing back */
+		AFS_WBACK_COMPLETE		/* the writeback record has been unlinked */
+	} state __attribute__((packed));
+};
+
+/*
  * AFS superblock private data
  * - there's one superblock per volume
  */
@@ -305,6 +342,7 @@ struct afs_vnode {
 	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
 	int			update_cnt;	/* number of outstanding ops that will update the
 						 * status */
+	spinlock_t		writeback_lock;	/* lock for writebacks */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
 #define AFS_VNODE_CB_BROKEN	0		/* set if vnode's callback was broken */
@@ -316,6 +354,8 @@ struct afs_vnode {
 
 	long			acl_order;	/* ACL check count (callback break count) */
 
+	struct list_head	writebacks;	/* alterations in pagecache that need writing */
+
 	/* outstanding callback notification on this file */
 	struct rb_node		server_rb;	/* link in server->fs_vnodes */
 	struct rb_node		cb_promise;	/* link in server->cb_promises */
@@ -349,7 +389,6 @@ struct afs_permits {
  * record of one of a system's set of network interfaces
  */
 struct afs_interface {
-	unsigned	index;		/* interface index */
 	struct in_addr	address;	/* IPv4 address bound to interface */
 	struct in_addr	netmask;	/* netmask applied to address */
 	unsigned	mtu;		/* MTU of interface */
@@ -367,7 +406,7 @@ struct afs_uuid {
 	u32		time_low;			/* low part of timestamp */
 	u16		time_mid;			/* mid part of timestamp */
 	u16		time_hi_and_version;		/* high part of timestamp and version  */
-#define AFS_UUID_TO_UNIX_TIME	0x01b21dd213814000
+#define AFS_UUID_TO_UNIX_TIME	0x01b21dd213814000ULL
 #define AFS_UUID_TIMEHI_MASK	0x0fff
 #define AFS_UUID_VERSION_TIME	0x1000	/* time-based UUID */
 #define AFS_UUID_VERSION_NAME	0x3000	/* name-based UUID */
@@ -392,7 +431,7 @@ extern void afs_give_up_callback(struct afs_vnode *);
 extern void afs_dispatch_give_up_callbacks(struct work_struct *);
 extern void afs_flush_callback_breaks(struct afs_server *);
 extern int __init afs_callback_update_init(void);
-extern void __exit afs_callback_update_kill(void);
+extern void afs_callback_update_kill(void);
 
 /*
  * cell.c
@@ -434,10 +473,6 @@ extern const struct file_operations afs_file_operations;
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 
-#ifdef AFS_CACHING_SUPPORT
-extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
-#endif
-
 /*
  * fsclient.c
  */
@@ -468,6 +503,16 @@ extern int afs_fs_rename(struct afs_server *, struct key *,
 			 struct afs_vnode *, const char *,
 			 struct afs_vnode *, const char *,
 			 const struct afs_wait_mode *);
+extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
+			     pgoff_t, pgoff_t, unsigned, unsigned,
+			     const struct afs_wait_mode *);
+extern int afs_fs_setattr(struct afs_server *, struct key *,
+			  struct afs_vnode *, struct iattr *,
+			  const struct afs_wait_mode *);
+extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
+				    struct afs_vnode *,
+				    struct afs_volume_status *,
+				    const struct afs_wait_mode *);
 
 /*
  * inode.c
@@ -475,10 +520,10 @@ extern int afs_fs_rename(struct afs_server *, struct key *,
 extern struct inode *afs_iget(struct super_block *, struct key *,
 			      struct afs_fid *, struct afs_file_status *,
 			      struct afs_callback *);
+extern void afs_zap_data(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
-extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
-			     struct kstat *);
-extern void afs_zap_permits(struct rcu_head *);
+extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int afs_setattr(struct dentry *, struct iattr *);
 extern void afs_clear_inode(struct inode *);
 
 /*
@@ -534,6 +579,7 @@ extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
  */
 extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
 extern int afs_permission(struct inode *, int, struct nameidata *);
 
@@ -564,7 +610,7 @@ extern void afs_fs_exit(void);
  * use-rtnetlink.c
  */
 extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
-extern int afs_get_MAC_address(u8 [6]);
+extern int afs_get_MAC_address(u8 *, size_t);
 
 /*
  * vlclient.c
@@ -591,7 +637,7 @@ extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
 						  struct key *,
 						  const char *, size_t);
 extern void afs_put_vlocation(struct afs_vlocation *);
-extern void __exit afs_vlocation_purge(void);
+extern void afs_vlocation_purge(void);
 
 /*
  * vnode.c
@@ -630,6 +676,11 @@ extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
 			     struct afs_file_status *, struct afs_server **);
 extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
 			    struct key *, const char *, const char *);
+extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
+				unsigned, unsigned);
+extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
+extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
+				       struct afs_volume_status *);
 
 /*
  * volume.c
@@ -646,6 +697,23 @@ extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
 extern int afs_volume_release_fileserver(struct afs_vnode *,
 					 struct afs_server *, int);
 
+/*
+ * write.c
+ */
+extern int afs_set_page_dirty(struct page *);
+extern void afs_put_writeback(struct afs_writeback *);
+extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_writepage(struct page *, struct writeback_control *);
+extern int afs_writepages(struct address_space *, struct writeback_control *);
+extern int afs_write_inode(struct inode *, int);
+extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
+extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
+			      unsigned long, loff_t);
+extern int afs_writeback_all(struct afs_vnode *);
+extern int afs_fsync(struct file *, struct dentry *, int);
+
+
 /*****************************************************************************/
 /*
  * debug tracing
@@ -727,6 +795,21 @@ do {									\
 	}								\
 } while(0)
 
+#define ASSERTRANGE(L, OP1, N, OP2, H)					\
+do {									\
+	if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) {		\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "AFS: Assertion failed\n");		\
+		printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n",	\
+		       (unsigned long)(L), (unsigned long)(N),		\
+		       (unsigned long)(H));				\
+		printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \
+		       (unsigned long)(L), (unsigned long)(N),		\
+		       (unsigned long)(H));				\
+		BUG();							\
+	}								\
+} while(0)
+
 #define ASSERTIF(C, X)						\
 do {								\
 	if (unlikely((C) && !(X))) {				\
@@ -759,6 +842,10 @@ do {						\
 do {						\
 } while(0)
 
+#define ASSERTRANGE(L, OP1, N, OP2, H)		\
+do {						\
+} while(0)
+
 #define ASSERTIF(C, X)				\
 do {						\
 } while(0)
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 40c2704e7557..cd21195bbb24 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -13,6 +13,7 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/completion.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 MODULE_DESCRIPTION("AFS Client File System");
@@ -54,7 +55,7 @@ static int __init afs_get_client_UUID(void)
 
 	/* read the MAC address of one of the external interfaces and construct
 	 * a UUID from it */
-	ret = afs_get_MAC_address(afs_uuid.node);
+	ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node));
 	if (ret < 0)
 		return ret;
 
@@ -149,6 +150,7 @@ error_cache:
 	afs_vlocation_purge();
 	afs_cell_purge();
 	afs_proc_cleanup();
+	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
 	return ret;
 }
@@ -176,6 +178,7 @@ static void __exit afs_exit(void)
 	cachefs_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
+	rcu_barrier();
 }
 
 module_exit(afs_exit);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index cdb9792d8161..d1a889c40742 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -22,6 +22,7 @@ int afs_abort_to_error(u32 abort_code)
 {
 	switch (abort_code) {
 	case 13:		return -EACCES;
+	case 27:		return -EFBIG;
 	case 30:		return -EROFS;
 	case VSALVAGE:		return -EIO;
 	case VNOVNODE:		return -ENOENT;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b905ae37f912..a3684dcc76e7 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -36,7 +36,7 @@ const struct inode_operations afs_mntpt_inode_operations = {
 	.lookup		= afs_mntpt_lookup,
 	.follow_link	= afs_mntpt_follow_link,
 	.readlink	= page_readlink,
-	.getattr	= afs_inode_getattr,
+	.getattr	= afs_getattr,
 };
 
 static LIST_HEAD(afs_vfsmounts);
@@ -58,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 	char *buf;
 	int ret;
 
-	_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
+	_enter("{%x:%u,%u}",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
 	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
@@ -68,13 +69,11 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 	}
 
 	ret = -EIO;
-	wait_on_page_locked(page);
-	buf = kmap(page);
-	if (!PageUptodate(page))
-		goto out_free;
 	if (PageError(page))
 		goto out_free;
 
+	buf = kmap(page);
+
 	/* examine the symlink's contents */
 	size = vnode->status.size;
 	_debug("symlink to %*.*s", (int) size, (int) size, buf);
@@ -91,8 +90,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 
 	ret = 0;
 
-out_free:
 	kunmap(page);
+out_free:
 	page_cache_release(page);
 out:
 	_leave(" = %d", ret);
@@ -171,8 +170,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	}
 
 	ret = -EIO;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page) || PageError(page))
+	if (PageError(page))
 		goto error;
 
 	buf = kmap(page);
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
new file mode 100644
index 000000000000..fc27d4b52e5f
--- /dev/null
+++ b/fs/afs/netdevices.c
@@ -0,0 +1,68 @@
+/* AFS network device helpers
+ *
+ * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/string.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include "internal.h"
+
+/*
+ * get a MAC address from a random ethernet interface that has a real one
+ * - the buffer will normally be 6 bytes in size
+ */
+int afs_get_MAC_address(u8 *mac, size_t maclen)
+{
+	struct net_device *dev;
+	int ret = -ENODEV;
+
+	if (maclen != ETH_ALEN)
+		BUG();
+
+	rtnl_lock();
+	dev = __dev_getfirstbyhwtype(ARPHRD_ETHER);
+	if (dev) {
+		memcpy(mac, dev->dev_addr, maclen);
+		ret = 0;
+	}
+	rtnl_unlock();
+	return ret;
+}
+
+/*
+ * get a list of this system's interface IPv4 addresses, netmasks and MTUs
+ * - maxbufs must be at least 1
+ * - returns the number of interface records in the buffer
+ */
+int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
+			    bool wantloopback)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+	int n = 0;
+
+	ASSERT(maxbufs > 0);
+
+	rtnl_lock();
+	for_each_netdev(dev) {
+		if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
+			continue;
+		idev = __in_dev_get_rtnl(dev);
+		if (!idev)
+			continue;
+		for_primary_ifa(idev) {
+			bufs[n].address.s_addr = ifa->ifa_address;
+			bufs[n].netmask.s_addr = ifa->ifa_mask;
+			bufs[n].mtu = dev->mtu;
+			n++;
+			if (n >= maxbufs)
+				goto out;
+		} endfor_ifa(idev);
+	}
+out:
+	rtnl_unlock();
+	return n;
+}
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index d5601f617cdb..13df512aea9e 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index e7b047328a39..1b36f45076ad 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -237,6 +237,70 @@ void afs_flat_call_destructor(struct afs_call *call)
 }
 
 /*
+ * attach the data from a bunch of pages on an inode to a call
+ */
+int afs_send_pages(struct afs_call *call, struct msghdr *msg, struct kvec *iov)
+{
+	struct page *pages[8];
+	unsigned count, n, loop, offset, to;
+	pgoff_t first = call->first, last = call->last;
+	int ret;
+
+	_enter("");
+
+	offset = call->first_offset;
+	call->first_offset = 0;
+
+	do {
+		_debug("attach %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > ARRAY_SIZE(pages))
+			count = ARRAY_SIZE(pages);
+		n = find_get_pages_contig(call->mapping, first, count, pages);
+		ASSERTCMP(n, ==, count);
+
+		loop = 0;
+		do {
+			msg->msg_flags = 0;
+			to = PAGE_SIZE;
+			if (first + loop >= last)
+				to = call->last_to;
+			else
+				msg->msg_flags = MSG_MORE;
+			iov->iov_base = kmap(pages[loop]) + offset;
+			iov->iov_len = to - offset;
+			offset = 0;
+
+			_debug("- range %u-%u%s",
+			       offset, to, msg->msg_flags ? " [more]" : "");
+			msg->msg_iov = (struct iovec *) iov;
+			msg->msg_iovlen = 1;
+
+			/* have to change the state *before* sending the last
+			 * packet as RxRPC might give us the reply before it
+			 * returns from sending the request */
+			if (first + loop >= last)
+				call->state = AFS_CALL_AWAIT_REPLY;
+			ret = rxrpc_kernel_send_data(call->rxcall, msg,
+						     to - offset);
+			kunmap(pages[loop]);
+			if (ret < 0)
+				break;
+		} while (++loop < count);
+		first += count;
+
+		for (loop = 0; loop < count; loop++)
+			put_page(pages[loop]);
+		if (ret < 0)
+			break;
+	} while (first <= last);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
  * initiate a call
  */
 int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
@@ -253,8 +317,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	ASSERT(call->type != NULL);
 	ASSERT(call->type->name != NULL);
 
-	_debug("MAKE %p{%s} [%d]",
-	       call, call->type->name, atomic_read(&afs_outstanding_calls));
+	_debug("____MAKE %p{%s,%x} [%d]____",
+	       call, call->type->name, key_serial(call->key),
+	       atomic_read(&afs_outstanding_calls));
 
 	call->wait_mode = wait_mode;
 	INIT_WORK(&call->async_work, afs_process_async_call);
@@ -289,16 +354,23 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	msg.msg_iovlen		= 1;
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
-	msg.msg_flags		= 0;
+	msg.msg_flags		= (call->send_pages ? MSG_MORE : 0);
 
 	/* have to change the state *before* sending the last packet as RxRPC
 	 * might give us the reply before it returns from sending the
 	 * request */
-	call->state = AFS_CALL_AWAIT_REPLY;
+	if (!call->send_pages)
+		call->state = AFS_CALL_AWAIT_REPLY;
 	ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
 	if (ret < 0)
 		goto error_do_abort;
 
+	if (call->send_pages) {
+		ret = afs_send_pages(call, &msg, iov);
+		if (ret < 0)
+			goto error_do_abort;
+	}
+
 	/* at this point, an async call may no longer exist as it may have
 	 * already completed */
 	return wait_mode->wait(call);
@@ -772,7 +844,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
 
 	if (call->offset < count) {
 		if (last) {
-			_leave(" = -EBADMSG [%d < %lu]", call->offset, count);
+			_leave(" = -EBADMSG [%d < %zu]", call->offset, count);
 			return -EBADMSG;
 		}
 		_leave(" = -EAGAIN");
diff --git a/fs/afs/security.c b/fs/afs/security.c
index f9f424d80458..566fe712c682 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/ctype.h>
+#include <linux/sched.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
 
@@ -109,7 +110,7 @@ void afs_clear_permits(struct afs_vnode *vnode)
 {
 	struct afs_permits *permits;
 
-	_enter("{%x}", vnode->fid.vnode);
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
 
 	mutex_lock(&vnode->permits_lock);
 	permits = vnode->permits;
@@ -132,7 +133,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	struct afs_vnode *auth_vnode;
 	int count, loop;
 
-	_enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
+	_enter("{%x:%u},%x,%lx",
+	       vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order);
 
 	auth_vnode = afs_get_auth_inode(vnode, key);
 	if (IS_ERR(auth_vnode)) {
@@ -220,7 +222,8 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 	bool valid;
 	int loop, ret;
 
-	_enter("");
+	_enter("{%x:%u},%x",
+	       vnode->fid.vid, vnode->fid.vnode, key_serial(key));
 
 	auth_vnode = afs_get_auth_inode(vnode, key);
 	if (IS_ERR(auth_vnode)) {
@@ -268,9 +271,9 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 			_leave(" = %d", ret);
 			return ret;
 		}
+		*_access = vnode->status.caller_access;
 	}
 
-	*_access = vnode->status.caller_access;
 	iput(&auth_vnode->vfs_inode);
 	_leave(" = 0 [access %x]", *_access);
 	return 0;
@@ -288,7 +291,7 @@ int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
 	struct key *key;
 	int ret;
 
-	_enter("{{%x:%x},%lx},%x,",
+	_enter("{{%x:%u},%lx},%x,",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
 
 	key = afs_request_key(vnode->volume->cell);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 96bb23b476a2..231ae4150279 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -252,6 +252,9 @@ static void afs_destroy_server(struct afs_server *server)
 {
 	_enter("%p", server);
 
+	ASSERTIF(server->cb_break_head != server->cb_break_tail,
+		 delayed_work_pending(&server->cb_break_work));
+
 	ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
 	ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
 	ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index cebd03c91f57..2e8496ba1205 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -20,35 +20,35 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
 static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
 			    unsigned long flags);
-
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags, const char *dev_name,
 		      void *data, struct vfsmount *mnt);
-
 static struct inode *afs_alloc_inode(struct super_block *sb);
-
 static void afs_put_super(struct super_block *sb);
-
 static void afs_destroy_inode(struct inode *inode);
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
 	.get_sb		= afs_get_sb,
 	.kill_sb	= kill_anon_super,
-	.fs_flags	= FS_BINARY_MOUNTDATA,
+	.fs_flags	= 0,
 };
 
 static const struct super_operations afs_super_ops = {
-	.statfs		= simple_statfs,
+	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
-	.drop_inode	= generic_delete_inode,
+	.write_inode	= afs_write_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.clear_inode	= afs_clear_inode,
 	.umount_begin	= afs_umount_begin,
@@ -58,6 +58,20 @@ static const struct super_operations afs_super_ops = {
 static struct kmem_cache *afs_inode_cachep;
 static atomic_t afs_count_active_inodes;
 
+enum {
+	afs_no_opt,
+	afs_opt_cell,
+	afs_opt_rwpath,
+	afs_opt_vol,
+};
+
+static match_table_t afs_options_list = {
+	{ afs_opt_cell,		"cell=%s"	},
+	{ afs_opt_rwpath,	"rwpath"	},
+	{ afs_opt_vol,		"vol=%s"	},
+	{ afs_no_opt,		NULL		},
+};
+
 /*
  * initialise the filesystem
  */
@@ -115,31 +129,6 @@ void __exit afs_fs_exit(void)
 }
 
 /*
- * check that an argument has a value
- */
-static int want_arg(char **_value, const char *option)
-{
-	if (!_value || !*_value || !**_value) {
-		printk(KERN_NOTICE "kAFS: %s: argument missing\n", option);
-		return 0;
-	}
-	return 1;
-}
-
-/*
- * check that there's no subsequent value
- */
-static int want_no_value(char *const *_value, const char *option)
-{
-	if (*_value && **_value) {
-		printk(KERN_NOTICE "kAFS: %s: Invalid argument: %s\n",
-		       option, *_value);
-		return 0;
-	}
-	return 1;
-}
-
-/*
  * parse the mount options
  * - this function has been shamelessly adapted from the ext3 fs which
  *   shamelessly adapted it from the msdos fs
@@ -148,48 +137,46 @@ static int afs_parse_options(struct afs_mount_params *params,
 			     char *options, const char **devname)
 {
 	struct afs_cell *cell;
-	char *key, *value;
-	int ret;
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+	int token;
 
 	_enter("%s", options);
 
 	options[PAGE_SIZE - 1] = 0;
 
-	ret = 0;
-	while ((key = strsep(&options, ","))) {
-		value = strchr(key, '=');
-		if (value)
-			*value++ = 0;
-
-		_debug("kAFS: KEY: %s, VAL:%s", key, value ?: "-");
+	while ((p = strsep(&options, ","))) {
+		if (!*p)
+			continue;
 
-		if (strcmp(key, "rwpath") == 0) {
-			if (!want_no_value(&value, "rwpath"))
-				return -EINVAL;
-			params->rwpath = 1;
-		} else if (strcmp(key, "vol") == 0) {
-			if (!want_arg(&value, "vol"))
-				return -EINVAL;
-			*devname = value;
-		} else if (strcmp(key, "cell") == 0) {
-			if (!want_arg(&value, "cell"))
-				return -EINVAL;
-			cell = afs_cell_lookup(value, strlen(value));
+		token = match_token(p, afs_options_list, args);
+		switch (token) {
+		case afs_opt_cell:
+			cell = afs_cell_lookup(args[0].from,
+					       args[0].to - args[0].from);
 			if (IS_ERR(cell))
 				return PTR_ERR(cell);
 			afs_put_cell(params->cell);
 			params->cell = cell;
-		} else {
-			printk("kAFS: Unknown mount option: '%s'\n",  key);
-			ret = -EINVAL;
-			goto error;
+			break;
+
+		case afs_opt_rwpath:
+			params->rwpath = 1;
+			break;
+
+		case afs_opt_vol:
+			*devname = args[0].from;
+			break;
+
+		default:
+			printk(KERN_ERR "kAFS:"
+			       " Unknown or invalid mount option: '%s'\n", p);
+			return -EINVAL;
 		}
 	}
 
-	ret = 0;
-error:
-	_leave(" = %d", ret);
-	return ret;
+	_leave(" = 0");
+	return 0;
 }
 
 /*
@@ -361,7 +348,6 @@ error:
 
 /*
  * get an AFS superblock
- * - TODO: don't use get_sb_nodev(), but rather call sget() directly
  */
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags,
@@ -386,7 +372,6 @@ static int afs_get_sb(struct file_system_type *fs_type,
 			goto error;
 	}
 
-
 	ret = afs_parse_device_name(&params, dev_name);
 	if (ret < 0)
 		goto error;
@@ -467,16 +452,15 @@ static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 {
 	struct afs_vnode *vnode = _vnode;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		memset(vnode, 0, sizeof(*vnode));
-		inode_init_once(&vnode->vfs_inode);
-		init_waitqueue_head(&vnode->update_waitq);
-		mutex_init(&vnode->permits_lock);
-		mutex_init(&vnode->validate_lock);
-		spin_lock_init(&vnode->lock);
-		INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
-	}
+	memset(vnode, 0, sizeof(*vnode));
+	inode_init_once(&vnode->vfs_inode);
+	init_waitqueue_head(&vnode->update_waitq);
+	mutex_init(&vnode->permits_lock);
+	mutex_init(&vnode->validate_lock);
+	spin_lock_init(&vnode->writeback_lock);
+	spin_lock_init(&vnode->lock);
+	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
 }
 
 /*
@@ -500,6 +484,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	vnode->flags		= 1 << AFS_VNODE_UNSET;
 	vnode->cb_promised	= false;
 
+	_leave(" = %p", &vnode->vfs_inode);
 	return &vnode->vfs_inode;
 }
 
@@ -510,7 +495,7 @@ static void afs_destroy_inode(struct inode *inode)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 
-	_enter("{%lu}", inode->i_ino);
+	_enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode);
 
 	_debug("DESTROY INODE %p", inode);
 
@@ -519,3 +504,36 @@ static void afs_destroy_inode(struct inode *inode)
 	kmem_cache_free(afs_inode_cachep, vnode);
 	atomic_dec(&afs_count_active_inodes);
 }
+
+/*
+ * return information about an AFS volume
+ */
+static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct afs_volume_status vs;
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	struct key *key;
+	int ret;
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	ret = afs_vnode_get_volume_status(vnode, key, &vs);
+	key_put(key);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
+
+	buf->f_type	= dentry->d_sb->s_magic;
+	buf->f_bsize	= AFS_BLOCK_SIZE;
+	buf->f_namelen	= AFSNAMEMAX - 1;
+
+	if (vs.max_quota == 0)
+		buf->f_blocks = vs.part_max_blocks;
+	else
+		buf->f_blocks = vs.max_quota;
+	buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
+	return 0;
+}
diff --git a/fs/afs/use-rtnetlink.c b/fs/afs/use-rtnetlink.c
deleted file mode 100644
index 82f0daa28970..000000000000
--- a/fs/afs/use-rtnetlink.c
+++ /dev/null
@@ -1,473 +0,0 @@
-/* RTNETLINK client
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/if_addr.h>
-#include <linux/if_arp.h>
-#include <linux/inetdevice.h>
-#include <net/netlink.h>
-#include "internal.h"
-
-struct afs_rtm_desc {
-	struct socket		*nlsock;
-	struct afs_interface	*bufs;
-	u8			*mac;
-	size_t			nbufs;
-	size_t			maxbufs;
-	void			*data;
-	ssize_t			datalen;
-	size_t			datamax;
-	int			msg_seq;
-	unsigned		mac_index;
-	bool			wantloopback;
-	int (*parse)(struct afs_rtm_desc *, struct nlmsghdr *);
-};
-
-/*
- * parse an RTM_GETADDR response
- */
-static int afs_rtm_getaddr_parse(struct afs_rtm_desc *desc,
-				 struct nlmsghdr *nlhdr)
-{
-	struct afs_interface *this;
-	struct ifaddrmsg *ifa;
-	struct rtattr *rtattr;
-	const char *name;
-	size_t len;
-
-	ifa = (struct ifaddrmsg *) NLMSG_DATA(nlhdr);
-
-	_enter("{ix=%d,af=%d}", ifa->ifa_index, ifa->ifa_family);
-
-	if (ifa->ifa_family != AF_INET) {
-		_leave(" = 0 [family %d]", ifa->ifa_family);
-		return 0;
-	}
-	if (desc->nbufs >= desc->maxbufs) {
-		_leave(" = 0 [max %zu/%zu]", desc->nbufs, desc->maxbufs);
-		return 0;
-	}
-
-	this = &desc->bufs[desc->nbufs];
-
-	this->index = ifa->ifa_index;
-	this->netmask.s_addr = inet_make_mask(ifa->ifa_prefixlen);
-	this->mtu = 0;
-
-	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifaddrmsg));
-	len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifaddrmsg));
-
-	name = "unknown";
-	for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
-		switch (rtattr->rta_type) {
-		case IFA_ADDRESS:
-			memcpy(&this->address, RTA_DATA(rtattr), 4);
-			break;
-		case IFA_LABEL:
-			name = RTA_DATA(rtattr);
-			break;
-		}
-	}
-
-	_debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT,
-	       name, NIPQUAD(this->address), NIPQUAD(this->netmask));
-
-	desc->nbufs++;
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * parse an RTM_GETLINK response for MTUs
- */
-static int afs_rtm_getlink_if_parse(struct afs_rtm_desc *desc,
-				    struct nlmsghdr *nlhdr)
-{
-	struct afs_interface *this;
-	struct ifinfomsg *ifi;
-	struct rtattr *rtattr;
-	const char *name;
-	size_t len, loop;
-
-	ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
-
-	_enter("{ix=%d}", ifi->ifi_index);
-
-	for (loop = 0; loop < desc->nbufs; loop++) {
-		this = &desc->bufs[loop];
-		if (this->index == ifi->ifi_index)
-			goto found;
-	}
-
-	_leave(" = 0 [no match]");
-	return 0;
-
-found:
-	if (ifi->ifi_type == ARPHRD_LOOPBACK && !desc->wantloopback) {
-		_leave(" = 0 [loopback]");
-		return 0;
-	}
-
-	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
-	len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
-
-	name = "unknown";
-	for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
-		switch (rtattr->rta_type) {
-		case IFLA_MTU:
-			memcpy(&this->mtu, RTA_DATA(rtattr), 4);
-			break;
-		case IFLA_IFNAME:
-			name = RTA_DATA(rtattr);
-			break;
-		}
-	}
-
-	_debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
-	       name, NIPQUAD(this->address), NIPQUAD(this->netmask),
-	       this->mtu);
-
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * parse an RTM_GETLINK response for the MAC address belonging to the lowest
- * non-internal interface
- */
-static int afs_rtm_getlink_mac_parse(struct afs_rtm_desc *desc,
-				     struct nlmsghdr *nlhdr)
-{
-	struct ifinfomsg *ifi;
-	struct rtattr *rtattr;
-	const char *name;
-	size_t remain, len;
-	bool set;
-
-	ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
-
-	_enter("{ix=%d}", ifi->ifi_index);
-
-	if (ifi->ifi_index >= desc->mac_index) {
-		_leave(" = 0 [high]");
-		return 0;
-	}
-	if (ifi->ifi_type == ARPHRD_LOOPBACK) {
-		_leave(" = 0 [loopback]");
-		return 0;
-	}
-
-	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
-	remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
-
-	name = "unknown";
-	set = false;
-	for (; RTA_OK(rtattr, remain); rtattr = RTA_NEXT(rtattr, remain)) {
-		switch (rtattr->rta_type) {
-		case IFLA_ADDRESS:
-			len = RTA_PAYLOAD(rtattr);
-			memcpy(desc->mac, RTA_DATA(rtattr),
-			       min_t(size_t, len, 6));
-			desc->mac_index = ifi->ifi_index;
-			set = true;
-			break;
-		case IFLA_IFNAME:
-			name = RTA_DATA(rtattr);
-			break;
-		}
-	}
-
-	if (set)
-		_debug("%s: %02x:%02x:%02x:%02x:%02x:%02x",
-		       name,
-		       desc->mac[0], desc->mac[1], desc->mac[2],
-		       desc->mac[3], desc->mac[4], desc->mac[5]);
-
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * read the rtnetlink response and pass to parsing routine
- */
-static int afs_read_rtm(struct afs_rtm_desc *desc)
-{
-	struct nlmsghdr *nlhdr, tmphdr;
-	struct msghdr msg;
-	struct kvec iov[1];
-	void *data;
-	bool last = false;
-	int len, ret, remain;
-
-	_enter("");
-
-	do {
-		/* first of all peek to see how big the packet is */
-		memset(&msg, 0, sizeof(msg));
-		iov[0].iov_base = &tmphdr;
-		iov[0].iov_len = sizeof(tmphdr);
-		len = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
-				     sizeof(tmphdr), MSG_PEEK | MSG_TRUNC);
-		if (len < 0) {
-			_leave(" = %d [peek]", len);
-			return len;
-		}
-		if (len == 0)
-			continue;
-		if (len < sizeof(tmphdr) || len < NLMSG_PAYLOAD(&tmphdr, 0)) {
-			_leave(" = -EMSGSIZE");
-			return -EMSGSIZE;
-		}
-
-		if (desc->datamax < len) {
-			kfree(desc->data);
-			desc->data = NULL;
-			data = kmalloc(len, GFP_KERNEL);
-			if (!data)
-				return -ENOMEM;
-			desc->data = data;
-		}
-		desc->datamax = len;
-
-		/* read all the data from this packet */
-		iov[0].iov_base = desc->data;
-		iov[0].iov_len = desc->datamax;
-		desc->datalen = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
-					       desc->datamax, 0);
-		if (desc->datalen < 0) {
-			_leave(" = %ld [recv]", desc->datalen);
-			return desc->datalen;
-		}
-
-		nlhdr = desc->data;
-
-		/* check if the header is valid */
-		if (!NLMSG_OK(nlhdr, desc->datalen) ||
-		    nlhdr->nlmsg_type == NLMSG_ERROR) {
-			_leave(" = -EIO");
-			return -EIO;
-		}
-
-		/* see if this is the last message */
-		if (nlhdr->nlmsg_type == NLMSG_DONE ||
-		    !(nlhdr->nlmsg_flags & NLM_F_MULTI))
-			last = true;
-
-		/* parse the bits we got this time */
-		nlmsg_for_each_msg(nlhdr, desc->data, desc->datalen, remain) {
-			ret = desc->parse(desc, nlhdr);
-			if (ret < 0) {
-				_leave(" = %d [parse]", ret);
-				return ret;
-			}
-		}
-
-	} while (!last);
-
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * list the interface bound addresses to get the address and netmask
- */
-static int afs_rtm_getaddr(struct afs_rtm_desc *desc)
-{
-	struct msghdr msg;
-	struct kvec iov[1];
-	int ret;
-
-	struct {
-		struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-		struct ifaddrmsg addr_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-	} request;
-
-	_enter("");
-
-	memset(&request, 0, sizeof(request));
-
-	request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
-	request.nl_msg.nlmsg_type = RTM_GETADDR;
-	request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
-	request.nl_msg.nlmsg_seq = desc->msg_seq++;
-	request.nl_msg.nlmsg_pid = 0;
-
-	memset(&msg, 0, sizeof(msg));
-	iov[0].iov_base = &request;
-	iov[0].iov_len = sizeof(request);
-
-	ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * list the interface link statuses to get the MTUs
- */
-static int afs_rtm_getlink(struct afs_rtm_desc *desc)
-{
-	struct msghdr msg;
-	struct kvec iov[1];
-	int ret;
-
-	struct {
-		struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-		struct ifinfomsg link_msg __attribute__((aligned(NLMSG_ALIGNTO)));
-	} request;
-
-	_enter("");
-
-	memset(&request, 0, sizeof(request));
-
-	request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	request.nl_msg.nlmsg_type = RTM_GETLINK;
-	request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
-	request.nl_msg.nlmsg_seq = desc->msg_seq++;
-	request.nl_msg.nlmsg_pid = 0;
-
-	memset(&msg, 0, sizeof(msg));
-	iov[0].iov_base = &request;
-	iov[0].iov_len = sizeof(request);
-
-	ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * cull any interface records for which there isn't an MTU value
- */
-static void afs_cull_interfaces(struct afs_rtm_desc *desc)
-{
-	struct afs_interface *bufs = desc->bufs;
-	size_t nbufs = desc->nbufs;
-	int loop, point = 0;
-
-	_enter("{%zu}", nbufs);
-
-	for (loop = 0; loop < nbufs; loop++) {
-		if (desc->bufs[loop].mtu != 0) {
-			if (loop != point) {
-				ASSERTCMP(loop, >, point);
-				bufs[point] = bufs[loop];
-			}
-			point++;
-		}
-	}
-
-	desc->nbufs = point;
-	_leave(" [%zu/%zu]", desc->nbufs, nbufs);
-}
-
-/*
- * get a list of this system's interface IPv4 addresses, netmasks and MTUs
- * - returns the number of interface records in the buffer
- */
-int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
-			    bool wantloopback)
-{
-	struct afs_rtm_desc desc;
-	int ret, loop;
-
-	_enter("");
-
-	memset(&desc, 0, sizeof(desc));
-	desc.bufs = bufs;
-	desc.maxbufs = maxbufs;
-	desc.wantloopback = wantloopback;
-
-	ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
-			       &desc.nlsock);
-	if (ret < 0) {
-		_leave(" = %d [sock]", ret);
-		return ret;
-	}
-
-	/* issue RTM_GETADDR */
-	desc.parse = afs_rtm_getaddr_parse;
-	ret = afs_rtm_getaddr(&desc);
-	if (ret < 0)
-		goto error;
-	ret = afs_read_rtm(&desc);
-	if (ret < 0)
-		goto error;
-
-	/* issue RTM_GETLINK */
-	desc.parse = afs_rtm_getlink_if_parse;
-	ret = afs_rtm_getlink(&desc);
-	if (ret < 0)
-		goto error;
-	ret = afs_read_rtm(&desc);
-	if (ret < 0)
-		goto error;
-
-	afs_cull_interfaces(&desc);
-	ret = desc.nbufs;
-
-	for (loop = 0; loop < ret; loop++)
-		_debug("[%d] "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
-		       bufs[loop].index,
-		       NIPQUAD(bufs[loop].address),
-		       NIPQUAD(bufs[loop].netmask),
-		       bufs[loop].mtu);
-
-error:
-	kfree(desc.data);
-	sock_release(desc.nlsock);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * get a MAC address from a random ethernet interface that has a real one
- * - the buffer should be 6 bytes in size
- */
-int afs_get_MAC_address(u8 mac[6])
-{
-	struct afs_rtm_desc desc;
-	int ret;
-
-	_enter("");
-
-	memset(&desc, 0, sizeof(desc));
-	desc.mac = mac;
-	desc.mac_index = UINT_MAX;
-
-	ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
-			       &desc.nlsock);
-	if (ret < 0) {
-		_leave(" = %d [sock]", ret);
-		return ret;
-	}
-
-	/* issue RTM_GETLINK */
-	desc.parse = afs_rtm_getlink_mac_parse;
-	ret = afs_rtm_getlink(&desc);
-	if (ret < 0)
-		goto error;
-	ret = afs_read_rtm(&desc);
-	if (ret < 0)
-		goto error;
-
-	if (desc.mac_index < UINT_MAX) {
-		/* got a MAC address */
-		_debug("[%d] %02x:%02x:%02x:%02x:%02x:%02x",
-		       desc.mac_index,
-		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
-	} else {
-		ret = -ENONET;
-	}
-
-error:
-	sock_release(desc.nlsock);
-	_leave(" = %d", ret);
-	return ret;
-}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 74cce174882a..09e3ad0fc7cc 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 unsigned afs_vlocation_timeout = 10;	/* volume location timeout in seconds */
@@ -416,8 +417,8 @@ fill_in_record:
 		goto error_abandon;
 	spin_lock(&vl->lock);
 	vl->state = AFS_VL_VALID;
-	wake_up(&vl->waitq);
 	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
 
 	/* schedule for regular updates */
 	afs_vlocation_queue_for_updates(vl);
@@ -442,7 +443,7 @@ found_in_memory:
 
 		_debug("invalid [state %d]", state);
 
-		if ((state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME)) {
+		if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
 			vl->state = AFS_VL_CREATING;
 			spin_unlock(&vl->lock);
 			goto fill_in_record;
@@ -453,11 +454,10 @@ found_in_memory:
 		_debug("wait");
 
 		spin_unlock(&vl->lock);
-		ret = wait_event_interruptible(
-			vl->waitq,
-			vl->state == AFS_VL_NEW ||
-			vl->state == AFS_VL_VALID ||
-			vl->state == AFS_VL_NO_VOLUME);
+		ret = wait_event_interruptible(vl->waitq,
+					       vl->state == AFS_VL_NEW ||
+					       vl->state == AFS_VL_VALID ||
+					       vl->state == AFS_VL_NO_VOLUME);
 		if (ret < 0)
 			goto error;
 		spin_lock(&vl->lock);
@@ -471,8 +471,8 @@ success:
 error_abandon:
 	spin_lock(&vl->lock);
 	vl->state = AFS_VL_NEW;
-	wake_up(&vl->waitq);
 	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
 error:
 	ASSERT(vl != NULL);
 	afs_put_vlocation(vl);
@@ -603,7 +603,7 @@ int __init afs_vlocation_update_init(void)
 /*
  * discard all the volume location records for rmmod
  */
-void __exit afs_vlocation_purge(void)
+void afs_vlocation_purge(void)
 {
 	afs_vlocation_timeout = 0;
 
@@ -675,7 +675,6 @@ static void afs_vlocation_updater(struct work_struct *work)
 	case 0:
 		afs_vlocation_apply_update(vl, &vldb);
 		vl->state = AFS_VL_VALID;
-		wake_up(&vl->waitq);
 		break;
 	case -ENOMEDIUM:
 		vl->state = AFS_VL_VOLUME_DELETED;
@@ -685,6 +684,7 @@ static void afs_vlocation_updater(struct work_struct *work)
 		break;
 	}
 	spin_unlock(&vl->lock);
+	wake_up(&vl->waitq);
 
 	/* and then reschedule */
 	_debug("reschedule");
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index a1904ab8426a..232c55dc245d 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 #if 0
@@ -175,24 +176,33 @@ static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
 {
 	struct afs_server *server;
 
+	_enter("{%p}", vnode->server);
+
 	set_bit(AFS_VNODE_DELETED, &vnode->flags);
 
 	server = vnode->server;
-	if (vnode->cb_promised) {
-		spin_lock(&server->cb_lock);
+	if (server) {
 		if (vnode->cb_promised) {
-			rb_erase(&vnode->cb_promise, &server->cb_promises);
-			vnode->cb_promised = false;
+			spin_lock(&server->cb_lock);
+			if (vnode->cb_promised) {
+				rb_erase(&vnode->cb_promise,
+					 &server->cb_promises);
+				vnode->cb_promised = false;
+			}
+			spin_unlock(&server->cb_lock);
 		}
-		spin_unlock(&server->cb_lock);
-	}
 
-	spin_lock(&vnode->server->fs_lock);
-	rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
-	spin_unlock(&vnode->server->fs_lock);
+		spin_lock(&server->fs_lock);
+		rb_erase(&vnode->server_rb, &server->fs_vnodes);
+		spin_unlock(&server->fs_lock);
 
-	vnode->server = NULL;
-	afs_put_server(server);
+		vnode->server = NULL;
+		afs_put_server(server);
+	} else {
+		ASSERT(!vnode->cb_promised);
+	}
+
+	_leave("");
 }
 
 /*
@@ -225,7 +235,7 @@ void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
  */
 static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
 {
-	_enter("%p,%d", vnode, ret);
+	_enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret);
 
 	spin_lock(&vnode->lock);
 
@@ -261,7 +271,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,
 
 	DECLARE_WAITQUEUE(myself, current);
 
-	_enter("%s,{%u,%u,%u}",
+	_enter("%s,{%x:%u.%u}",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
@@ -389,7 +399,7 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s{%u,%u,%u},%x,,,",
+	_enter("%s{%x:%u.%u},%x,,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
@@ -446,7 +456,7 @@ int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s{%u,%u,%u},%x,%s,,",
+	_enter("%s{%x:%u.%u},%x,%s,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
@@ -502,7 +512,7 @@ int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s{%u,%u,%u},%x,%s",
+	_enter("%s{%x:%u.%u},%x,%s",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
@@ -557,7 +567,7 @@ extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+	_enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s",
 	       dvnode->volume->vlocation->vldb.name,
 	       dvnode->fid.vid,
 	       dvnode->fid.vnode,
@@ -628,7 +638,7 @@ int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s{%u,%u,%u},%x,%s,%s,,,",
+	_enter("%s{%x:%u.%u},%x,%s,%s,,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
@@ -687,7 +697,7 @@ int afs_vnode_rename(struct afs_vnode *orig_dvnode,
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+	_enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s",
 	       orig_dvnode->volume->vlocation->vldb.name,
 	       orig_dvnode->fid.vid,
 	       orig_dvnode->fid.vnode,
@@ -753,3 +763,162 @@ no_server:
 	_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
 	return PTR_ERR(server);
 }
+
+/*
+ * write to a file
+ */
+int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
+			 unsigned offset, unsigned to)
+{
+	struct afs_server *server;
+	struct afs_vnode *vnode = wb->vnode;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(wb->key),
+	       first, last, offset, to);
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_store_data(server, wb, first, last, offset, to,
+					&afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
+
+/*
+ * set the attributes on a file
+ */
+int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
+		      struct iattr *attr)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
+
+/*
+ * get the status of a volume
+ */
+int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
+				struct afs_volume_status *vs)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	/* this op will fetch the status */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index dd160cada45d..8bab0e3437f9 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/sched.h>
 #include "internal.h"
 
 static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
diff --git a/fs/afs/write.c b/fs/afs/write.c
new file mode 100644
index 000000000000..a03b92a0fe1d
--- /dev/null
+++ b/fs/afs/write.c
@@ -0,0 +1,827 @@
+/* handling of writes to regular files and writing back to the server
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+static int afs_write_back_from_locked_page(struct afs_writeback *wb,
+					   struct page *page);
+
+/*
+ * mark a page as having been made dirty and thus needing writeback
+ */
+int afs_set_page_dirty(struct page *page)
+{
+	_enter("");
+	return __set_page_dirty_nobuffers(page);
+}
+
+/*
+ * unlink a writeback record because its usage has reached zero
+ * - must be called with the wb->vnode->writeback_lock held
+ */
+static void afs_unlink_writeback(struct afs_writeback *wb)
+{
+	struct afs_writeback *front;
+	struct afs_vnode *vnode = wb->vnode;
+
+	list_del_init(&wb->link);
+	if (!list_empty(&vnode->writebacks)) {
+		/* if an fsync rises to the front of the queue then wake it
+		 * up */
+		front = list_entry(vnode->writebacks.next,
+				   struct afs_writeback, link);
+		if (front->state == AFS_WBACK_SYNCING) {
+			_debug("wake up sync");
+			front->state = AFS_WBACK_COMPLETE;
+			wake_up(&front->waitq);
+		}
+	}
+}
+
+/*
+ * free a writeback record
+ */
+static void afs_free_writeback(struct afs_writeback *wb)
+{
+	_enter("");
+	key_put(wb->key);
+	kfree(wb);
+}
+
+/*
+ * dispose of a reference to a writeback record
+ */
+void afs_put_writeback(struct afs_writeback *wb)
+{
+	struct afs_vnode *vnode = wb->vnode;
+
+	_enter("{%d}", wb->usage);
+
+	spin_lock(&vnode->writeback_lock);
+	if (--wb->usage == 0)
+		afs_unlink_writeback(wb);
+	else
+		wb = NULL;
+	spin_unlock(&vnode->writeback_lock);
+	if (wb)
+		afs_free_writeback(wb);
+}
+
+/*
+ * partly or wholly fill a page that's under preparation for writing
+ */
+static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
+			 unsigned start, unsigned len, struct page *page)
+{
+	int ret;
+
+	_enter(",,%u,%u", start, len);
+
+	ASSERTCMP(start + len, <=, PAGE_SIZE);
+
+	ret = afs_vnode_fetch_data(vnode, key, start, len, page);
+	if (ret < 0) {
+		if (ret == -ENOENT) {
+			_debug("got NOENT from server"
+			       " - marking file deleted and stale");
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			ret = -ESTALE;
+		}
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * prepare a page for being written to
+ */
+static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
+			    struct key *key, unsigned offset, unsigned to)
+{
+	unsigned eof, tail, start, stop, len;
+	loff_t i_size, pos;
+	void *p;
+	int ret;
+
+	_enter("");
+
+	if (offset == 0 && to == PAGE_SIZE)
+		return 0;
+
+	p = kmap_atomic(page, KM_USER0);
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	pos = (loff_t) page->index << PAGE_SHIFT;
+	if (pos >= i_size) {
+		/* partial write, page beyond EOF */
+		_debug("beyond");
+		if (offset > 0)
+			memset(p, 0, offset);
+		if (to < PAGE_SIZE)
+			memset(p + to, 0, PAGE_SIZE - to);
+		kunmap_atomic(p, KM_USER0);
+		return 0;
+	}
+
+	if (i_size - pos >= PAGE_SIZE) {
+		/* partial write, page entirely before EOF */
+		_debug("before");
+		tail = eof = PAGE_SIZE;
+	} else {
+		/* partial write, page overlaps EOF */
+		eof = i_size - pos;
+		_debug("overlap %u", eof);
+		tail = max(eof, to);
+		if (tail < PAGE_SIZE)
+			memset(p + tail, 0, PAGE_SIZE - tail);
+		if (offset > eof)
+			memset(p + eof, 0, PAGE_SIZE - eof);
+	}
+
+	kunmap_atomic(p, KM_USER0);
+
+	ret = 0;
+	if (offset > 0 || eof > to) {
+		/* need to fill one or two bits that aren't going to be written
+		 * (cover both fillers in one read if there are two) */
+		start = (offset > 0) ? 0 : to;
+		stop = (eof > to) ? eof : offset;
+		len = stop - start;
+		_debug("wr=%u-%u av=0-%u rd=%u@%u",
+		       offset, to, eof, start, len);
+		ret = afs_fill_page(vnode, key, start, len, page);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * prepare to perform part of a write to a page
+ * - the caller holds the page locked, preventing it from being written out or
+ *   modified by anyone else
+ */
+int afs_prepare_write(struct file *file, struct page *page,
+		      unsigned offset, unsigned to)
+{
+	struct afs_writeback *candidate, *wb;
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	struct key *key = file->private_data;
+	pgoff_t index;
+	int ret;
+
+	_enter("{%x:%u},{%lx},%u,%u",
+	       vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+
+	candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
+	if (!candidate)
+		return -ENOMEM;
+	candidate->vnode = vnode;
+	candidate->first = candidate->last = page->index;
+	candidate->offset_first = offset;
+	candidate->to_last = to;
+	candidate->usage = 1;
+	candidate->state = AFS_WBACK_PENDING;
+	init_waitqueue_head(&candidate->waitq);
+
+	if (!PageUptodate(page)) {
+		_debug("not up to date");
+		ret = afs_prepare_page(vnode, page, key, offset, to);
+		if (ret < 0) {
+			kfree(candidate);
+			_leave(" = %d [prep]", ret);
+			return ret;
+		}
+	}
+
+try_again:
+	index = page->index;
+	spin_lock(&vnode->writeback_lock);
+
+	/* see if this page is already pending a writeback under a suitable key
+	 * - if so we can just join onto that one */
+	wb = (struct afs_writeback *) page_private(page);
+	if (wb) {
+		if (wb->key == key && wb->state == AFS_WBACK_PENDING)
+			goto subsume_in_current_wb;
+		goto flush_conflicting_wb;
+	}
+
+	if (index > 0) {
+		/* see if we can find an already pending writeback that we can
+		 * append this page to */
+		list_for_each_entry(wb, &vnode->writebacks, link) {
+			if (wb->last == index - 1 && wb->key == key &&
+			    wb->state == AFS_WBACK_PENDING)
+				goto append_to_previous_wb;
+		}
+	}
+
+	list_add_tail(&candidate->link, &vnode->writebacks);
+	candidate->key = key_get(key);
+	spin_unlock(&vnode->writeback_lock);
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long) candidate);
+	_leave(" = 0 [new]");
+	return 0;
+
+subsume_in_current_wb:
+	_debug("subsume");
+	ASSERTRANGE(wb->first, <=, index, <=, wb->last);
+	if (index == wb->first && offset < wb->offset_first)
+		wb->offset_first = offset;
+	if (index == wb->last && to > wb->to_last)
+		wb->to_last = to;
+	spin_unlock(&vnode->writeback_lock);
+	kfree(candidate);
+	_leave(" = 0 [sub]");
+	return 0;
+
+append_to_previous_wb:
+	_debug("append into %lx-%lx", wb->first, wb->last);
+	wb->usage++;
+	wb->last++;
+	wb->to_last = to;
+	spin_unlock(&vnode->writeback_lock);
+	SetPagePrivate(page);
+	set_page_private(page, (unsigned long) wb);
+	kfree(candidate);
+	_leave(" = 0 [app]");
+	return 0;
+
+	/* the page is currently bound to another context, so if it's dirty we
+	 * need to flush it before we can use the new context */
+flush_conflicting_wb:
+	_debug("flush conflict");
+	if (wb->state == AFS_WBACK_PENDING)
+		wb->state = AFS_WBACK_CONFLICTING;
+	spin_unlock(&vnode->writeback_lock);
+	if (PageDirty(page)) {
+		ret = afs_write_back_from_locked_page(wb, page);
+		if (ret < 0) {
+			afs_put_writeback(candidate);
+			_leave(" = %d", ret);
+			return ret;
+		}
+	}
+
+	/* the page holds a ref on the writeback record */
+	afs_put_writeback(wb);
+	set_page_private(page, 0);
+	ClearPagePrivate(page);
+	goto try_again;
+}
+
+/*
+ * finalise part of a write to a page
+ */
+int afs_commit_write(struct file *file, struct page *page,
+		     unsigned offset, unsigned to)
+{
+	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	loff_t i_size, maybe_i_size;
+
+	_enter("{%x:%u},{%lx},%u,%u",
+	       vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+
+	maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
+	maybe_i_size += to;
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (maybe_i_size > i_size) {
+		spin_lock(&vnode->writeback_lock);
+		i_size = i_size_read(&vnode->vfs_inode);
+		if (maybe_i_size > i_size)
+			i_size_write(&vnode->vfs_inode, maybe_i_size);
+		spin_unlock(&vnode->writeback_lock);
+	}
+
+	SetPageUptodate(page);
+	set_page_dirty(page);
+	if (PageDirty(page))
+		_debug("dirtied");
+
+	return 0;
+}
+
+/*
+ * kill all the pages in the given range
+ */
+static void afs_kill_pages(struct afs_vnode *vnode, bool error,
+			   pgoff_t first, pgoff_t last)
+{
+	struct pagevec pv;
+	unsigned count, loop;
+
+	_enter("{%x:%u},%lx-%lx",
+	       vnode->fid.vid, vnode->fid.vnode, first, last);
+
+	pagevec_init(&pv, 0);
+
+	do {
+		_debug("kill %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > PAGEVEC_SIZE)
+			count = PAGEVEC_SIZE;
+		pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+					      first, count, pv.pages);
+		ASSERTCMP(pv.nr, ==, count);
+
+		for (loop = 0; loop < count; loop++) {
+			ClearPageUptodate(pv.pages[loop]);
+			if (error)
+				SetPageError(pv.pages[loop]);
+			end_page_writeback(pv.pages[loop]);
+		}
+
+		__pagevec_release(&pv);
+	} while (first < last);
+
+	_leave("");
+}
+
+/*
+ * synchronously write back the locked page and any subsequent non-locked dirty
+ * pages also covered by the same writeback record
+ */
+static int afs_write_back_from_locked_page(struct afs_writeback *wb,
+					   struct page *primary_page)
+{
+	struct page *pages[8], *page;
+	unsigned long count;
+	unsigned n, offset, to;
+	pgoff_t start, first, last;
+	int loop, ret;
+
+	_enter(",%lx", primary_page->index);
+
+	count = 1;
+	if (!clear_page_dirty_for_io(primary_page))
+		BUG();
+	if (test_set_page_writeback(primary_page))
+		BUG();
+
+	/* find all consecutive lockable dirty pages, stopping when we find a
+	 * page that is not immediately lockable, is not dirty or is missing,
+	 * or we reach the end of the range */
+	start = primary_page->index;
+	if (start >= wb->last)
+		goto no_more;
+	start++;
+	do {
+		_debug("more %lx [%lx]", start, count);
+		n = wb->last - start + 1;
+		if (n > ARRAY_SIZE(pages))
+			n = ARRAY_SIZE(pages);
+		n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping,
+					  start, n, pages);
+		_debug("fgpc %u", n);
+		if (n == 0)
+			goto no_more;
+		if (pages[0]->index != start) {
+			do {
+				put_page(pages[--n]);
+			} while (n > 0);
+			goto no_more;
+		}
+
+		for (loop = 0; loop < n; loop++) {
+			page = pages[loop];
+			if (page->index > wb->last)
+				break;
+			if (TestSetPageLocked(page))
+				break;
+			if (!PageDirty(page) ||
+			    page_private(page) != (unsigned long) wb) {
+				unlock_page(page);
+				break;
+			}
+			if (!clear_page_dirty_for_io(page))
+				BUG();
+			if (test_set_page_writeback(page))
+				BUG();
+			unlock_page(page);
+			put_page(page);
+		}
+		count += loop;
+		if (loop < n) {
+			for (; loop < n; loop++)
+				put_page(pages[loop]);
+			goto no_more;
+		}
+
+		start += loop;
+	} while (start <= wb->last && count < 65536);
+
+no_more:
+	/* we now have a contiguous set of dirty pages, each with writeback set
+	 * and the dirty mark cleared; the first page is locked and must remain
+	 * so, all the rest are unlocked */
+	first = primary_page->index;
+	last = first + count - 1;
+
+	offset = (first == wb->first) ? wb->offset_first : 0;
+	to = (last == wb->last) ? wb->to_last : PAGE_SIZE;
+
+	_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
+
+	ret = afs_vnode_store_data(wb, first, last, offset, to);
+	if (ret < 0) {
+		switch (ret) {
+		case -EDQUOT:
+		case -ENOSPC:
+			set_bit(AS_ENOSPC,
+				&wb->vnode->vfs_inode.i_mapping->flags);
+			break;
+		case -EROFS:
+		case -EIO:
+		case -EREMOTEIO:
+		case -EFBIG:
+		case -ENOENT:
+		case -ENOMEDIUM:
+		case -ENXIO:
+			afs_kill_pages(wb->vnode, true, first, last);
+			set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags);
+			break;
+		case -EACCES:
+		case -EPERM:
+		case -ENOKEY:
+		case -EKEYEXPIRED:
+		case -EKEYREJECTED:
+		case -EKEYREVOKED:
+			afs_kill_pages(wb->vnode, false, first, last);
+			break;
+		default:
+			break;
+		}
+	} else {
+		ret = count;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * write a page back to the server
+ * - the caller locked the page for us
+ */
+int afs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+	struct afs_writeback *wb;
+	int ret;
+
+	_enter("{%lx},", page->index);
+
+	wb = (struct afs_writeback *) page_private(page);
+	ASSERT(wb != NULL);
+
+	ret = afs_write_back_from_locked_page(wb, page);
+	unlock_page(page);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return 0;
+	}
+
+	wbc->nr_to_write -= ret;
+	if (wbc->nonblocking && bdi_write_congested(bdi))
+		wbc->encountered_congestion = 1;
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * write a region of pages back to the server
+ */
+int afs_writepages_region(struct address_space *mapping,
+			  struct writeback_control *wbc,
+			  pgoff_t index, pgoff_t end, pgoff_t *_next)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct afs_writeback *wb;
+	struct page *page;
+	int ret, n;
+
+	_enter(",,%lx,%lx,", index, end);
+
+	do {
+		n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
+				       1, &page);
+		if (!n)
+			break;
+
+		_debug("wback %lx", page->index);
+
+		if (page->index > end) {
+			*_next = index;
+			page_cache_release(page);
+			_leave(" = 0 [%lx]", *_next);
+			return 0;
+		}
+
+		/* at this point we hold neither mapping->tree_lock nor lock on
+		 * the page itself: the page may be truncated or invalidated
+		 * (changing page->mapping to NULL), or even swizzled back from
+		 * swapper_space to tmpfs file mapping
+		 */
+		lock_page(page);
+
+		if (page->mapping != mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			continue;
+		}
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+
+		if (PageWriteback(page) || !PageDirty(page)) {
+			unlock_page(page);
+			continue;
+		}
+
+		wb = (struct afs_writeback *) page_private(page);
+		ASSERT(wb != NULL);
+
+		spin_lock(&wb->vnode->writeback_lock);
+		wb->state = AFS_WBACK_WRITING;
+		spin_unlock(&wb->vnode->writeback_lock);
+
+		ret = afs_write_back_from_locked_page(wb, page);
+		unlock_page(page);
+		page_cache_release(page);
+		if (ret < 0) {
+			_leave(" = %d", ret);
+			return ret;
+		}
+
+		wbc->nr_to_write -= ret;
+
+		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+			wbc->encountered_congestion = 1;
+			break;
+		}
+
+		cond_resched();
+	} while (index < end && wbc->nr_to_write > 0);
+
+	*_next = index;
+	_leave(" = 0 [%lx]", *_next);
+	return 0;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int afs_writepages(struct address_space *mapping,
+		   struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	pgoff_t start, end, next;
+	int ret;
+
+	_enter("");
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		_leave(" = 0 [congest]");
+		return 0;
+	}
+
+	if (wbc->range_cyclic) {
+		start = mapping->writeback_index;
+		end = -1;
+		ret = afs_writepages_region(mapping, wbc, start, end, &next);
+		if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
+		    !(wbc->nonblocking && wbc->encountered_congestion))
+			ret = afs_writepages_region(mapping, wbc, 0, start,
+						    &next);
+		mapping->writeback_index = next;
+	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+		end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
+		ret = afs_writepages_region(mapping, wbc, 0, end, &next);
+		if (wbc->nr_to_write > 0)
+			mapping->writeback_index = next;
+	} else {
+		start = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		ret = afs_writepages_region(mapping, wbc, start, end, &next);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * write an inode back
+ */
+int afs_write_inode(struct inode *inode, int sync)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	int ret;
+
+	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+
+	ret = 0;
+	if (sync) {
+		ret = filemap_fdatawait(inode->i_mapping);
+		if (ret < 0)
+			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * completion of write to server
+ */
+void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
+{
+	struct afs_writeback *wb = call->wb;
+	struct pagevec pv;
+	unsigned count, loop;
+	pgoff_t first = call->first, last = call->last;
+	bool free_wb;
+
+	_enter("{%x:%u},{%lx-%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, first, last);
+
+	ASSERT(wb != NULL);
+
+	pagevec_init(&pv, 0);
+
+	do {
+		_debug("done %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > PAGEVEC_SIZE)
+			count = PAGEVEC_SIZE;
+		pv.nr = find_get_pages_contig(call->mapping, first, count,
+					      pv.pages);
+		ASSERTCMP(pv.nr, ==, count);
+
+		spin_lock(&vnode->writeback_lock);
+		for (loop = 0; loop < count; loop++) {
+			struct page *page = pv.pages[loop];
+			end_page_writeback(page);
+			if (page_private(page) == (unsigned long) wb) {
+				set_page_private(page, 0);
+				ClearPagePrivate(page);
+				wb->usage--;
+			}
+		}
+		free_wb = false;
+		if (wb->usage == 0) {
+			afs_unlink_writeback(wb);
+			free_wb = true;
+		}
+		spin_unlock(&vnode->writeback_lock);
+		first += count;
+		if (free_wb) {
+			afs_free_writeback(wb);
+			wb = NULL;
+		}
+
+		__pagevec_release(&pv);
+	} while (first <= last);
+
+	_leave("");
+}
+
+/*
+ * write to an AFS file
+ */
+ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
+		       unsigned long nr_segs, loff_t pos)
+{
+	struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	ssize_t result;
+	size_t count = iov_length(iov, nr_segs);
+	int ret;
+
+	_enter("{%x.%u},{%zu},%lu,",
+	       vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
+
+	if (IS_SWAPFILE(&vnode->vfs_inode)) {
+		printk(KERN_INFO
+		       "AFS: Attempt to write to active swap file!\n");
+		return -EBUSY;
+	}
+
+	if (!count)
+		return 0;
+
+	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+	if (IS_ERR_VALUE(result)) {
+		_leave(" = %zd", result);
+		return result;
+	}
+
+	/* return error values for O_SYNC and IS_SYNC() */
+	if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
+		ret = afs_fsync(iocb->ki_filp, dentry, 1);
+		if (ret < 0)
+			result = ret;
+	}
+
+	_leave(" = %zd", result);
+	return result;
+}
+
+/*
+ * flush the vnode to the fileserver
+ */
+int afs_writeback_all(struct afs_vnode *vnode)
+{
+	struct address_space *mapping = vnode->vfs_inode.i_mapping;
+	struct writeback_control wbc = {
+		.bdi		= mapping->backing_dev_info,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_to_write	= LONG_MAX,
+		.for_writepages = 1,
+		.range_cyclic	= 1,
+	};
+	int ret;
+
+	_enter("");
+
+	ret = mapping->a_ops->writepages(mapping, &wbc);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * flush any dirty pages for this process, and check for write errors.
+ * - the return status from this call provides a reliable indication of
+ *   whether any write errors occurred for this process.
+ */
+int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct afs_writeback *wb, *xwb;
+	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	int ret;
+
+	_enter("{%x:%u},{n=%s},%d",
+	       vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+	       datasync);
+
+	/* use a writeback record as a marker in the queue - when this reaches
+	 * the front of the queue, all the outstanding writes are either
+	 * completed or rejected */
+	wb = kzalloc(sizeof(*wb), GFP_KERNEL);
+	if (!wb)
+		return -ENOMEM;
+	wb->vnode = vnode;
+	wb->first = 0;
+	wb->last = -1;
+	wb->offset_first = 0;
+	wb->to_last = PAGE_SIZE;
+	wb->usage = 1;
+	wb->state = AFS_WBACK_SYNCING;
+	init_waitqueue_head(&wb->waitq);
+
+	spin_lock(&vnode->writeback_lock);
+	list_for_each_entry(xwb, &vnode->writebacks, link) {
+		if (xwb->state == AFS_WBACK_PENDING)
+			xwb->state = AFS_WBACK_CONFLICTING;
+	}
+	list_add_tail(&wb->link, &vnode->writebacks);
+	spin_unlock(&vnode->writeback_lock);
+
+	/* push all the outstanding writebacks to the server */
+	ret = afs_writeback_all(vnode);
+	if (ret < 0) {
+		afs_put_writeback(wb);
+		_leave(" = %d [wb]", ret);
+		return ret;
+	}
+
+	/* wait for the preceding writes to actually complete */
+	ret = wait_event_interruptible(wb->waitq,
+				       wb->state == AFS_WBACK_COMPLETE ||
+				       vnode->writebacks.next == &wb->link);
+	afs_put_writeback(wb);
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6d49dd..dbe699e9828c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#include <linux/eventfd.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -68,10 +69,8 @@ static void aio_queue_work(struct kioctx *);
  */
 static int __init aio_setup(void)
 {
-	kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
-				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-	kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
-				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
 
@@ -348,10 +347,9 @@ void fastcall exit_aio(struct mm_struct *mm)
 
 		wait_for_all_aios(ctx);
 		/*
-		 * this is an overkill, but ensures we don't leave
-		 * the ctx on the aio_wq
+		 * Ensure we don't leave the ctx on the aio_wq
 		 */
-		flush_workqueue(aio_wq);
+		cancel_work_sync(&ctx->wq.work);
 
 		if (1 != atomic_read(&ctx->users))
 			printk(KERN_DEBUG
@@ -374,7 +372,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
 	BUG_ON(ctx->reqs_active);
 
 	cancel_delayed_work(&ctx->wq);
-	flush_workqueue(aio_wq);
+	cancel_work_sync(&ctx->wq.work);
 	aio_free_ring(ctx);
 	mmdrop(ctx->mm);
 	ctx->mm = NULL;
@@ -420,6 +418,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
+	req->ki_eventfd = ERR_PTR(-EINVAL);
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -461,6 +460,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (!IS_ERR(req->ki_eventfd))
+		fput(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -945,6 +946,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 		return 1;
 	}
 
+	/*
+	 * Check if the user asked us to deliver the result through an
+	 * eventfd. The eventfd_signal() function is safe to be called
+	 * from IRQ context.
+	 */
+	if (!IS_ERR(iocb->ki_eventfd))
+		eventfd_signal(iocb->ki_eventfd, 1);
+
 	info = &ctx->ring_info;
 
 	/* add a completion event to the ring buffer.
@@ -1529,8 +1538,7 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
-		     iocb->aio_reserved3)) {
+	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
 		pr_debug("EINVAL: io_submit: reserve field set\n");
 		return -EINVAL;
 	}
@@ -1554,6 +1562,19 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		fput(file);
 		return -EAGAIN;
 	}
+	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
+		/*
+		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
+		 * instance of the file* now. The file descriptor must be
+		 * an eventfd() fd, and will be signaled for each completed
+		 * event using the eventfd_signal() function.
+		 */
+		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
+		if (unlikely(IS_ERR(req->ki_eventfd))) {
+			ret = PTR_ERR(req->ki_eventfd);
+			goto out_put_req;
+		}
+	}
 
 	req->ki_filp = file;
 	ret = put_user(req->ki_key, &user_iocb->aio_key);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
new file mode 100644
index 000000000000..40fe3a3222e4
--- /dev/null
+++ b/fs/anon_inodes.c
@@ -0,0 +1,200 @@
+/*
+ *  fs/anon_inodes.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *  Thanks to Arnd Bergmann for code review and suggestions.
+ *  More changes for Thomas Gleixner suggestions.
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+
+static struct vfsmount *anon_inode_mnt __read_mostly;
+static struct inode *anon_inode_inode;
+static const struct file_operations anon_inode_fops;
+
+static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC,
+			     mnt);
+}
+
+static int anon_inodefs_delete_dentry(struct dentry *dentry)
+{
+	/*
+	 * We faked vfs to believe the dentry was hashed when we created it.
+	 * Now we restore the flag so that dput() will work correctly.
+	 */
+	dentry->d_flags |= DCACHE_UNHASHED;
+	return 1;
+}
+
+static struct file_system_type anon_inode_fs_type = {
+	.name		= "anon_inodefs",
+	.get_sb		= anon_inodefs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+static struct dentry_operations anon_inodefs_dentry_operations = {
+	.d_delete	= anon_inodefs_delete_dentry,
+};
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to and
+ *                    anonymous inode, and a dentry that describe the "class"
+ *                    of the file
+ *
+ * @pfd:     [out]   pointer to the file descriptor
+ * @dpinode: [out]   pointer to the inode
+ * @pfile:   [out]   pointer to the file struct
+ * @name:    [in]    name of the "class" of the new file
+ * @fops     [in]    file operations for the new file
+ * @priv     [in]    private data for the new file (will be file's private_data)
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfd() will share a single inode, by
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup.
+ */
+int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
+		     const char *name, const struct file_operations *fops,
+		     void *priv)
+{
+	struct qstr this;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int error, fd;
+
+	if (IS_ERR(anon_inode_inode))
+		return -ENODEV;
+	file = get_empty_filp();
+	if (!file)
+		return -ENFILE;
+
+	inode = igrab(anon_inode_inode);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto err_put_filp;
+	}
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto err_iput;
+	fd = error;
+
+	/*
+	 * Link the inode to a directory entry by creating a unique name
+	 * using the inode sequence number.
+	 */
+	error = -ENOMEM;
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0;
+	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_put_unused_fd;
+	dentry->d_op = &anon_inodefs_dentry_operations;
+	/* Do not publish this dentry inside the global dentry hash table */
+	dentry->d_flags &= ~DCACHE_UNHASHED;
+	d_instantiate(dentry, inode);
+
+	file->f_path.mnt = mntget(anon_inode_mnt);
+	file->f_path.dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+
+	file->f_pos = 0;
+	file->f_flags = O_RDWR;
+	file->f_op = fops;
+	file->f_mode = FMODE_READ | FMODE_WRITE;
+	file->f_version = 0;
+	file->private_data = priv;
+
+	fd_install(fd, file);
+
+	*pfd = fd;
+	*pinode = inode;
+	*pfile = file;
+	return 0;
+
+err_put_unused_fd:
+	put_unused_fd(fd);
+err_iput:
+	iput(inode);
+err_put_filp:
+	put_filp(file);
+	return error;
+}
+
+/*
+ * A single inode exist for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes has no per-instance data associated, so we can avoid
+ * the allocation of multiple of them.
+ */
+static struct inode *anon_inode_mkinode(void)
+{
+	struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_fop = &anon_inode_fops;
+
+	/*
+	 * Mark the inode dirty from the very beginning,
+	 * that way it will never be moved to the dirty
+	 * list because mark_inode_dirty() will think
+	 * that it already _is_ on the dirty list.
+	 */
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+
+static int __init anon_inode_init(void)
+{
+	int error;
+
+	error = register_filesystem(&anon_inode_fs_type);
+	if (error)
+		goto err_exit;
+	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
+	if (IS_ERR(anon_inode_mnt)) {
+		error = PTR_ERR(anon_inode_mnt);
+		goto err_unregister_filesystem;
+	}
+	anon_inode_inode = anon_inode_mkinode();
+	if (IS_ERR(anon_inode_inode)) {
+		error = PTR_ERR(anon_inode_inode);
+		goto err_mntput;
+	}
+
+	return 0;
+
+err_mntput:
+	mntput(anon_inode_mnt);
+err_unregister_filesystem:
+	unregister_filesystem(&anon_inode_fs_type);
+err_exit:
+	panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+}
+
+fs_initcall(anon_inode_init);
+
diff --git a/fs/attr.c b/fs/attr.c
index 97de94670878..a0a0c7b07ba3 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -9,7 +9,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/capability.h>
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 4ef544434b51..8b4cca3c4705 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -101,7 +101,7 @@ struct autofs_symlink {
 struct autofs_sb_info {
 	u32 magic;
 	struct file *pipe;
-	pid_t oz_pgrp;
+	struct pid *oz_pgrp;
 	int catatonic;
 	struct super_block *sb;
 	unsigned long exp_timeout;
@@ -122,7 +122,7 @@ static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
    filesystem without "magic".) */
 
 static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || process_group(current) == sbi->oz_pgrp;
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
 /* Hash operations */
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index aa0b61ff8270..e7204d71acc9 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -34,12 +34,14 @@ void autofs_kill_sb(struct super_block *sb)
 	if (!sbi)
 		goto out_kill_sb;
 
-	if ( !sbi->catatonic )
+	if (!sbi->catatonic)
 		autofs_catatonic_mode(sbi); /* Free wait queues, close pipe */
 
+	put_pid(sbi->oz_pgrp);
+
 	autofs_hash_nuke(sbi);
-	for ( n = 0 ; n < AUTOFS_MAX_SYMLINKS ; n++ ) {
-		if ( test_bit(n, sbi->symlink_bitmap) )
+	for (n = 0; n < AUTOFS_MAX_SYMLINKS; n++) {
+		if (test_bit(n, sbi->symlink_bitmap))
 			kfree(sbi->symlink[n].data);
 	}
 
@@ -69,7 +71,8 @@ static match_table_t autofs_tokens = {
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, pid_t *pgrp, int *minproto, int *maxproto)
+static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
+		pid_t *pgrp, int *minproto, int *maxproto)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -138,9 +141,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	int minproto, maxproto;
+	pid_t pgid;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if ( !sbi )
+	if (!sbi)
 		goto fail_unlock;
 	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
 
@@ -149,7 +153,6 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipe = NULL;
 	sbi->catatonic = 1;
 	sbi->exp_timeout = 0;
-	sbi->oz_pgrp = process_group(current);
 	autofs_initialize_hash(&sbi->dirhash);
 	sbi->queues = NULL;
 	memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
@@ -169,26 +172,36 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 
 	/* Can this call block?  - WTF cares? s is locked. */
-	if ( parse_options(data,&pipefd,&root_inode->i_uid,&root_inode->i_gid,&sbi->oz_pgrp,&minproto,&maxproto) ) {
+	if (parse_options(data, &pipefd, &root_inode->i_uid,
+				&root_inode->i_gid, &pgid, &minproto,
+				&maxproto)) {
 		printk("autofs: called with bogus options\n");
 		goto fail_dput;
 	}
 
 	/* Couldn't this be tested earlier? */
-	if ( minproto > AUTOFS_PROTO_VERSION || 
-	     maxproto < AUTOFS_PROTO_VERSION ) {
+	if (minproto > AUTOFS_PROTO_VERSION ||
+	     maxproto < AUTOFS_PROTO_VERSION) {
 		printk("autofs: kernel does not match daemon version\n");
 		goto fail_dput;
 	}
 
-	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, sbi->oz_pgrp));
+	DPRINTK(("autofs: pipe fd = %d, pgrp = %u\n", pipefd, pgid));
+	sbi->oz_pgrp = find_get_pid(pgid);
+
+	if (!sbi->oz_pgrp) {
+		printk("autofs: could not find process group %d\n", pgid);
+		goto fail_dput;
+	}
+
 	pipe = fget(pipefd);
 	
-	if ( !pipe ) {
+	if (!pipe) {
 		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_dput;
+		goto fail_put_pid;
 	}
-	if ( !pipe->f_op || !pipe->f_op->write )
+
+	if (!pipe->f_op || !pipe->f_op->write)
 		goto fail_fput;
 	sbi->pipe = pipe;
 	sbi->catatonic = 0;
@@ -202,6 +215,8 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 fail_fput:
 	printk("autofs: pipe file descriptor does not contain proper ops\n");
 	fput(pipe);
+fail_put_pid:
+	put_pid(sbi->oz_pgrp);
 fail_dput:
 	dput(root);
 	goto fail_free;
@@ -230,7 +245,7 @@ static void autofs_read_inode(struct inode *inode)
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
 
-	if ( ino == AUTOFS_ROOT_INO ) {
+	if (ino == AUTOFS_ROOT_INO) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 		inode->i_op = &autofs_root_inode_operations;
 		inode->i_fop = &autofs_root_operations;
@@ -241,12 +256,12 @@ static void autofs_read_inode(struct inode *inode)
 	inode->i_uid = inode->i_sb->s_root->d_inode->i_uid;
 	inode->i_gid = inode->i_sb->s_root->d_inode->i_gid;
 	
-	if ( ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO ) {
+	if (ino >= AUTOFS_FIRST_SYMLINK && ino < AUTOFS_FIRST_DIR_INO) {
 		/* Symlink inode - should be in symlink list */
 		struct autofs_symlink *sl;
 
 		n = ino - AUTOFS_FIRST_SYMLINK;
-		if ( n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
+		if (n >= AUTOFS_MAX_SYMLINKS || !test_bit(n,sbi->symlink_bitmap)) {
 			printk("autofs: Looking for bad symlink inode %u\n", (unsigned int) ino);
 			return;
 		}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index f2597205939d..c1489533277a 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -67,8 +67,8 @@ static int autofs_root_readdir(struct file *filp, void *dirent, filldir_t filldi
 		filp->f_pos = ++nr;
 		/* fall through */
 	default:
-		while ( onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent) ) {
-			if ( !ent->dentry || d_mountpoint(ent->dentry) ) {
+		while (onr = nr, ent = autofs_hash_enum(dirhash,&nr,ent)) {
+			if (!ent->dentry || d_mountpoint(ent->dentry)) {
 				if (filldir(dirent,ent->name,ent->len,onr,ent->ino,DT_UNKNOWN) < 0)
 					goto out;
 				filp->f_pos = nr;
@@ -88,10 +88,10 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str
 	struct autofs_dir_ent *ent;
 	int status = 0;
 
-	if ( !(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)) ) {
+	if (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name))) {
 		do {
-			if ( status && dentry->d_inode ) {
-				if ( status != -ENOENT )
+			if (status && dentry->d_inode) {
+				if (status != -ENOENT)
 					printk("autofs warning: lookup failure on positive dentry, status = %d, name = %s\n", status, dentry->d_name.name);
 				return 0; /* Try to get the kernel to invalidate this dentry */
 			}
@@ -106,7 +106,7 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str
 				return 1;
 			}
 			status = autofs_wait(sbi, &dentry->d_name);
-		} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)) );
+		} while (!(ent = autofs_hash_lookup(&sbi->dirhash, &dentry->d_name)));
 	}
 
 	/* Abuse this field as a pointer to the directory entry, used to
@@ -124,13 +124,13 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str
 
 	/* If this is a directory that isn't a mount point, bitch at the
 	   daemon and fix it in user space */
-	if ( S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) ) {
+	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
 		return !autofs_wait(sbi, &dentry->d_name);
 	}
 
 	/* We don't update the usages for the autofs daemon itself, this
 	   is necessary for recursive autofs mounts */
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		autofs_update_usage(&sbi->dirhash,ent);
 	}
 
@@ -157,7 +157,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	sbi = autofs_sbi(dir->i_sb);
 
 	/* Pending dentry */
-	if ( dentry->d_flags & DCACHE_AUTOFS_PENDING ) {
+	if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
 		if (autofs_oz_mode(sbi))
 			res = 1;
 		else
@@ -173,7 +173,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	}
 		
 	/* Check for a non-mountpoint directory */
-	if ( S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) ) {
+	if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry)) {
 		if (autofs_oz_mode(sbi))
 			res = 1;
 		else
@@ -183,9 +183,9 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	}
 
 	/* Update the usage list */
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		ent = (struct autofs_dir_ent *) dentry->d_time;
-		if ( ent )
+		if (ent)
 			autofs_update_usage(&sbi->dirhash,ent);
 	}
 	unlock_kernel();
@@ -213,8 +213,10 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr
 	sbi = autofs_sbi(dir->i_sb);
 
 	oz_mode = autofs_oz_mode(sbi);
-	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
-		 current->pid, process_group(current), sbi->catatonic, oz_mode));
+	DPRINTK(("autofs_lookup: pid = %u, pgrp = %u, catatonic = %d, "
+				"oz_mode = %d\n", pid_nr(task_pid(current)),
+				process_group(current), sbi->catatonic,
+				oz_mode));
 
 	/*
 	 * Mark the dentry incomplete, but add it. This is needed so
@@ -258,7 +260,7 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr
 	 * doesn't do the right thing for all system calls, but it should
 	 * be OK for the operations we permit from an autofs.
 	 */
-	if ( dentry->d_inode && d_unhashed(dentry) )
+	if (dentry->d_inode && d_unhashed(dentry))
 		return ERR_PTR(-ENOENT);
 
 	return NULL;
@@ -277,18 +279,18 @@ static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const c
 	autofs_say(dentry->d_name.name,dentry->d_name.len);
 
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
-	if ( autofs_hash_lookup(dh, &dentry->d_name) ) {
+	if (autofs_hash_lookup(dh, &dentry->d_name)) {
 		unlock_kernel();
 		return -EEXIST;
 	}
 
 	n = find_first_zero_bit(sbi->symlink_bitmap,AUTOFS_MAX_SYMLINKS);
-	if ( n >= AUTOFS_MAX_SYMLINKS ) {
+	if (n >= AUTOFS_MAX_SYMLINKS) {
 		unlock_kernel();
 		return -ENOSPC;
 	}
@@ -297,14 +299,14 @@ static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const c
 	sl = &sbi->symlink[n];
 	sl->len = strlen(symname);
 	sl->data = kmalloc(slsize = sl->len+1, GFP_KERNEL);
-	if ( !sl->data ) {
+	if (!sl->data) {
 		clear_bit(n,sbi->symlink_bitmap);
 		unlock_kernel();
 		return -ENOSPC;
 	}
 
 	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if ( !ent ) {
+	if (!ent) {
 		kfree(sl->data);
 		clear_bit(n,sbi->symlink_bitmap);
 		unlock_kernel();
@@ -312,7 +314,7 @@ static int autofs_root_symlink(struct inode *dir, struct dentry *dentry, const c
 	}
 
 	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if ( !ent->name ) {
+	if (!ent->name) {
 		kfree(sl->data);
 		kfree(ent);
 		clear_bit(n,sbi->symlink_bitmap);
@@ -354,23 +356,23 @@ static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* This allows root to remove symlinks */
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) {
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
 	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if ( !ent ) {
+	if (!ent) {
 		unlock_kernel();
 		return -ENOENT;
 	}
 
 	n = ent->ino - AUTOFS_FIRST_SYMLINK;
-	if ( n >= AUTOFS_MAX_SYMLINKS ) {
+	if (n >= AUTOFS_MAX_SYMLINKS) {
 		unlock_kernel();
 		return -EISDIR;	/* It's a directory, dummy */
 	}
-	if ( !test_bit(n,sbi->symlink_bitmap) ) {
+	if (!test_bit(n,sbi->symlink_bitmap)) {
 		unlock_kernel();
 		return -EINVAL;	/* Nonexistent symlink?  Shouldn't happen */
 	}
@@ -392,23 +394,23 @@ static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
 	struct autofs_dir_ent *ent;
 
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
 	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if ( !ent ) {
+	if (!ent) {
 		unlock_kernel();
 		return -ENOENT;
 	}
 
-	if ( (unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO ) {
+	if ((unsigned int)ent->ino < AUTOFS_FIRST_DIR_INO) {
 		unlock_kernel();
 		return -ENOTDIR; /* Not a directory */
 	}
 
-	if ( ent->dentry != dentry ) {
+	if (ent->dentry != dentry) {
 		printk("autofs_rmdir: odentry != dentry for entry %s\n", dentry->d_name.name);
 	}
 
@@ -429,18 +431,18 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ino_t ino;
 
 	lock_kernel();
-	if ( !autofs_oz_mode(sbi) ) {
+	if (!autofs_oz_mode(sbi)) {
 		unlock_kernel();
 		return -EACCES;
 	}
 
 	ent = autofs_hash_lookup(dh, &dentry->d_name);
-	if ( ent ) {
+	if (ent) {
 		unlock_kernel();
 		return -EEXIST;
 	}
 
-	if ( sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO ) {
+	if (sbi->next_dir_ino < AUTOFS_FIRST_DIR_INO) {
 		printk("autofs: Out of inode numbers -- what the heck did you do??\n");
 		unlock_kernel();
 		return -ENOSPC;
@@ -448,13 +450,13 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ino = sbi->next_dir_ino++;
 
 	ent = kmalloc(sizeof(struct autofs_dir_ent), GFP_KERNEL);
-	if ( !ent ) {
+	if (!ent) {
 		unlock_kernel();
 		return -ENOSPC;
 	}
 
 	ent->name = kmalloc(dentry->d_name.len+1, GFP_KERNEL);
-	if ( !ent->name ) {
+	if (!ent->name) {
 		kfree(ent);
 		unlock_kernel();
 		return -ENOSPC;
@@ -483,7 +485,7 @@ static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
 	    put_user(sbi->exp_timeout / HZ, p))
 		return -EFAULT;
 
-	if ( ntimeout > ULONG_MAX/HZ )
+	if (ntimeout > ULONG_MAX/HZ)
 		sbi->exp_timeout = 0;
 	else
 		sbi->exp_timeout = ntimeout * HZ;
@@ -511,15 +513,14 @@ static inline int autofs_expire_run(struct super_block *sb,
 	pkt.hdr.proto_version = AUTOFS_PROTO_VERSION;
 	pkt.hdr.type = autofs_ptype_expire;
 
-	if ( !sbi->exp_timeout ||
-	     !(ent = autofs_expire(sb,sbi,mnt)) )
+	if (!sbi->exp_timeout || !(ent = autofs_expire(sb,sbi,mnt)))
 		return -EAGAIN;
 
 	pkt.len = ent->len;
 	memcpy(pkt.name, ent->name, pkt.len);
 	pkt.name[pkt.len] = '\0';
 
-	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		return -EFAULT;
 
 	return 0;
@@ -537,11 +538,11 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
 
 	DPRINTK(("autofs_ioctl: cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",cmd,arg,sbi,process_group(current)));
 
-	if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT )
+	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
+	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
 	
-	if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) )
+	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	
 	switch(cmd) {
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 26063dc84a2a..692364e8ffc3 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -18,7 +18,6 @@
 #include <linux/pagemap.h>
 #include <linux/parser.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/magic.h>
 #include "autofs_i.h"
 #include <linux/module.h>
@@ -219,8 +218,7 @@ static match_table_t tokens = {
 };
 
 static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
-			 pid_t *pgrp, unsigned int *type,
-			 int *minproto, int *maxproto)
+		pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -315,7 +313,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	struct autofs_info *ino;
 
 	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
-	if ( !sbi )
+	if (!sbi)
 		goto fail_unlock;
 	DPRINTK("starting up, sbi = %p",sbi);
 
@@ -364,10 +362,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
-	if (parse_options(data, &pipefd,
-			  &root_inode->i_uid, &root_inode->i_gid,
-			  &sbi->oz_pgrp, &sbi->type,
-			  &sbi->min_proto, &sbi->max_proto)) {
+	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
+				&sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
+				&sbi->max_proto)) {
 		printk("autofs: called with bogus options\n");
 		goto fail_dput;
 	}
@@ -397,11 +394,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
 	pipe = fget(pipefd);
 	
-	if ( !pipe ) {
+	if (!pipe) {
 		printk("autofs: could not open pipe file descriptor\n");
 		goto fail_dput;
 	}
-	if ( !pipe->f_op || !pipe->f_op->write )
+	if (!pipe->f_op || !pipe->f_op->write)
 		goto fail_fput;
 	sbi->pipe = pipe;
 	sbi->pipefd = pipefd;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d0e9b3a3905d..2d4c8a3e604e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -17,7 +17,6 @@
 #include <linux/stat.h>
 #include <linux/param.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
@@ -760,7 +759,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *p_ino;
 	
 	/* This allows root to remove symlinks */
-	if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) )
+	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
 	if (atomic_dec_and_test(&ino->count)) {
@@ -834,7 +833,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct autofs_info *p_ino;
 	struct inode *inode;
 
-	if ( !autofs4_oz_mode(sbi) )
+	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
 	DPRINTK("dentry %p, creating %.*s",
@@ -872,11 +871,11 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
 	int rv;
 	unsigned long ntimeout;
 
-	if ( (rv = get_user(ntimeout, p)) ||
-	     (rv = put_user(sbi->exp_timeout/HZ, p)) )
+	if ((rv = get_user(ntimeout, p)) ||
+	     (rv = put_user(sbi->exp_timeout/HZ, p)))
 		return rv;
 
-	if ( ntimeout > ULONG_MAX/HZ )
+	if (ntimeout > ULONG_MAX/HZ)
 		sbi->exp_timeout = 0;
 	else
 		sbi->exp_timeout = ntimeout * HZ;
@@ -907,7 +906,7 @@ static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p)
 	DPRINTK("returning %d", sbi->needs_reghost);
 
 	status = put_user(sbi->needs_reghost, p);
-	if ( status )
+	if (status)
 		return status;
 
 	sbi->needs_reghost = 0;
@@ -976,11 +975,11 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
 	DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
 		cmd,arg,sbi,process_group(current));
 
-	if ( _IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
-	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT )
+	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
+	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
 	
-	if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) )
+	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	
 	switch(cmd) {
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index efeab2fab40b..329ee473eede 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/stat.h>
 #include <linux/time.h>
-#include <linux/smp_lock.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cc6cc8ed2e39..a5c5171c2828 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -292,11 +292,8 @@ befs_destroy_inode(struct inode *inode)
 static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
-	
-	        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-		            SLAB_CTOR_CONSTRUCTOR) {
-			inode_init_once(&bi->vfs_inode);
-		}
+
+	inode_init_once(&bi->vfs_inode);
 }
 
 static void
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 93d6219243ad..58c7bd9f5301 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,9 +248,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct bfs_inode_info *bi = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&bi->vfs_inode);
+	inode_init_once(&bi->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9cc4f0a8aaae..fa8ea33ab0be 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,7 +31,6 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -39,6 +38,7 @@
 #include <linux/syscalls.h>
 #include <linux/random.h>
 #include <linux/elf.h>
+#include <linux/utsname.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -871,6 +871,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 				elf_prot, elf_flags);
 		if (BAD_ADDR(error)) {
 			send_sig(SIGKILL, current, 0);
+			retval = IS_ERR((void *)error) ?
+				PTR_ERR((void*)error) : -EINVAL;
 			goto out_free_dentry;
 		}
 
@@ -900,6 +902,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		    TASK_SIZE - elf_ppnt->p_memsz < k) {
 			/* set_brk can never work. Avoid overflows. */
 			send_sig(SIGKILL, current, 0);
+			retval = -EINVAL;
 			goto out_free_dentry;
 		}
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f3ddca4a387b..9d62fbad3d4b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -30,7 +30,6 @@
 #include <linux/personality.h>
 #include <linux/ptrace.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/elf.h>
 #include <linux/elf-fdpic.h>
 #include <linux/elfcore.h>
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 1f2d1ad63319..576dd7de2278 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e6f57990b121..330fd3fe8546 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -18,7 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-
+#include <linux/sched.h>
 #include <linux/binfmts.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
@@ -675,19 +675,8 @@ static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char *s = enabled ? "enabled" : "disabled";
-	int len = strlen(s);
-	loff_t pos = *ppos;
 
-	if (pos < 0)
-		return -EINVAL;
-	if (pos >= len)
-		return 0;
-	if (len < pos + nbytes)
-		nbytes = len - pos;
-	if (copy_to_user(buf, s + pos, nbytes))
-		return -EFAULT;
-	*ppos = pos + nbytes;
-	return nbytes;
+	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
 
 static ssize_t bm_status_write(struct file * file, const char __user * buffer,
@@ -727,8 +716,8 @@ static const struct super_operations s_ops = {
 static int bm_fill_super(struct super_block * sb, void * data, int silent)
 {
 	static struct tree_descr bm_files[] = {
-		[1] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
-		[2] = {"register", &bm_register_operations, S_IWUSR},
+		[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
+		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
 	int err = simple_fill_super(sb, 0x42494e4d, bm_files);
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 1edbcca25a73..304c88544d89 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -12,7 +12,6 @@
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
-#include <linux/smp_lock.h>
 #include <linux/err.h>
 #include <linux/fs.h>
 
diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb18368..093345f00128 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,7 +28,7 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
-#define BIO_POOL_SIZE 256
+#define BIO_POOL_SIZE 2
 
 static struct kmem_cache *bio_slab __read_mostly;
 
@@ -38,7 +38,7 @@ static struct kmem_cache *bio_slab __read_mostly;
  * a small number of entries is fine, not going to be performance critical.
  * basically we just need to survive
  */
-#define BIO_SPLIT_ENTRIES 8	
+#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
 
 struct biovec_slab {
@@ -1120,7 +1120,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
+static int biovec_create_pools(struct bio_set *bs, int pool_entries)
 {
 	int i;
 
@@ -1128,9 +1128,6 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
 		struct biovec_slab *bp = bvec_slabs + i;
 		mempool_t **bvp = bs->bvec_pools + i;
 
-		if (pool_entries > 1 && i >= scale)
-			pool_entries >>= 1;
-
 		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
 		if (!*bvp)
 			return -ENOMEM;
@@ -1161,7 +1158,7 @@ void bioset_free(struct bio_set *bs)
 	kfree(bs);
 }
 
-struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
+struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
 {
 	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
 
@@ -1172,7 +1169,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (!biovec_create_pools(bs, bvec_pool_size, scale))
+	if (!biovec_create_pools(bs, bvec_pool_size))
 		return bs;
 
 bad:
@@ -1196,38 +1193,11 @@ static void __init biovec_init_slabs(void)
 
 static int __init init_bio(void)
 {
-	int megabytes, bvec_pool_entries;
-	int scale = BIOVEC_NR_POOLS;
-
-	bio_slab = kmem_cache_create("bio", sizeof(struct bio), 0,
-				SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+	bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	biovec_init_slabs();
 
-	megabytes = nr_free_pages() >> (20 - PAGE_SHIFT);
-
-	/*
-	 * find out where to start scaling
-	 */
-	if (megabytes <= 16)
-		scale = 0;
-	else if (megabytes <= 32)
-		scale = 1;
-	else if (megabytes <= 64)
-		scale = 2;
-	else if (megabytes <= 96)
-		scale = 3;
-	else if (megabytes <= 128)
-		scale = 4;
-
-	/*
-	 * Limit number of entries reserved -- mempools are only used when
-	 * the system is completely unable to allocate memory, so we only
-	 * need enough to make progress.
-	 */
-	bvec_pool_entries = 1 + scale;
-
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, bvec_pool_entries, scale);
+	fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 575076c018f4..ea1480a16f51 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -22,6 +22,7 @@
 #include <linux/mount.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
+#include <linux/log2.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -55,17 +56,19 @@ static sector_t max_block(struct block_device *bdev)
 	return retval;
 }
 
-/* Kill _all_ buffers, dirty or not.. */
+/* Kill _all_ buffers and pagecache , dirty or not.. */
 static void kill_bdev(struct block_device *bdev)
 {
-	invalidate_bdev(bdev, 1);
+	if (bdev->bd_inode->i_mapping->nrpages == 0)
+		return;
+	invalidate_bh_lrus();
 	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
 }	
 
 int set_blocksize(struct block_device *bdev, int size)
 {
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
-	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
+	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
 
 	/* Size cannot be smaller than the size supported by the device */
@@ -455,19 +458,15 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-	{
-		memset(bdev, 0, sizeof(*bdev));
-		mutex_init(&bdev->bd_mutex);
-		sema_init(&bdev->bd_mount_sem, 1);
-		INIT_LIST_HEAD(&bdev->bd_inodes);
-		INIT_LIST_HEAD(&bdev->bd_list);
+	memset(bdev, 0, sizeof(*bdev));
+	mutex_init(&bdev->bd_mutex);
+	sema_init(&bdev->bd_mount_sem, 1);
+	INIT_LIST_HEAD(&bdev->bd_inodes);
+	INIT_LIST_HEAD(&bdev->bd_list);
 #ifdef CONFIG_SYSFS
-		INIT_LIST_HEAD(&bdev->bd_holder_list);
+	INIT_LIST_HEAD(&bdev->bd_holder_list);
 #endif
-		inode_init_once(&ei->vfs_inode);
-	}
+	inode_init_once(&ei->vfs_inode);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1478,7 +1477,7 @@ int __invalidate_device(struct block_device *bdev)
 		res = invalidate_inodes(sb);
 		drop_super(sb);
 	}
-	invalidate_bdev(bdev, 0);
+	invalidate_bdev(bdev);
 	return res;
 }
 EXPORT_SYMBOL(__invalidate_device);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1d0852fa728b..aa68206bd517 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -24,7 +24,6 @@
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/capability.h>
 #include <linux/blkdev.h>
 #include <linux/file.h>
@@ -44,7 +43,6 @@
 #include <linux/bit_spinlock.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static void invalidate_bh_lrus(void);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -333,7 +331,7 @@ out:
    we think the disk contains more recent information than the buffercache.
    The update == 1 pass marks the buffers we need to update, the update == 2
    pass does the actual I/O. */
-void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
+void invalidate_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
@@ -341,11 +339,6 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 		return;
 
 	invalidate_bh_lrus();
-	/*
-	 * FIXME: what about destroy_dirty_buffers?
-	 * We really want to use invalidate_inode_pages2() for
-	 * that, but not until that's cleaned up.
-	 */
 	invalidate_mapping_pages(mapping, 0, -1);
 }
 
@@ -988,7 +981,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct page *page;
 	struct buffer_head *bh;
 
-	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	page = find_or_create_page(inode->i_mapping, index,
+		mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 	if (!page)
 		return NULL;
 
@@ -1408,7 +1402,7 @@ static void invalidate_bh_lru(void *arg)
 	put_cpu_var(bh_lrus);
 }
 	
-static void invalidate_bh_lrus(void)
+void invalidate_bh_lrus(void)
 {
 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
 }
@@ -1700,17 +1694,8 @@ done:
 		 * clean.  Someone wrote them back by hand with
 		 * ll_rw_block/submit_bh.  A rare case.
 		 */
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (uptodate)
-			SetPageUptodate(page);
 		end_page_writeback(page);
+
 		/*
 		 * The page and buffer_heads can be released at any time from
 		 * here on.
@@ -1742,6 +1727,7 @@ recover:
 	} while ((bh = bh->b_this_page) != head);
 	SetPageError(page);
 	BUG_ON(PageWriteback(page));
+	mapping_set_error(page->mapping, err);
 	set_page_writeback(page);
 	do {
 		struct buffer_head *next = bh->b_this_page;
@@ -1861,13 +1847,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (block_start >= to)
 			break;
 		if (buffer_new(bh)) {
-			void *kaddr;
-
 			clear_buffer_new(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr+block_start, 0, bh->b_size);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
+			zero_user_page(page, block_start, bh->b_size, KM_USER0);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
@@ -1955,10 +1936,8 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 					SetPageError(page);
 			}
 			if (!buffer_mapped(bh)) {
-				void *kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + i * blocksize, 0, blocksize);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
+				zero_user_page(page, i * blocksize, blocksize,
+						KM_USER0);
 				if (!err)
 					set_buffer_uptodate(bh);
 				continue;
@@ -2101,7 +2080,6 @@ int cont_prepare_write(struct page *page, unsigned offset,
 	long status;
 	unsigned zerofrom;
 	unsigned blocksize = 1 << inode->i_blkbits;
-	void *kaddr;
 
 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
 		status = -ENOMEM;
@@ -2123,10 +2101,8 @@ int cont_prepare_write(struct page *page, unsigned offset,
 						PAGE_CACHE_SIZE, get_block);
 		if (status)
 			goto out_unmap;
-		kaddr = kmap_atomic(new_page, KM_USER0);
-		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
-		flush_dcache_page(new_page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(new_page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
+				KM_USER0);
 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
 		unlock_page(new_page);
 		page_cache_release(new_page);
@@ -2153,10 +2129,7 @@ int cont_prepare_write(struct page *page, unsigned offset,
 	if (status)
 		goto out1;
 	if (zerofrom < offset) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr+zerofrom, 0, offset-zerofrom);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
 		__block_commit_write(inode, page, zerofrom, offset);
 	}
 	return 0;
@@ -2355,10 +2328,7 @@ failed:
 	 * Error recovery is pretty slack.  Clear the page and mark it dirty
 	 * so we'll later zero out any blocks which _were_ allocated.
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
 	SetPageUptodate(page);
 	set_page_dirty(page);
 	return ret;
@@ -2397,7 +2367,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	void *kaddr;
 	int ret;
 
 	/* Is the page fully inside i_size? */
@@ -2428,10 +2397,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
@@ -2452,7 +2418,6 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned to;
 	struct page *page;
 	const struct address_space_operations *a_ops = mapping->a_ops;
-	char *kaddr;
 	int ret = 0;
 
 	if ((offset & (blocksize - 1)) == 0)
@@ -2466,10 +2431,8 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	to = (offset + blocksize) & ~(blocksize - 1);
 	ret = a_ops->prepare_write(NULL, page, offset, to);
 	if (ret == 0) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
+				KM_USER0);
 		/*
 		 * It would be more correct to call aops->commit_write()
 		 * here, but this is more efficient.
@@ -2495,7 +2458,6 @@ int block_truncate_page(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head *bh;
-	void *kaddr;
 	int err;
 
 	blocksize = 1 << inode->i_blkbits;
@@ -2549,11 +2511,7 @@ int block_truncate_page(struct address_space *mapping,
 			goto unlock;
 	}
 
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, length);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-
+	zero_user_page(page, offset, length, KM_USER0);
 	mark_buffer_dirty(bh);
 	err = 0;
 
@@ -2574,7 +2532,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	void *kaddr;
 
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
@@ -2600,10 +2557,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
 	return __block_write_full_page(inode, page, get_block, wbc);
 }
 
@@ -2945,8 +2899,9 @@ static void recalc_bh_state(void)
 	
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
-	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
+		INIT_LIST_HEAD(&ret->b_assoc_buffers);
 		get_cpu_var(bh_accounting).nr++;
 		recalc_bh_state();
 		put_cpu_var(bh_accounting);
@@ -2965,18 +2920,6 @@ void free_buffer_head(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(free_buffer_head);
 
-static void
-init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
-{
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			    SLAB_CTOR_CONSTRUCTOR) {
-		struct buffer_head * bh = (struct buffer_head *)data;
-
-		memset(bh, 0, sizeof(*bh));
-		INIT_LIST_HEAD(&bh->b_assoc_buffers);
-	}
-}
-
 static void buffer_exit_cpu(int cpu)
 {
 	int i;
@@ -2994,7 +2937,7 @@ static void buffer_exit_cpu(int cpu)
 static int buffer_cpu_notify(struct notifier_block *self,
 			      unsigned long action, void *hcpu)
 {
-	if (action == CPU_DEAD)
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		buffer_exit_cpu((unsigned long)hcpu);
 	return NOTIFY_OK;
 }
@@ -3003,12 +2946,8 @@ void __init buffer_init(void)
 {
 	int nrpages;
 
-	bh_cachep = kmem_cache_create("buffer_head",
-					sizeof(struct buffer_head), 0,
-					(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					SLAB_MEM_SPREAD),
-					init_buffer_head,
-					NULL);
+	bh_cachep = KMEM_CACHE(buffer_head,
+			SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 5d1f4873d701..a9b6bc5157b8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,4 +1,16 @@
-Verison 1.48
+Version 1.49
+------------
+IPv6 support.  Enable ipv6 addresses to be passed on mount (put the ipv6
+address after the "ip=" mount option, at least until mount.cifs is fixed to
+handle DNS host to ipv6 name translation).  Accept override of uid or gid
+on mount even when Unix Extensions are negotiated (it used to be ignored
+when Unix Extensions were ignored).  This allows users to override the
+default uid and gid for files when they are certain that the uids or
+gids on the server do not match those of the client.  Make "sec=none"
+mount override username (so that null user connection is attempted)
+to match what documentation said.
+
+Version 1.48
 ------------
 Fix mtime bouncing around from local idea of last write times to remote time.
 Fix hang (in i_size_read) when simultaneous size update of same remote file
@@ -9,7 +21,13 @@ from read-only back to read-write, reflect this change in default file mode
 (we had been leaving a file's mode read-only until the inode were reloaded).
 Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
 when archive dos attribute not set and we are changing mode back to writeable
-on server which does not support the Unix Extensions).
+on server which does not support the Unix Extensions).  Remove read only dos
+attribute on chmod when adding any write permission (ie on any of
+user/group/other (not all of user/group/other ie  0222) when
+mounted to windows.  Add support for POSIX MkDir (slight performance
+enhancement and eliminates the network race between the mkdir and set 
+path info of the mode).
+
 
 Version 1.47
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 080c5eba112b..4d01697722cc 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -257,13 +257,19 @@ A partial list of the supported mount options follows:
 		mount.	
   domain	Set the SMB/CIFS workgroup name prepended to the
 		username during CIFS session establishment
-  uid		If CIFS Unix extensions are not supported by the server
-		this overrides the default uid for inodes. For mounts to
-		servers which do support the CIFS Unix extensions, such
-		as a properly configured Samba server, the server provides
-		the uid, gid and mode.  For servers which do not support
-		the Unix extensions, the default uid (and gid) returned on
-		lookup of existing files is the uid (gid) of the person
+  uid		Set the default uid for inodes. For mounts to servers
+		which do support the CIFS Unix extensions, such as a
+		properly configured Samba server, the server provides
+		the uid, gid and mode so this parameter should  not be
+		specified unless the server and clients uid and gid
+		numbering differ.  If the server and client are in the
+		same domain (e.g. running winbind or nss_ldap) and
+		the server supports the Unix Extensions then the uid
+		and gid can be retrieved from the server (and uid
+		and gid would not have to be specifed on the mount. 
+		For servers which do not support the CIFS Unix
+		extensions, the default uid (and gid) returned on lookup
+		of existing files will be the uid (gid) of the person
 		who executed the mount (root, except when mount.cifs
 		is configured setuid for user mounts) unless the "uid=" 
 		(gid) mount option is specified.  For the uid (gid) of newly
@@ -281,8 +287,7 @@ A partial list of the supported mount options follows:
 		the client.  Note that the mount.cifs helper must be
 		at version 1.10 or higher to support specifying the uid
 		(or gid) in non-numberic form.
-  gid		If CIFS Unix extensions are not supported by the server
-		this overrides the default gid for inodes.
+  gid		Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
 		this overrides the default mode for file inodes.
   dir_mode      If CIFS Unix extensions are not supported by the server 
@@ -467,7 +472,7 @@ including:
 	-V      print mount.cifs version
 	-?      display simple usage information
 
-With recent 2.6 kernel versions of modutils, the version of the cifs kernel
+With most 2.6 kernel versions of modutils, the version of the cifs kernel
 module can be displayed via modinfo.
 
 Misc /proc/fs/cifs Flags and Debug Info
@@ -516,8 +521,22 @@ SecurityFlags		Flags which control security negotiation and
 			must use plaintext passwords			0x20020
 			(reserved for future packet encryption)		0x00040
 
-cifsFYI			If set to one, additional debug information is
-			logged to the system error log. (default 0)
+cifsFYI			If set to non-zero value, additional debug information
+			will be logged to the system error log.  This field
+			contains three flags controlling different classes of
+			debugging entries.  The maximum value it can be set
+			to is 7 which enables all debugging points (default 0).
+			Some debugging statements are not compiled into the
+			cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
+			kernel configuration. cifsFYI may be set to one or
+			nore of the following flags (7 sets them all):
+
+			log cifs informational messages			0x01
+			log return codes from cifs entry points		0x02
+			log slow responses (ie which take longer than 1 second)
+			  CONFIG_CIFS_STATS2 must be enabled in .config	0x04
+				
+				
 traceSMB		If set to one, debug information is logged to the
 			system error log with the start of smb requests
 			and responses (default 0)
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index d7b9c27c942d..78b620e332bd 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
-Version 1.39 November 30, 2005
+Version 1.49 April 26, 2007
 
 A Partial List of Missing Features
 ==================================
@@ -18,7 +18,7 @@ better)
 
 d) Kerberos/SPNEGO session setup support - (started)
 
-e) NTLMv2 authentication (mostly implemented - double check
+e) More testing of NTLMv2 authentication (mostly implemented - double check
 that NTLMv2 signing works, also need to cleanup now unneeded SessSetup code in
 fs/cifs/connect.c)
 
@@ -27,55 +27,44 @@ used (Kerberos or NTLMSSP). Signing alreadyimplemented for NTLM
 and raw NTLMSSP already. This is important when enabling
 extended security and mounting to Windows 2003 Servers
 
-f) Directory entry caching relies on a 1 second timer, rather than 
+g) Directory entry caching relies on a 1 second timer, rather than 
 using FindNotify or equivalent.  - (started)
 
-g) A few byte range testcases fail due to POSIX vs. Windows/CIFS
-style byte range lock differences.  Save byte range locks so
-reconnect can replay them.  
-
-h) Support unlock all (unlock 0,MAX_OFFSET)
-by unlocking all known byte range locks that we locked on the file.
-
-i) quota support (needs minor kernel change since quota calls
+h) quota support (needs minor kernel change since quota calls
 to make it to network filesystems or deviceless filesystems)
 
-j) investigate sync behavior (including syncpage) and check  
+i) investigate sync behavior (including syncpage) and check  
 for proper behavior of intr/nointr
 
-k) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
+j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
 extra copy in/out of the socket buffers in some cases.
 
-l) finish support for IPv6.  This is mostly complete but
-needs a simple conversion of ipv6 to sin6_addr from the
-address in string representation.
-
-m) Better optimize open (and pathbased setfilesize) to reduce the
+k) Better optimize open (and pathbased setfilesize) to reduce the
 oplock breaks coming from windows srv.  Piggyback identical file
 opens on top of each other by incrementing reference count rather
 than resending (helps reduce server resource utilization and avoid
 spurious oplock breaks).
 
-o) Improve performance of readpages by sending more than one read
+l) Improve performance of readpages by sending more than one read
 at a time when 8 pages or more are requested. In conjuntion
 add support for async_cifs_readpages.
 
-p) Add support for storing symlink info to Windows servers 
+m) Add support for storing symlink info to Windows servers 
 in the Extended Attribute format their SFU clients would recognize.
 
-q) Finish fcntl D_NOTIFY support so kde and gnome file list windows
+n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
 will autorefresh (partially complete by Asser). Needs minor kernel
 vfs change to support removing D_NOTIFY on a file.   
 
-r) Add GUI tool to configure /proc/fs/cifs settings and for display of
+o) Add GUI tool to configure /proc/fs/cifs settings and for display of
 the CIFS statistics (started)
 
-s) implement support for security and trusted categories of xattrs
+p) implement support for security and trusted categories of xattrs
 (requires minor protocol extension) to enable better support for SELINUX
 
-t) Implement O_DIRECT flag on open (already supported on mount)
+q) Implement O_DIRECT flag on open (already supported on mount)
 
-u) Create UID mapping facility so server UIDs can be mapped on a per
+r) Create UID mapping facility so server UIDs can be mapped on a per
 mount or a per server basis to client UIDs or nobody if no mapping
 exists.  This is helpful when Unix extensions are negotiated to
 allow better permission checking when UIDs differ on the server
@@ -83,19 +72,26 @@ and client.  Add new protocol request to the CIFS protocol
 standard for asking the server for the corresponding name of a
 particular uid.
 
-v) Add support for CIFS Unix and also the newer POSIX extensions to the
+s) Add support for CIFS Unix and also the newer POSIX extensions to the
 server side for Samba 4.
 
-w) Finish up the dos time conversion routines needed to return old server
-time to the client (default time, of now or time 0 is used now for these 
-very old servers)
-
-x) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) 
+t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) 
 need to add ability to set time to server (utimes command)
 
-y) Finish testing of Windows 9x/Windows ME server support (started).
+u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
+
+v) mount check for unmatched uids
+
+w) Add mount option for Linux extension disable per mount, and partial
+disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
 
-KNOWN BUGS (updated February 26, 2007)
+x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
+processes can proceed better in parallel (on the server)
+
+y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
+restriction of wsize max being 127K) 
+
+KNOWN BUGS (updated April 24, 2007)
 ====================================
 See http://bugzilla.samba.org - search on product "CifsVFS" for
 current bug list.
@@ -127,10 +123,3 @@ negotiated size) and send larger write sizes to modern servers.
 4) More exhaustively test against less common servers.  More testing
 against Windows 9x, Windows ME servers.
 
-DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
-
-mount check for unmatched uids - and uid override
-
-Add mount option for Linux extension disable per mount, and partial disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?) 
-
-Free threads at umount --force that are stuck on the sesSem
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index fd1e52ebcee6..4cc2012e9322 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -22,12 +22,14 @@
 #define CIFS_MOUNT_SET_UID      2 /* set current->euid in create etc. */
 #define CIFS_MOUNT_SERVER_INUM  4 /* inode numbers from uniqueid from server */
 #define CIFS_MOUNT_DIRECT_IO    8 /* do not write nor read through page cache */
-#define CIFS_MOUNT_NO_XATTR  0x10 /* if set - disable xattr support */
-#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */
-#define CIFS_MOUNT_POSIX_PATHS	0x40 /* Negotiate posix pathnames if possible. */
-#define CIFS_MOUNT_UNX_EMUL	0x80 /* Network compat with SFUnix emulation */
-#define CIFS_MOUNT_NO_BRL	0x100 /* No sending byte range locks to srv */
-#define CIFS_MOUNT_CIFS_ACL	0x200 /* send ACL requests to non-POSIX srv */
+#define CIFS_MOUNT_NO_XATTR     0x10  /* if set - disable xattr support       */
+#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames   */
+#define CIFS_MOUNT_POSIX_PATHS  0x40  /* Negotiate posix pathnames if possible*/
+#define CIFS_MOUNT_UNX_EMUL     0x80  /* Network compat with SFUnix emulation */
+#define CIFS_MOUNT_NO_BRL       0x100 /* No sending byte range locks to srv   */
+#define CIFS_MOUNT_CIFS_ACL     0x200 /* send ACL requests to non-POSIX srv   */
+#define CIFS_MOUNT_OVERR_UID    0x400 /* override uid returned from server    */
+#define CIFS_MOUNT_OVERR_GID    0x800 /* override gid returned from server    */
 
 struct cifs_sb_info {
 	struct cifsTconInfo *tcon;	/* primary mount */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d2a8b2941fc2..793c4b95c164 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -74,8 +74,8 @@ cifs_strtoUCS(__le16 * to, const char *from, int len,
 		charlen = codepage->char2uni(from, len, &wchar_to[i]);
 		if (charlen < 1) {
 			cERROR(1,
-			       ("cifs_strtoUCS: char2uni returned %d",
-				charlen));
+			       ("strtoUCS: char2uni of %d returned %d",
+				(int)*from, charlen));
 			/* A question mark */
 			to[i] = cpu_to_le16(0x003f);
 			charlen = 1;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index faba4d69fe91..d38c69b591cf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -100,7 +100,7 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_flags |= MS_NODIRATIME | MS_NOATIME;
 	sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL);
 	cifs_sb = CIFS_SB(sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return -ENOMEM;
 
 	rc = cifs_mount(sb, cifs_sb, data, devname);
@@ -115,10 +115,10 @@ cifs_read_super(struct super_block *sb, void *data,
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	if(experimEnabled != 0)
+	if (experimEnabled != 0)
 		sb->s_export_op = &cifs_export_ops;
 #endif /* EXPERIMENTAL */	
-/*	if(cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
+/*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
 	    sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
 #ifdef CONFIG_CIFS_QUOTA
 	sb->s_qcop = &cifs_quotactl_ops;
@@ -147,8 +147,8 @@ out_no_root:
 		iput(inode);
 
 out_mount_failed:
-	if(cifs_sb) {
-		if(cifs_sb->local_nls)
+	if (cifs_sb) {
+		if (cifs_sb->local_nls)
 			unload_nls(cifs_sb->local_nls);	
 		kfree(cifs_sb);
 	}
@@ -163,7 +163,7 @@ cifs_put_super(struct super_block *sb)
 
 	cFYI(1, ("In cifs_put_super"));
 	cifs_sb = CIFS_SB(sb);
-	if(cifs_sb == NULL) {
+	if (cifs_sb == NULL) {
 		cFYI(1,("Empty cifs superblock info passed to unmount"));
 		return;
 	}
@@ -208,14 +208,14 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
     /* Only need to call the old QFSInfo if failed
     on newer one */
-    if(rc)
-	if(pTcon->ses->capabilities & CAP_NT_SMBS)
+    if (rc)
+	if (pTcon->ses->capabilities & CAP_NT_SMBS)
 		rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
 
 	/* Some old Windows servers also do not support level 103, retry with
 	   older level one if old server failed the previous call or we
 	   bypassed it because we detected that this was an older LANMAN sess */
-	if(rc)
+	if (rc)
 		rc = SMBOldQFSInfo(xid, pTcon, buf);
 	/*     
 	   int f_type;
@@ -301,11 +301,19 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 				if (cifs_sb->tcon->ses->userName)
 					seq_printf(s, ",username=%s",
 					   cifs_sb->tcon->ses->userName);
-				if(cifs_sb->tcon->ses->domainName)
+				if (cifs_sb->tcon->ses->domainName)
 					seq_printf(s, ",domain=%s",
 					   cifs_sb->tcon->ses->domainName);
 			}
 		}
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+			seq_printf(s, ",posixpaths");
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
+		   !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
+			seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
+		   !(cifs_sb->tcon->ses->capabilities & CAP_UNIX))
+			seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
 		seq_printf(s, ",rsize=%d",cifs_sb->rsize);
 		seq_printf(s, ",wsize=%d",cifs_sb->wsize);
 	}
@@ -321,14 +329,14 @@ int cifs_xquota_set(struct super_block * sb, int quota_type, qid_t qid,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 	
-	if(cifs_sb)
+	if (cifs_sb)
 		pTcon = cifs_sb->tcon;
 	else
 		return -EIO;
 
 
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
 		cFYI(1,("set type: 0x%x id: %d",quota_type,qid));		
 	} else {
 		return -EIO;
@@ -346,13 +354,13 @@ int cifs_xquota_get(struct super_block * sb, int quota_type, qid_t qid,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 
-	if(cifs_sb)
+	if (cifs_sb)
 		pTcon = cifs_sb->tcon;
 	else
 		return -EIO;
 
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
                 cFYI(1,("set type: 0x%x id: %d",quota_type,qid));
 	} else {
 		rc = -EIO;
@@ -369,13 +377,13 @@ int cifs_xstate_set(struct super_block * sb, unsigned int flags, int operation)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 
-	if(cifs_sb)
+	if (cifs_sb)
 		pTcon = cifs_sb->tcon;
 	else
 		return -EIO;
 
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
                 cFYI(1,("flags: 0x%x operation: 0x%x",flags,operation));
 	} else {
 		rc = -EIO;
@@ -392,13 +400,13 @@ int cifs_xstate_get(struct super_block * sb, struct fs_quota_stat *qstats)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsTconInfo *pTcon;
 
-	if(cifs_sb) {
+	if (cifs_sb) {
 		pTcon = cifs_sb->tcon;
 	} else {
 		return -EIO;
 	}
 	xid = GetXid();
-	if(pTcon) {
+	if (pTcon) {
 		cFYI(1,("pqstats %p",qstats));		
 	} else {
 		rc = -EIO;
@@ -424,11 +432,11 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 	if (!(flags & MNT_FORCE))
 		return;
 	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return;
 
 	tcon = cifs_sb->tcon;
-	if(tcon == NULL)
+	if (tcon == NULL)
 		return;
 	down(&tcon->tconSem);
 	if (atomic_read(&tcon->useCount) == 1)
@@ -437,7 +445,7 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
-	if(tcon->ses && tcon->ses->server)
+	if (tcon->ses && tcon->ses->server)
 	{
 		cFYI(1,("wake up tasks now - umount begin not complete"));
 		wake_up_all(&tcon->ses->server->request_q);
@@ -529,8 +537,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		/* some applications poll for the file length in this strange
 		   way so we must seek to end on non-oplocked files by
 		   setting the revalidate time to zero */
-		if(file->f_path.dentry->d_inode)		
-			CIFS_I(file->f_path.dentry->d_inode)->time = 0;
+		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
 		retval = cifs_revalidate(file->f_path.dentry);
 		if (retval < 0)
@@ -694,11 +701,8 @@ cifs_init_once(void *inode, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct cifsInodeInfo *cifsi = inode;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		inode_init_once(&cifsi->vfs_inode);
-		INIT_LIST_HEAD(&cifsi->lockList);
-	}
+	inode_init_once(&cifsi->vfs_inode);
+	INIT_LIST_HEAD(&cifsi->lockList);
 }
 
 static int
@@ -724,7 +728,7 @@ cifs_destroy_inodecache(void)
 static int
 cifs_init_request_bufs(void)
 {
-	if(CIFSMaxBufSize < 8192) {
+	if (CIFSMaxBufSize < 8192) {
 	/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
 	Unicode path name has to fit in any SMB/CIFS path based frames */
 		CIFSMaxBufSize = 8192;
@@ -741,7 +745,7 @@ cifs_init_request_bufs(void)
 	if (cifs_req_cachep == NULL)
 		return -ENOMEM;
 
-	if(cifs_min_rcv < 1)
+	if (cifs_min_rcv < 1)
 		cifs_min_rcv = 1;
 	else if (cifs_min_rcv > 64) {
 		cifs_min_rcv = 64;
@@ -751,7 +755,7 @@ cifs_init_request_bufs(void)
 	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
 						  cifs_req_cachep);
 
-	if(cifs_req_poolp == NULL) {
+	if (cifs_req_poolp == NULL) {
 		kmem_cache_destroy(cifs_req_cachep);
 		return -ENOMEM;
 	}
@@ -772,7 +776,7 @@ cifs_init_request_bufs(void)
 		return -ENOMEM;              
 	}
 
-	if(cifs_min_small < 2)
+	if (cifs_min_small < 2)
 		cifs_min_small = 2;
 	else if (cifs_min_small > 256) {
 		cifs_min_small = 256;
@@ -782,7 +786,7 @@ cifs_init_request_bufs(void)
 	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
 						     cifs_sm_req_cachep);
 
-	if(cifs_sm_req_poolp == NULL) {
+	if (cifs_sm_req_poolp == NULL) {
 		mempool_destroy(cifs_req_poolp);
 		kmem_cache_destroy(cifs_req_cachep);
 		kmem_cache_destroy(cifs_sm_req_cachep);
@@ -812,7 +816,7 @@ cifs_init_mids(void)
 
 	/* 3 is a reasonable minimum number of simultaneous operations */
 	cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
-	if(cifs_mid_poolp == NULL) {
+	if (cifs_mid_poolp == NULL) {
 		kmem_cache_destroy(cifs_mid_cachep);
 		return -ENOMEM;
 	}
@@ -850,14 +854,14 @@ static int cifs_oplock_thread(void * dummyarg)
 			continue;
 		
 		spin_lock(&GlobalMid_Lock);
-		if(list_empty(&GlobalOplock_Q)) {
+		if (list_empty(&GlobalOplock_Q)) {
 			spin_unlock(&GlobalMid_Lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(39*HZ);
 		} else {
 			oplock_item = list_entry(GlobalOplock_Q.next, 
 				struct oplock_q_entry, qhead);
-			if(oplock_item) {
+			if (oplock_item) {
 				cFYI(1,("found oplock item to write out")); 
 				pTcon = oplock_item->tcon;
 				inode = oplock_item->pinode;
@@ -871,7 +875,7 @@ static int cifs_oplock_thread(void * dummyarg)
 				/* mutex_lock(&inode->i_mutex);*/
 				if (S_ISREG(inode->i_mode)) {
 					rc = filemap_fdatawrite(inode->i_mapping);
-					if(CIFS_I(inode)->clientCanCacheRead == 0) {
+					if (CIFS_I(inode)->clientCanCacheRead == 0) {
 						filemap_fdatawait(inode->i_mapping);
 						invalidate_remote_inode(inode);
 					}
@@ -888,7 +892,7 @@ static int cifs_oplock_thread(void * dummyarg)
 				not bother sending an oplock release if session 
 				to server still is disconnected since oplock 
 				already released by the server in that case */
-				if(pTcon->tidStatus != CifsNeedReconnect) {
+				if (pTcon->tidStatus != CifsNeedReconnect) {
 				    rc = CIFSSMBLock(0, pTcon, netfid,
 					    0 /* len */ , 0 /* offset */, 0, 
 					    0, LOCKING_ANDX_OPLOCK_RELEASE,
@@ -922,7 +926,7 @@ static int cifs_dnotify_thread(void * dummyarg)
 		list_for_each(tmp, &GlobalSMBSessionList) {
 			ses = list_entry(tmp, struct cifsSesInfo, 
 				cifsSessionList);
-			if(ses && ses->server && 
+			if (ses && ses->server && 
 			     atomic_read(&ses->server->inFlight))
 				wake_up_all(&ses->server->response_q);
 		}
@@ -971,10 +975,10 @@ init_cifs(void)
 	rwlock_init(&GlobalSMBSeslock);
 	spin_lock_init(&GlobalMid_Lock);
 
-	if(cifs_max_pending < 2) {
+	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
 		cFYI(1,("cifs_max_pending set to min of 2"));
-	} else if(cifs_max_pending > 256) {
+	} else if (cifs_max_pending > 256) {
 		cifs_max_pending = 256;
 		cFYI(1,("cifs_max_pending set to max of 256"));
 	}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2c2c384894d8..c235d32ad4a8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern int cifs_ioctl (struct inode * inode, struct file * filep,
 		       unsigned int command, unsigned long arg);
-#define CIFS_VERSION   "1.48"
+#define CIFS_VERSION   "1.49"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e4de8eba4780..23655de2f4a4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -311,7 +311,7 @@ struct cifsFileInfo {
 	/* lock scope id (0 if none) */
 	struct file * pfile; /* needed for writepage */
 	struct inode * pInode; /* needed for oplock break */
-	struct semaphore lock_sem;
+	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
 	unsigned closePend:1;	/* file is marked to close */
 	unsigned invalidHandle:1;  /* file closed via session abend */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 4d8948e8762c..d619ca7d1416 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1388,7 +1388,7 @@ struct smb_t2_rsp {
 #define SMB_SET_POSIX_LOCK              0x208
 #define SMB_POSIX_OPEN                  0x209
 #define SMB_POSIX_UNLINK                0x20a
-#define SMB_SET_FILE_UNIX_INFO2
+#define SMB_SET_FILE_UNIX_INFO2         0x20b
 #define SMB_SET_FILE_BASIC_INFO2        0x3ec
 #define SMB_SET_FILE_RENAME_INFORMATION 0x3f2 /* BB check if qpathinfo too */
 #define SMB_FILE_ALL_INFO2              0x3fa
@@ -2109,22 +2109,40 @@ struct cifs_posix_acl { /* access conrol list  (ACL) */
 
 /* end of POSIX ACL definitions */
 
+/* POSIX Open Flags */
+#define SMB_O_RDONLY 	 0x1
+#define SMB_O_WRONLY 	0x2
+#define SMB_O_RDWR 	0x4
+#define SMB_O_CREAT 	0x10
+#define SMB_O_EXCL 	0x20
+#define SMB_O_TRUNC 	0x40
+#define SMB_O_APPEND 	0x80
+#define SMB_O_SYNC 	0x100
+#define SMB_O_DIRECTORY 0x200
+#define SMB_O_NOFOLLOW 	0x400
+#define SMB_O_DIRECT 	0x800
+
 typedef struct {
-	__u32 OpenFlags; /* same as NT CreateX */
-	__u32 PosixOpenFlags;
-	__u32 Mode;
-	__u16 Level; /* reply level requested (see QPathInfo levels) */
-	__u16 Pad;  /* reserved - MBZ */
+	__le32 OpenFlags; /* same as NT CreateX */
+	__le32 PosixOpenFlags;
+	__le64 Permissions;
+	__le16 Level; /* reply level requested (see QPathInfo levels) */
 } __attribute__((packed)) OPEN_PSX_REQ; /* level 0x209 SetPathInfo data */
 
 typedef struct {
-	/* reply varies based on requested level */
+	__le16 OplockFlags;
+	__u16 Fid;
+	__le32 CreateAction;
+	__le16 ReturnedLevel;
+	__le16 Pad;
+	/* struct following varies based on requested level */
 } __attribute__((packed)) OPEN_PSX_RSP; /* level 0x209 SetPathInfo data */
 
 
 struct file_internal_info {
 	__u64  UniqueId; /* inode number */
 } __attribute__((packed));      /* level 0x3ee */
+
 struct file_mode_info {
 	__le32	Mode;
 } __attribute__((packed));      /* level 0x3f8 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 32eb1acab630..5d163e2b6143 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifsproto.h
  *
- *   Copyright (c) International Business Machines  Corp., 2002,2006
+ *   Copyright (c) International Business Machines  Corp., 2002,2007
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -244,6 +244,11 @@ extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
 			const int access_flags, const int omode,
 			__u16 * netfid, int *pOplock, FILE_ALL_INFO *,
 			const struct nls_table *nls_codepage, int remap);
+extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, 
+			u32 posix_flags, __u64 mode, __u16 * netfid,
+			FILE_UNIX_BASIC_INFO *pRetData,
+			__u32 *pOplock, const char *name,
+			const struct nls_table *nls_codepage, int remap);			
 extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
 			const int smb_file_id);
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 48fc0c2ab0e5..14de58fa1437 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifssmb.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2006
+ *   Copyright (C) International Business Machines  Corp., 2002,2007
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   Contains the routines for constructing the SMB PDUs themselves
@@ -24,8 +24,8 @@
  /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c   */
  /* These are mostly routines that operate on a pathname, or on a tree id     */
  /* (mounted volume), but there are eight handle based routines which must be */
- /* treated slightly different for reconnection purposes since we never want  */
- /* to reuse a stale file handle and the caller knows the file handle */
+ /* treated slightly differently for reconnection purposes since we never     */
+ /* want to reuse a stale file handle and only the caller knows the file info */
 
 #include <linux/fs.h>
 #include <linux/kernel.h>
@@ -913,6 +913,130 @@ MkDirRetry:
 	return rc;
 }
 
+int
+CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
+		__u64 mode, __u16 * netfid, FILE_UNIX_BASIC_INFO *pRetData,
+		__u32 *pOplock, const char *name, 
+		const struct nls_table *nls_codepage, int remap)
+{
+	TRANSACTION2_SPI_REQ *pSMB = NULL;
+	TRANSACTION2_SPI_RSP *pSMBr = NULL;
+	int name_len;
+	int rc = 0;
+	int bytes_returned = 0;
+	char *data_offset;
+	__u16 params, param_offset, offset, byte_count, count;
+	OPEN_PSX_REQ * pdata;
+	OPEN_PSX_RSP * psx_rsp;
+
+	cFYI(1, ("In POSIX Create"));
+PsxCreat:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+		name_len =
+		    cifsConvertToUCS((__le16 *) pSMB->FileName, name,
+				     PATH_MAX, nls_codepage, remap);
+		name_len++;	/* trailing null */
+		name_len *= 2;
+	} else {	/* BB improve the check for buffer overruns BB */
+		name_len = strnlen(name, PATH_MAX);
+		name_len++;	/* trailing null */
+		strncpy(pSMB->FileName, name, name_len);
+	}
+
+	params = 6 + name_len;
+	count = sizeof(OPEN_PSX_REQ);
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	pSMB->MaxDataCount = cpu_to_le16(1000);	/* large enough */
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_spi_req,
+                                     InformationLevel) - 4;
+	offset = param_offset + params;
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+	pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
+	pdata->Level = SMB_QUERY_FILE_UNIX_BASIC;
+	pdata->Permissions = cpu_to_le64(mode);
+	pdata->PosixOpenFlags = cpu_to_le32(posix_flags); 
+	pdata->OpenFlags =  cpu_to_le32(*pOplock);
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_PATH_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
+	pSMB->Reserved4 = 0;
+	pSMB->hdr.smb_buf_length += byte_count; 
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Posix create returned %d", rc));
+		goto psx_create_err;
+	}
+
+	cFYI(1,("copying inode info"));
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
+		rc = -EIO;	/* bad smb */
+		goto psx_create_err;
+	}
+
+	/* copy return information to pRetData */
+	psx_rsp = (OPEN_PSX_RSP *)((char *) &pSMBr->hdr.Protocol 
+			+ le16_to_cpu(pSMBr->t2.DataOffset));
+		
+	*pOplock = le16_to_cpu(psx_rsp->OplockFlags);
+	if(netfid)
+		*netfid = psx_rsp->Fid;   /* cifs fid stays in le */
+	/* Let caller know file was created so we can set the mode. */
+	/* Do we care about the CreateAction in any other cases? */
+	if(cpu_to_le32(FILE_CREATE) == psx_rsp->CreateAction)
+		*pOplock |= CIFS_CREATE_ACTION;
+	/* check to make sure response data is there */
+	if(psx_rsp->ReturnedLevel != SMB_QUERY_FILE_UNIX_BASIC) {
+		pRetData->Type = -1; /* unknown */
+#ifdef CONFIG_CIFS_DEBUG2
+		cFYI(1,("unknown type"));
+#endif
+	} else {
+		if(pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) 
+					+ sizeof(FILE_UNIX_BASIC_INFO)) {
+			cERROR(1,("Open response data too small"));
+			pRetData->Type = -1;
+			goto psx_create_err;
+		}
+		memcpy((char *) pRetData, 
+			(char *)psx_rsp + sizeof(OPEN_PSX_RSP),
+			sizeof (FILE_UNIX_BASIC_INFO));
+	}
+			
+
+psx_create_err:
+	cifs_buf_release(pSMB);
+
+	cifs_stats_inc(&tcon->num_mkdirs);
+
+	if (rc == -EAGAIN)
+		goto PsxCreat;
+
+	return rc;	
+}
+
 static __u16 convert_disposition(int disposition)
 {
 	__u16 ofun = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 20ba7dcc9959..216fb625843f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -30,6 +30,7 @@
 #include <linux/mempool.h>
 #include <linux/delay.h>
 #include <linux/completion.h>
+#include <linux/kthread.h>
 #include <linux/pagevec.h>
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
@@ -74,6 +75,8 @@ struct smb_vol {
 	unsigned retry:1;
 	unsigned intr:1;
 	unsigned setuids:1;
+	unsigned override_uid:1;
+	unsigned override_gid:1;
 	unsigned noperm:1;
 	unsigned no_psx_acl:1; /* set if posix acl support should be disabled */
 	unsigned cifs_acl:1;
@@ -120,7 +123,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	struct mid_q_entry * mid_entry;
 	
 	spin_lock(&GlobalMid_Lock);
-	if(server->tcpStatus == CifsExiting) {
+	if( kthread_should_stop() ) {
 		/* the demux thread will exit normally 
 		next time through the loop */
 		spin_unlock(&GlobalMid_Lock);
@@ -182,7 +185,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	up(&server->tcpSem); 
 
-	while ((server->tcpStatus != CifsExiting) && (server->tcpStatus != CifsGood))
+	while ( (!kthread_should_stop()) && (server->tcpStatus != CifsGood))
 	{
 		try_to_freeze();
 		if(server->protocolType == IPV6) {
@@ -199,7 +202,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
 			spin_lock(&GlobalMid_Lock);
-			if(server->tcpStatus != CifsExiting)
+			if( !kthread_should_stop() )
 				server->tcpStatus = CifsGood;
 			server->sequence_number = 0;
 			spin_unlock(&GlobalMid_Lock);			
@@ -345,7 +348,6 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	int isMultiRsp;
 	int reconnect;
 
-	daemonize("cifsd");
 	allow_signal(SIGKILL);
 	current->flags |= PF_MEMALLOC;
 	server->tsk = current;	/* save process info to wake at shutdown */
@@ -361,7 +363,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 			GFP_KERNEL);
 	}
 
-	while (server->tcpStatus != CifsExiting) {
+	while (!kthread_should_stop()) {
 		if (try_to_freeze())
 			continue;
 		if (bigbuf == NULL) {
@@ -400,7 +402,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		    kernel_recvmsg(csocket, &smb_msg,
 				 &iov, 1, 4, 0 /* BB see socket.h flags */);
 
-		if (server->tcpStatus == CifsExiting) {
+		if ( kthread_should_stop() ) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
 			cFYI(1, ("Reconnect after server stopped responding"));
@@ -524,7 +526,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		     total_read += length) {
 			length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 						pdu_length - total_read, 0);
-			if((server->tcpStatus == CifsExiting) ||
+			if( kthread_should_stop() ||
 			    (length == -EINTR)) {
 				/* then will exit */
 				reconnect = 2;
@@ -757,7 +759,6 @@ multi_t2_fnd:
 			GFP_KERNEL);
 	}
 	
-	complete_and_exit(&cifsd_complete, 0);
 	return 0;
 }
 
@@ -973,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			}
 			if ((temp_len = strnlen(value, 300)) < 300) {
 				vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
-				if(vol->UNC == NULL)
+				if (vol->UNC == NULL)
 					return 1;
 				strcpy(vol->UNC,value);
 				if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1010,12 +1011,12 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                                 return 1;       /* needs_arg; */
                         }
                         if ((temp_len = strnlen(value, 1024)) < 1024) {
-				if(value[0] != '/')
+				if (value[0] != '/')
 					temp_len++;  /* missing leading slash */
                                 vol->prepath = kmalloc(temp_len+1,GFP_KERNEL);
-                                if(vol->prepath == NULL)
+                                if (vol->prepath == NULL)
                                         return 1;
-				if(value[0] != '/') {
+				if (value[0] != '/') {
 					vol->prepath[0] = '/';
 	                                strcpy(vol->prepath+1,value);
 				} else
@@ -1031,7 +1032,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				return 1;	/* needs_arg; */
 			}
 			if (strnlen(value, 65) < 65) {
-				if(strnicmp(value,"default",7))
+				if (strnicmp(value,"default",7))
 					vol->iocharset = value;
 				/* if iocharset not set load_nls_default used by caller */
 				cFYI(1, ("iocharset set to %s",value));
@@ -1043,11 +1044,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			if (value && *value) {
 				vol->linux_uid =
 					simple_strtoul(value, &value, 0);
+				vol->override_uid = 1;
 			}
 		} else if (strnicmp(data, "gid", 3) == 0) {
 			if (value && *value) {
 				vol->linux_gid =
 					simple_strtoul(value, &value, 0);
+				vol->override_gid = 1;
 			}
 		} else if (strnicmp(data, "file_mode", 4) == 0) {
 			if (value && *value) {
@@ -1102,7 +1105,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				}
 				/* The string has 16th byte zero still from
 				set at top of the function  */
-				if((i==15) && (value[i] != 0))
+				if ((i==15) && (value[i] != 0))
 					printk(KERN_WARNING "CIFS: netbiosname longer than 15 truncated.\n");
 			}
 		} else if (strnicmp(data, "servern", 7) == 0) {
@@ -1126,7 +1129,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				}
 				/* The string has 16th byte zero still from
 				   set at top of the function  */
-				if((i==15) && (value[i] != 0))
+				if ((i==15) && (value[i] != 0))
 					printk(KERN_WARNING "CIFS: server netbiosname longer than 15 truncated.\n");
 			}
 		} else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1233,13 +1236,13 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			printk(KERN_WARNING "CIFS: Unknown mount option %s\n",data);
 	}
 	if (vol->UNC == NULL) {
-		if(devname == NULL) {
+		if (devname == NULL) {
 			printk(KERN_WARNING "CIFS: Missing UNC name for mount target\n");
 			return 1;
 		}
 		if ((temp_len = strnlen(devname, 300)) < 300) {
 			vol->UNC = kmalloc(temp_len+1,GFP_KERNEL);
-			if(vol->UNC == NULL)
+			if (vol->UNC == NULL)
 				return 1;
 			strcpy(vol->UNC,devname);
 			if (strncmp(vol->UNC, "//", 2) == 0) {
@@ -1663,7 +1666,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo * tcon,
 				CIFS_SB(sb)->mnt_cifs_flags |= 
 					CIFS_MOUNT_POSIX_PATHS;
 		}
-			
+	
+		/* We might be setting the path sep back to a different
+		form if we are reconnecting and the server switched its
+		posix path capability for this share */	
+		if(sb && (CIFS_SB(sb)->prepathlen > 0))
+			CIFS_SB(sb)->prepath[0] = CIFS_DIR_SEP(CIFS_SB(sb));
+	
 		cFYI(1,("Negotiate caps 0x%x",(int)cap));
 #ifdef CONFIG_CIFS_DEBUG2
 		if(cap & CIFS_UNIX_FCNTL_CAP)
@@ -1712,12 +1721,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		return -EINVAL;
 	}
 
-	if (volume_info.username) {
+	if (volume_info.nullauth) {
+		cFYI(1,("null user"));
+		volume_info.username = NULL;
+	} else if (volume_info.username) {
 		/* BB fixme parse for domain name here */
 		cFYI(1, ("Username: %s ", volume_info.username));
-
-	} else if (volume_info.nullauth) {
-		cFYI(1,("null user"));
 	} else {
 		cifserror("No username specified");
         /* In userspace mount helper we can get user name from alternate
@@ -1791,11 +1800,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
 			NULL /* no ipv6 addr */,
 			volume_info.username, &srvTcp);
-	else if(address_type == AF_INET6)
+	else if(address_type == AF_INET6) {
+		cFYI(1,("looking for ipv6 address"));
 		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
 			&sin_server6.sin6_addr,
 			volume_info.username, &srvTcp);
-	else {
+	} else {
 		kfree(volume_info.UNC);
 		kfree(volume_info.password);
 		kfree(volume_info.prepath);
@@ -1807,17 +1817,23 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (srvTcp) {
 		cFYI(1, ("Existing tcp session with server found"));                
 	} else {	/* create socket */
-		if(volume_info.port)
+		if (volume_info.port)
 			sin_server.sin_port = htons(volume_info.port);
 		else
 			sin_server.sin_port = 0;
-		rc = ipv4_connect(&sin_server,&csocket,
+		if (address_type == AF_INET6) {
+			cFYI(1,("attempting ipv6 connect"));
+			/* BB should we allow ipv6 on port 139? */
+			/* other OS never observed in Wild doing 139 with v6 */
+			rc = ipv6_connect(&sin_server6,&csocket);
+		} else 
+			rc = ipv4_connect(&sin_server,&csocket,
 				  volume_info.source_rfc1001_name,
 				  volume_info.target_rfc1001_name);
 		if (rc < 0) {
 			cERROR(1,
-			       ("Error connecting to IPv4 socket. Aborting operation"));
-			if(csocket != NULL)
+			       ("Error connecting to IPv4 socket. Aborting operation"));			       
+			if (csocket != NULL)
 				sock_release(csocket);
 			kfree(volume_info.UNC);
 			kfree(volume_info.password);
@@ -1850,10 +1866,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			so no need to spinlock this init of tcpStatus */
 			srvTcp->tcpStatus = CifsNew;
 			init_MUTEX(&srvTcp->tcpSem);
-			rc = (int)kernel_thread((void *)(void *)cifs_demultiplex_thread, srvTcp,
-				      CLONE_FS | CLONE_FILES | CLONE_VM);
-			if(rc < 0) {
-				rc = -ENOMEM;
+			srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
+			if ( IS_ERR(srvTcp->tsk) ) {
+				rc = PTR_ERR(srvTcp->tsk);
+				cERROR(1,("error %d create cifsd thread", rc));
+				srvTcp->tsk = NULL;
 				sock_release(csocket);
 				kfree(volume_info.UNC);
 				kfree(volume_info.password);
@@ -1896,7 +1913,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				int len = strlen(volume_info.domainname);
 				pSesInfo->domainName = 
 					kmalloc(len + 1, GFP_KERNEL);
-				if(pSesInfo->domainName)
+				if (pSesInfo->domainName)
 					strcpy(pSesInfo->domainName,
 						volume_info.domainname);
 			}
@@ -1906,7 +1923,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			/* BB FIXME need to pass vol->secFlgs BB */
 			rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
 			up(&pSesInfo->sesSem);
-			if(!rc)
+			if (!rc)
 				atomic_inc(&srvTcp->socketUseCount);
 		} else
 			kfree(volume_info.password);
@@ -1914,7 +1931,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
     
 	/* search for existing tcon to this server share */
 	if (!rc) {
-		if(volume_info.rsize > CIFSMaxBufSize) {
+		if (volume_info.rsize > CIFSMaxBufSize) {
 			cERROR(1,("rsize %d too large, using MaxBufSize",
 				volume_info.rsize));
 			cifs_sb->rsize = CIFSMaxBufSize;
@@ -1923,11 +1940,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		else /* default */
 			cifs_sb->rsize = CIFSMaxBufSize;
 
-		if(volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
+		if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
 			cERROR(1,("wsize %d too large using 4096 instead",
 				  volume_info.wsize));
 			cifs_sb->wsize = 4096;
-		} else if(volume_info.wsize)
+		} else if (volume_info.wsize)
 			cifs_sb->wsize = volume_info.wsize;
 		else
 			cifs_sb->wsize = 
@@ -1940,14 +1957,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			   conjunction with 52K kvec constraint on arch with 4K
 			   page size  */
 
-		if(cifs_sb->rsize < 2048) {
+		if (cifs_sb->rsize < 2048) {
 			cifs_sb->rsize = 2048; 
 			/* Windows ME may prefer this */
 			cFYI(1,("readsize set to minimum 2048"));
 		}
 		/* calculate prepath */
 		cifs_sb->prepath = volume_info.prepath;
-		if(cifs_sb->prepath) {
+		if (cifs_sb->prepath) {
 			cifs_sb->prepathlen = strlen(cifs_sb->prepath);
 			cifs_sb->prepath[0] = CIFS_DIR_SEP(cifs_sb);
 			volume_info.prepath = NULL;
@@ -1960,24 +1977,27 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		cFYI(1,("file mode: 0x%x  dir mode: 0x%x",
 			cifs_sb->mnt_file_mode,cifs_sb->mnt_dir_mode));
 
-		if(volume_info.noperm)
+		if (volume_info.noperm)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
-		if(volume_info.setuids)
+		if (volume_info.setuids)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
-		if(volume_info.server_ino)
+		if (volume_info.server_ino)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
-		if(volume_info.remap)
+		if (volume_info.remap)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
-		if(volume_info.no_xattr)
+		if (volume_info.no_xattr)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
-		if(volume_info.sfu_emul)
+		if (volume_info.sfu_emul)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
-		if(volume_info.nobrl)
+		if (volume_info.nobrl)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
-		if(volume_info.cifs_acl)
+		if (volume_info.cifs_acl)
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-
-		if(volume_info.direct_io) {
+		if (volume_info.override_uid)
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+		if (volume_info.override_gid)
+			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+		if (volume_info.direct_io) {
 			cFYI(1,("mounting share using direct i/o"));
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 		}
@@ -2030,7 +2050,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			}
 		}
 	}
-	if(pSesInfo) {
+	if (pSesInfo) {
 		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
 			sb->s_maxbytes = (u64) 1 << 63;
 		} else
@@ -2044,13 +2064,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (rc) {
 		/* if session setup failed, use count is zero but
 		we still need to free cifsd thread */
-		if(atomic_read(&srvTcp->socketUseCount) == 0) {
+		if (atomic_read(&srvTcp->socketUseCount) == 0) {
 			spin_lock(&GlobalMid_Lock);
 			srvTcp->tcpStatus = CifsExiting;
 			spin_unlock(&GlobalMid_Lock);
-			if(srvTcp->tsk) {
+			if (srvTcp->tsk) {
 				send_sig(SIGKILL,srvTcp->tsk,1);
-				wait_for_completion(&cifsd_complete);
+				kthread_stop(srvTcp->tsk);
 			}
 		}
 		 /* If find_unc succeeded then rc == 0 so we can not end */
@@ -2063,10 +2083,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 					int temp_rc;
 					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
 					/* if the socketUseCount is now zero */
-					if((temp_rc == -ESHUTDOWN) &&
-					   (pSesInfo->server->tsk)) {
+					if ((temp_rc == -ESHUTDOWN) &&
+					   (pSesInfo->server) && (pSesInfo->server->tsk)) {
 						send_sig(SIGKILL,pSesInfo->server->tsk,1);
-						wait_for_completion(&cifsd_complete);
+						kthread_stop(pSesInfo->server->tsk);
 					}
 				} else
 					cFYI(1, ("No session or bad tcon"));
@@ -2127,7 +2147,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	__u16 count;
 
 	cFYI(1, ("In sesssetup"));
-	if(ses == NULL)
+	if (ses == NULL)
 		return -EINVAL;
 	user = ses->userName;
 	domain = ses->domainName;
@@ -2182,7 +2202,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			*bcc_ptr = 0;
 			bcc_ptr++;
 		}
-		if(user == NULL)
+		if (user == NULL)
 			bytes_returned = 0; /* skip null user */
 	        else
 			bytes_returned =
@@ -2216,7 +2236,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 		bcc_ptr += 2 * bytes_returned;
 		bcc_ptr += 2;
 	} else {
-		if(user != NULL) {                
+		if (user != NULL) {                
 		    strncpy(bcc_ptr, user, 200);
 		    bcc_ptr += strnlen(user, 200);
 		}
@@ -3316,7 +3336,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 				cFYI(1,("Waking up socket by sending it signal"));
 				if(cifsd_task) {
 					send_sig(SIGKILL,cifsd_task,1);
-					wait_for_completion(&cifsd_complete);
+					kthread_stop(cifsd_task);
 				}
 				rc = 0;
 			} /* else - we have an smb session
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3fad638d26d3..e5210519ac4b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -274,7 +274,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			pCifsFile->invalidHandle = FALSE;
 			pCifsFile->closePend     = FALSE;
 			init_MUTEX(&pCifsFile->fh_sem);
-			init_MUTEX(&pCifsFile->lock_sem);
+			mutex_init(&pCifsFile->lock_mutex);
 			INIT_LIST_HEAD(&pCifsFile->llist);
 			atomic_set(&pCifsFile->wrtPending,0);
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 2d3275bedb55..94d5b49049df 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -27,7 +27,6 @@
 #include <linux/fcntl.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
-#include <linux/smp_lock.h>
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/delay.h>
@@ -48,7 +47,7 @@ static inline struct cifsFileInfo *cifs_init_private(
 	private_data->netfid = netfid;
 	private_data->pid = current->tgid;	
 	init_MUTEX(&private_data->fh_sem);
-	init_MUTEX(&private_data->lock_sem);
+	mutex_init(&private_data->lock_mutex);
 	INIT_LIST_HEAD(&private_data->llist);
 	private_data->pfile = file; /* needed for writepage */
 	private_data->pInode = inode;
@@ -338,8 +337,7 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 	return rc;
 }
 
-static int cifs_reopen_file(struct inode *inode, struct file *file, 
-	int can_flush)
+static int cifs_reopen_file(struct file *file, int can_flush)
 {
 	int rc = -EACCES;
 	int xid, oplock;
@@ -347,13 +345,12 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 	struct cifsTconInfo *pTcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
+	struct inode * inode;
 	char *full_path = NULL;
 	int desiredAccess;
 	int disposition = FILE_OPEN;
 	__u16 netfid;
 
-	if (inode == NULL)
-		return -EBADF;
 	if (file->private_data) {
 		pCifsFile = (struct cifsFileInfo *)file->private_data;
 	} else
@@ -368,25 +365,37 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 	}
 
 	if (file->f_path.dentry == NULL) {
-		up(&pCifsFile->fh_sem);
-		cFYI(1, ("failed file reopen, no valid name if dentry freed"));
-		FreeXid(xid);
-		return -EBADF;
+		cERROR(1, ("no valid name if dentry freed"));
+		dump_stack();
+		rc = -EBADF;
+		goto reopen_error_exit;
 	}
+
+	inode = file->f_path.dentry->d_inode;
+	if(inode == NULL) {
+		cERROR(1, ("inode not valid"));
+		dump_stack();
+		rc = -EBADF;
+		goto reopen_error_exit;
+	}
+		
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
+
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
    to get called and if the server was down that means we end up here,
    and we can never tell if the caller already has the rename_sem */
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
+		rc = -ENOMEM;
+reopen_error_exit:
 		up(&pCifsFile->fh_sem);
 		FreeXid(xid);
-		return -ENOMEM;
+		return rc;
 	}
 
-	cFYI(1, (" inode = 0x%p file flags are 0x%x for %s",
+	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
 		 inode, file->f_flags,full_path));
 	desiredAccess = cifs_convert_flags(file->f_flags);
 
@@ -401,13 +410,6 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 	   and server version of file size can be stale. If we knew for sure
 	   that inode was not dirty locally we could do this */
 
-/*	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-	if (buf == 0) {
-		up(&pCifsFile->fh_sem);
-		kfree(full_path);
-		FreeXid(xid);
-		return -ENOMEM;
-	} */
 	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 
@@ -508,12 +510,12 @@ int cifs_close(struct inode *inode, struct file *file)
 
 		/* Delete any outstanding lock records.
 		   We'll lose them when the file is closed anyway. */
-		down(&pSMBFile->lock_sem);
+		mutex_lock(&pSMBFile->lock_mutex);
 		list_for_each_entry_safe(li, tmp, &pSMBFile->llist, llist) {
 			list_del(&li->llist);
 			kfree(li);
 		}
-		up(&pSMBFile->lock_sem);
+		mutex_unlock(&pSMBFile->lock_mutex);
 
 		write_lock(&GlobalSMBSeslock);
 		list_del(&pSMBFile->flist);
@@ -598,9 +600,9 @@ static int store_file_lock(struct cifsFileInfo *fid, __u64 len,
 	li->offset = offset;
 	li->length = len;
 	li->type = lockType;
-	down(&fid->lock_sem);
+	mutex_lock(&fid->lock_mutex);
 	list_add(&li->llist, &fid->llist);
-	up(&fid->lock_sem);
+	mutex_unlock(&fid->lock_mutex);
 	return 0;
 }
 
@@ -757,7 +759,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			struct cifsLockInfo *li, *tmp;
 
 			rc = 0;
-			down(&fid->lock_sem);
+			mutex_lock(&fid->lock_mutex);
 			list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
 				if (pfLock->fl_start <= li->offset &&
 						length >= li->length) {
@@ -771,7 +773,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 					kfree(li);
 				}
 			}
-			up(&fid->lock_sem);
+			mutex_unlock(&fid->lock_mutex);
 		}
 	}
 
@@ -792,12 +794,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	int xid, long_op;
 	struct cifsFileInfo *open_file;
 
-	if (file->f_path.dentry == NULL)
-		return -EBADF;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EBADF;
 
 	pTcon = cifs_sb->tcon;
 
@@ -807,14 +804,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	if (file->private_data == NULL)
 		return -EBADF;
-	else
-		open_file = (struct cifsFileInfo *) file->private_data;
+	open_file = (struct cifsFileInfo *) file->private_data;
 	
 	xid = GetXid();
-	if (file->f_path.dentry->d_inode == NULL) {
-		FreeXid(xid);
-		return -EBADF;
-	}
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
 		long_op = 2; /* writes past end of file can take a long time */
@@ -841,17 +833,11 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 					return -EBADF;
 			}
 			if (open_file->invalidHandle) {
-				if ((file->f_path.dentry == NULL) ||
-				    (file->f_path.dentry->d_inode == NULL)) {
-					FreeXid(xid);
-					return total_written;
-				}
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to server
 				   now */
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, FALSE);
+				rc = cifs_reopen_file(file, FALSE);
 				if (rc != 0)
 					break;
 			}
@@ -908,12 +894,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	int xid, long_op;
 	struct cifsFileInfo *open_file;
 
-	if (file->f_path.dentry == NULL)
-		return -EBADF;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if (cifs_sb == NULL)
-		return -EBADF;
 
 	pTcon = cifs_sb->tcon;
 
@@ -922,14 +903,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	if (file->private_data == NULL)
 		return -EBADF;
-	else
-		open_file = (struct cifsFileInfo *)file->private_data;
+	open_file = (struct cifsFileInfo *)file->private_data;
 	
 	xid = GetXid();
-	if (file->f_path.dentry->d_inode == NULL) {
-		FreeXid(xid);
-		return -EBADF;
-	}
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
 		long_op = 2; /* writes past end of file can take a long time */
@@ -957,17 +933,11 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 					return -EBADF;
 			}
 			if (open_file->invalidHandle) {
-				if ((file->f_path.dentry == NULL) ||
-				   (file->f_path.dentry->d_inode == NULL)) {
-					FreeXid(xid);
-					return total_written;
-				}
 				/* we could deadlock if we called
 				   filemap_fdatawait from here so tell
 				   reopen_file not to flush data to 
 				   server now */
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, FALSE);
+				rc = cifs_reopen_file(file, FALSE);
 				if (rc != 0)
 					break;
 			}
@@ -1056,8 +1026,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 			read_unlock(&GlobalSMBSeslock);
 			if((open_file->invalidHandle) && 
 			   (!open_file->closePend) /* BB fixme -since the second clause can not be true remove it BB */) {
-				rc = cifs_reopen_file(&cifs_inode->vfs_inode, 
-						      open_file->pfile, FALSE);
+				rc = cifs_reopen_file(open_file->pfile, FALSE);
 				/* if it fails, try another handle - might be */
 				/* dangerous to hold up writepages with retry */
 				if(rc) {
@@ -1404,32 +1373,6 @@ static int cifs_commit_write(struct file *file, struct page *page,
 	spin_lock(&inode->i_lock);
 	if (position > inode->i_size) {
 		i_size_write(inode, position);
-		/* if (file->private_data == NULL) {
-			rc = -EBADF;
-		} else {
-			open_file = (struct cifsFileInfo *)file->private_data;
-			cifs_sb = CIFS_SB(inode->i_sb);
-			rc = -EAGAIN;
-			while (rc == -EAGAIN) {
-				if ((open_file->invalidHandle) && 
-				    (!open_file->closePend)) {
-					rc = cifs_reopen_file(
-						file->f_path.dentry->d_inode, file);
-					if (rc != 0)
-						break;
-				}
-				if (!open_file->closePend) {
-					rc = CIFSSMBSetFileSize(xid,
-						cifs_sb->tcon, position,
-						open_file->netfid,
-						open_file->pid, FALSE);
-				} else {
-					rc = -EBADF;
-					break;
-				}
-			}
-			cFYI(1, (" SetEOF (commit write) rc = %d", rc));
-		} */
 	}
 	spin_unlock(&inode->i_lock);
 	if (!PageUptodate(page)) {
@@ -1573,8 +1516,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 			int buf_type = CIFS_NO_BUFFER;
 			if ((open_file->invalidHandle) && 
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, TRUE);
+				rc = cifs_reopen_file(file, TRUE);
 				if (rc != 0)
 					break;
 			}
@@ -1660,8 +1602,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) && 
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, TRUE);
+				rc = cifs_reopen_file(file, TRUE);
 				if (rc != 0)
 					break;
 			}
@@ -1817,8 +1758,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) && 
 			    (!open_file->closePend)) {
-				rc = cifs_reopen_file(file->f_path.dentry->d_inode,
-					file, TRUE);
+				rc = cifs_reopen_file(file, TRUE);
 				if (rc != 0)
 					break;
 			}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f414526e476a..3e87dad3367c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/inode.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2005
+ *   Copyright (C) International Business Machines  Corp., 2002,2007
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -90,7 +90,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 				(*pinode)->i_ino =
 					(unsigned long)findData.UniqueId;
 			} /* note ino incremented to unique num in new_inode */
-			if(sb->s_flags & MS_NOATIME)
+			if (sb->s_flags & MS_NOATIME)
 				(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
 				
 			insert_inode_hash(*pinode);
@@ -139,8 +139,17 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 			inode->i_mode |= S_IFREG;
 			cFYI(1,("unknown type %d",type));
 		}
-		inode->i_uid = le64_to_cpu(findData.Uid);
-		inode->i_gid = le64_to_cpu(findData.Gid);
+		
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+			inode->i_uid = cifs_sb->mnt_uid;
+		else
+			inode->i_uid = le64_to_cpu(findData.Uid);
+
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+			inode->i_gid = cifs_sb->mnt_gid;
+		else
+			inode->i_gid = le64_to_cpu(findData.Gid);
+			
 		inode->i_nlink = le64_to_cpu(findData.Nlinks);
 
 		spin_lock(&inode->i_lock);
@@ -178,13 +187,13 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 				else
 					inode->i_fop = &cifs_file_direct_ops;
-			} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				inode->i_fop = &cifs_file_nobrl_ops;
 			else /* not direct, send byte range locks */ 
 				inode->i_fop = &cifs_file_ops;
 
 			/* check if server can support readpages */
-			if(pTcon->ses->server->maxBuf < 
+			if (pTcon->ses->server->maxBuf < 
 			    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 				inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 			else
@@ -220,7 +229,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 
 	pbuf = buf;
 
-	if(size == 0) {
+	if (size == 0) {
 		inode->i_mode |= S_IFIFO;
 		return 0;
 	} else if (size < 8) {
@@ -239,11 +248,11 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 			         netfid,
 				 24 /* length */, 0 /* offset */,
 				 &bytes_read, &pbuf, &buf_type);
-		if((rc == 0) && (bytes_read >= 8)) {
-			if(memcmp("IntxBLK", pbuf, 8) == 0) {
+		if ((rc == 0) && (bytes_read >= 8)) {
+			if (memcmp("IntxBLK", pbuf, 8) == 0) {
 				cFYI(1,("Block device"));
 				inode->i_mode |= S_IFBLK;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -251,10 +260,10 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
 				}
-			} else if(memcmp("IntxCHR", pbuf, 8) == 0) {
+			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
 				cFYI(1,("Char device"));
 				inode->i_mode |= S_IFCHR;
-				if(bytes_read == 24) {
+				if (bytes_read == 24) {
 					/* we have enough to decode dev num */
 					__u64 mjr; /* major */
 					__u64 mnr; /* minor */
@@ -262,7 +271,7 @@ static int decode_sfu_inode(struct inode * inode, __u64 size,
 					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
 					inode->i_rdev = MKDEV(mjr, mnr);
                                 }
-			} else if(memcmp("IntxLNK", pbuf, 7) == 0) {
+			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
 				cFYI(1,("Symlink"));
 				inode->i_mode |= S_IFLNK;
 			} else {
@@ -293,7 +302,7 @@ static int get_sfu_uid_mode(struct inode * inode,
 	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			ea_value, 4 /* size of buf */, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if(rc < 0)
+	if (rc < 0)
 		return (int)rc;
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
@@ -348,7 +357,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* BB optimize code so we do not make the above call
 		when server claims no NT SMB support and the above call
 		failed at least once - set flag in tcon or mount */
-		if((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+		if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
 			rc = SMBQueryInformation(xid, pTcon, search_path,
 					pfindData, cifs_sb->local_nls, 
 					cifs_sb->mnt_cifs_flags &
@@ -425,7 +434,7 @@ int cifs_get_inode_info(struct inode **pinode,
 				} else /* do we need cast or hash to ino? */
 					(*pinode)->i_ino = inode_num;
 			} /* else ino incremented to unique num in new_inode*/
-			if(sb->s_flags & MS_NOATIME)
+			if (sb->s_flags & MS_NOATIME)
 				(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
 			insert_inode_hash(*pinode);
 		}
@@ -442,7 +451,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		if(pfindData->LastAccessTime)
+		if (pfindData->LastAccessTime)
 			inode->i_atime = cifs_NTtimeToUnix
 				(le64_to_cpu(pfindData->LastAccessTime));
 		else /* do not need to use current_fs_time - time not stored */
@@ -452,7 +461,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		inode->i_ctime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 		cFYI(0, ("Attributes came in as 0x%x", attr));
-		if(adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
+		if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
 			inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
 	                inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
 		}
@@ -521,8 +530,10 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		/* BB fill in uid and gid here? with help from winbind? 
 		   or retrieve from NTFS stream extended attribute */
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 			/* fill in uid, gid, mode from server ACL */
+			/* BB FIXME this should also take into account the
+			 * default uid specified on mount if present */
 			get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
 		} else if (atomic_read(&cifsInfo->inUse) == 0) {
 			inode->i_uid = cifs_sb->mnt_uid;
@@ -541,12 +552,12 @@ int cifs_get_inode_info(struct inode **pinode,
 						&cifs_file_direct_nobrl_ops;
 				else
 					inode->i_fop = &cifs_file_direct_ops;
-			} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				inode->i_fop = &cifs_file_nobrl_ops;
 			else /* not direct, send byte range locks */
 				inode->i_fop = &cifs_file_ops;
 
-			if(pTcon->ses->server->maxBuf < 
+			if (pTcon->ses->server->maxBuf < 
 			     PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
 				inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 			else
@@ -597,7 +608,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	xid = GetXid();
 
-	if(inode)
+	if (inode)
 		cifs_sb = CIFS_SB(inode->i_sb);
 	else
 		cifs_sb = CIFS_SB(direntry->d_sb);
@@ -723,7 +734,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 					   when needed */
 		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
 	}
-	if(inode) {
+	if (inode) {
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
 		cifsInode->time = 0;	/* force revalidate of dir as well */
@@ -734,6 +745,136 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static void posix_fill_in_inode(struct inode *tmp_inode,
+	FILE_UNIX_BASIC_INFO *pData, int *pobject_type, int isNewInode)
+{
+	loff_t local_size;
+	struct timespec local_mtime;
+
+	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
+
+	__u32 type = le32_to_cpu(pData->Type);
+	__u64 num_of_bytes = le64_to_cpu(pData->NumOfBytes);
+	__u64 end_of_file = le64_to_cpu(pData->EndOfFile);
+	cifsInfo->time = jiffies;
+	atomic_inc(&cifsInfo->inUse);
+
+	/* save mtime and size */
+	local_mtime = tmp_inode->i_mtime;
+	local_size  = tmp_inode->i_size;
+
+	tmp_inode->i_atime =
+	    cifs_NTtimeToUnix(le64_to_cpu(pData->LastAccessTime));
+	tmp_inode->i_mtime =
+	    cifs_NTtimeToUnix(le64_to_cpu(pData->LastModificationTime));
+	tmp_inode->i_ctime =
+	    cifs_NTtimeToUnix(le64_to_cpu(pData->LastStatusChange));
+
+	tmp_inode->i_mode = le64_to_cpu(pData->Permissions);
+	/* since we set the inode type below we need to mask off type
+           to avoid strange results if bits above were corrupt */
+        tmp_inode->i_mode &= ~S_IFMT;
+	if (type == UNIX_FILE) {
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+	} else if (type == UNIX_SYMLINK) {
+		*pobject_type = DT_LNK;
+		tmp_inode->i_mode |= S_IFLNK;
+	} else if (type == UNIX_DIR) {
+		*pobject_type = DT_DIR;
+		tmp_inode->i_mode |= S_IFDIR;
+	} else if (type == UNIX_CHARDEV) {
+		*pobject_type = DT_CHR;
+		tmp_inode->i_mode |= S_IFCHR;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_BLOCKDEV) {
+		*pobject_type = DT_BLK;
+		tmp_inode->i_mode |= S_IFBLK;
+		tmp_inode->i_rdev = MKDEV(le64_to_cpu(pData->DevMajor),
+				le64_to_cpu(pData->DevMinor) & MINORMASK);
+	} else if (type == UNIX_FIFO) {
+		*pobject_type = DT_FIFO;
+		tmp_inode->i_mode |= S_IFIFO;
+	} else if (type == UNIX_SOCKET) {
+		*pobject_type = DT_SOCK;
+		tmp_inode->i_mode |= S_IFSOCK;
+	} else {
+		/* safest to just call it a file */
+		*pobject_type = DT_REG;
+		tmp_inode->i_mode |= S_IFREG;
+		cFYI(1,("unknown inode type %d",type)); 
+	}
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("object type: %d", type));
+#endif
+	tmp_inode->i_uid = le64_to_cpu(pData->Uid);
+	tmp_inode->i_gid = le64_to_cpu(pData->Gid);
+	tmp_inode->i_nlink = le64_to_cpu(pData->Nlinks);
+
+	spin_lock(&tmp_inode->i_lock);
+	if (is_size_safe_to_change(cifsInfo, end_of_file)) {
+		/* can not safely change the file size here if the 
+		client is writing to it due to potential races */
+		i_size_write(tmp_inode, end_of_file);
+
+	/* 512 bytes (2**9) is the fake blocksize that must be used */
+	/* for this calculation, not the real blocksize */
+		tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9;
+	}
+	spin_unlock(&tmp_inode->i_lock);
+
+	if (S_ISREG(tmp_inode->i_mode)) {
+		cFYI(1, ("File inode"));
+		tmp_inode->i_op = &cifs_file_inode_ops;
+
+		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
+			else
+				tmp_inode->i_fop = &cifs_file_direct_ops;
+		
+		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			tmp_inode->i_fop = &cifs_file_nobrl_ops;
+		else
+			tmp_inode->i_fop = &cifs_file_ops;
+
+		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		   (cifs_sb->tcon->ses->server->maxBuf < 
+			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			tmp_inode->i_data.a_ops = &cifs_addr_ops;
+
+		if(isNewInode)
+			return; /* No sense invalidating pages for new inode since we
+					   have not started caching readahead file data yet */
+
+		if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) &&
+			(local_size == tmp_inode->i_size)) {
+			cFYI(1, ("inode exists but unchanged"));
+		} else {
+			/* file may have changed on server */
+			cFYI(1, ("invalidate inode, readdir detected change"));
+			invalidate_remote_inode(tmp_inode);
+		}
+	} else if (S_ISDIR(tmp_inode->i_mode)) {
+		cFYI(1, ("Directory inode"));
+		tmp_inode->i_op = &cifs_dir_inode_ops;
+		tmp_inode->i_fop = &cifs_dir_ops;
+	} else if (S_ISLNK(tmp_inode->i_mode)) {
+		cFYI(1, ("Symbolic Link inode"));
+		tmp_inode->i_op = &cifs_symlink_inode_ops;
+/* tmp_inode->i_fop = *//* do not need to set to anything */
+	} else {
+		cFYI(1, ("Special inode")); 
+		init_special_inode(tmp_inode, tmp_inode->i_mode,
+				   tmp_inode->i_rdev);
+	}	
+}
+
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
 	int rc = 0;
@@ -755,6 +896,71 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		FreeXid(xid);
 		return -ENOMEM;
 	}
+	
+	if((pTcon->ses->capabilities & CAP_UNIX) && 
+		(CIFS_UNIX_POSIX_PATH_OPS_CAP &	
+			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
+		u32 oplock = 0;
+		FILE_UNIX_BASIC_INFO * pInfo = 
+			kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+		if(pInfo == NULL) {
+			rc = -ENOMEM;
+			goto mkdir_out;
+		}
+			
+		rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT,
+				mode, NULL /* netfid */, pInfo, &oplock,
+				full_path, cifs_sb->local_nls, 
+				cifs_sb->mnt_cifs_flags & 
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc) {
+			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			d_drop(direntry);
+		} else {
+			int obj_type;
+			if (pInfo->Type == -1) /* no return info - go query */
+				goto mkdir_get_info; 
+/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */
+			inc_nlink(inode);
+			if (pTcon->nocase)
+				direntry->d_op = &cifs_ci_dentry_ops;
+			else
+				direntry->d_op = &cifs_dentry_ops;
+
+			newinode = new_inode(inode->i_sb);
+			if (newinode == NULL)
+				goto mkdir_get_info;
+			/* Is an i_ino of zero legal? */
+			/* Are there sanity checks we can use to ensure that
+			   the server is really filling in that field? */
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+				newinode->i_ino =
+					(unsigned long)pInfo->UniqueId;
+			} /* note ino incremented to unique num in new_inode */
+			if(inode->i_sb->s_flags & MS_NOATIME)
+				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
+			newinode->i_nlink = 2;
+
+			insert_inode_hash(newinode);
+			d_instantiate(direntry, newinode);
+
+			/* we already checked in POSIXCreate whether
+			   frame was long enough */
+			posix_fill_in_inode(direntry->d_inode,
+					pInfo, &obj_type, 1 /* NewInode */);
+#ifdef CONFIG_CIFS_DEBUG2
+			cFYI(1,("instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode));
+
+			if(newinode->i_nlink != 2)
+				cFYI(1,("unexpected number of links %d",
+					newinode->i_nlink));
+#endif
+		}
+		kfree(pInfo);
+		goto mkdir_out;
+	}	
+	
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -762,6 +968,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
+mkdir_get_info:		
 		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
@@ -775,8 +982,10 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		else
 			direntry->d_op = &cifs_dentry_ops;
 		d_instantiate(direntry, newinode);
-		if (direntry->d_inode)
-			direntry->d_inode->i_nlink = 2;
+		 /* setting nlink not necessary except in cases where we
+		  * failed to get it from the server or was set bogus */ 
+		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
+				direntry->d_inode->i_nlink = 2; 
 		if (cifs_sb->tcon->ses->capabilities & CAP_UNIX)
 			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 				CIFSSMBUnixSetPerms(xid, pTcon, full_path,
@@ -812,6 +1021,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			}
 		}
 	}
+mkdir_out:	
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1339,17 +1549,17 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 					cpu_to_le32(cifsInode->cifsAttrs |
 						    ATTR_READONLY);
 			}
-		} else if ((mode & S_IWUGO) == S_IWUGO) {
-			if (cifsInode->cifsAttrs & ATTR_READONLY) {
-				set_dosattr = TRUE;
-				time_buf.Attributes =
-					cpu_to_le32(cifsInode->cifsAttrs &
-						    (~ATTR_READONLY));
-				/* Windows ignores set to zero */
-				if(time_buf.Attributes == 0)
-					time_buf.Attributes |= 
-						cpu_to_le32(ATTR_NORMAL);
-			}
+		} else if (cifsInode->cifsAttrs & ATTR_READONLY) {
+			/* If file is readonly on server, we would
+			not be able to write to it - so if any write
+			bit is enabled for user or group or other we
+			need to at least try to remove r/o dos attr */
+			set_dosattr = TRUE;
+			time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs &
+					    (~ATTR_READONLY));
+			/* Windows ignores set to zero */
+			if(time_buf.Attributes == 0)
+				time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
 		}
 		/* BB to be implemented -
 		   via Windows security descriptors or streams */
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 992e80edc720..53e304d59544 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -30,6 +30,9 @@
 #include <linux/fs.h>
 #include <asm/div64.h>
 #include <asm/byteorder.h>
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+#include <linux/inet.h>
+#endif
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -129,11 +132,27 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
 /* Convert string containing dotted ip address to binary form */
 /* returns 0 if invalid address */
 
-/* BB add address family, change rc to status flag and return union or for ipv6 */
-/*  will need parent to call something like inet_pton to convert ipv6 address  BB */
 int
 cifs_inet_pton(int address_family, char *cp,void *dst)
 {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+	int ret = 0;
+
+	/* calculate length by finding first slash or NULL */
+	/* BB Should we convert '/' slash to '\' here since it seems already done
+	   before this */
+	if( address_family == AF_INET ){
+		ret = in4_pton(cp, -1 /* len */, dst , '\\', NULL);	
+	} else if( address_family == AF_INET6 ){
+		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+	}
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1,("address conversion returned %d for %s", ret, cp));
+#endif
+	if (ret > 0)
+		ret = 1;
+	return ret;
+#else
 	int value;
 	int digit;
 	int i;
@@ -192,6 +211,7 @@ cifs_inet_pton(int address_family, char *cp,void *dst)
 
 	*((__be32 *)dst) = *((__be32 *) bytes) | htonl(value);
 	return 1; /* success */
+#endif /* EXPERIMENTAL */	
 }
 
 /*****************************************************************************
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2a374d5215ab..c08bda9fcac6 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -23,7 +23,6 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/stat.h>
-#include <linux/smp_lock.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -37,19 +36,19 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 {
 	struct cifsFileInfo * cf;
 
-	if(file) {
+	if (file) {
 		cf = file->private_data;
-		if(cf == NULL) {
+		if (cf == NULL) {
 			cFYI(1,("empty cifs private file data"));
 			return;
 		}
-		if(cf->invalidHandle) {
+		if (cf->invalidHandle) {
 			cFYI(1,("invalid handle"));
 		}
-		if(cf->srch_inf.endOfSearch) {
+		if (cf->srch_inf.endOfSearch) {
 			cFYI(1,("end of search"));
 		}
-		if(cf->srch_inf.emptyDir) {
+		if (cf->srch_inf.emptyDir) {
 			cFYI(1,("empty dir"));
 		}
 		
@@ -77,17 +76,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 		cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
 /* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
-		if(*ptmp_inode == NULL) {
+		if (*ptmp_inode == NULL) {
 			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-			if(*ptmp_inode == NULL)
+			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
-		if(tmp_dentry == NULL) {
+		if (tmp_dentry == NULL) {
 			cERROR(1,("Failed allocating dentry"));
 			*ptmp_inode = NULL;
 			return rc;
@@ -98,9 +97,9 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
-		if(*ptmp_inode == NULL)
+		if (*ptmp_inode == NULL)
 			return rc;
-		if(file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
+		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
 			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;			
 		rc = 2;
 	}
@@ -112,7 +111,7 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 
 static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
 {
-	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+	if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
 		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
 		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
@@ -137,7 +136,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	local_mtime = tmp_inode->i_mtime;
 	local_size  = tmp_inode->i_size;
 
-	if(new_buf_type) {
+	if (new_buf_type) {
 		FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
 
 		attr = le32_to_cpu(pfindData->ExtFileAttributes);
@@ -193,7 +192,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (attr & ATTR_DIRECTORY) {
 		*pobject_type = DT_DIR;
 		/* override default perms since we do not lock dirs */
-		if(atomic_read(&cifsInfo->inUse) == 0) {
+		if (atomic_read(&cifsInfo->inUse) == 0) {
 			tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
 		}
 		tmp_inode->i_mode |= S_IFDIR;
@@ -250,25 +249,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	if (S_ISREG(tmp_inode->i_mode)) {
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
 		
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode
 				   since have not started caching readahead file
 				   data yet */
@@ -357,8 +356,14 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1,("unknown inode type %d",type)); 
 	}
 
-	tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
-	tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		tmp_inode->i_uid = cifs_sb->mnt_uid;
+	else
+		tmp_inode->i_uid = le64_to_cpu(pfindData->Uid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		tmp_inode->i_gid = cifs_sb->mnt_gid;
+	else
+		tmp_inode->i_gid = le64_to_cpu(pfindData->Gid);
 	tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks);
 
 	spin_lock(&tmp_inode->i_lock);
@@ -377,25 +382,24 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
 
-		if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
-			if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 				tmp_inode->i_fop = &cifs_file_direct_nobrl_ops;
 			else
 				tmp_inode->i_fop = &cifs_file_direct_ops;
-		
-		} else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+		} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
 			tmp_inode->i_fop = &cifs_file_nobrl_ops;
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
+		if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf < 
 			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
 			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
-		if(isNewInode)
+		if (isNewInode)
 			return; /* No sense invalidating pages for new inode since we
 					   have not started caching readahead file data yet */
 
@@ -430,34 +434,28 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL) {
 		file->private_data = 
-			kmalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
+			kzalloc(sizeof(struct cifsFileInfo),GFP_KERNEL);
 	}
 
-	if(file->private_data == NULL) {
+	if (file->private_data == NULL)
 		return -ENOMEM;
-	} else {
-		memset(file->private_data,0,sizeof(struct cifsFileInfo));
-	}
 	cifsFile = file->private_data;
 	cifsFile->invalidHandle = TRUE;
 	cifsFile->srch_inf.endOfSearch = FALSE;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	if(cifs_sb == NULL)
+	if (cifs_sb == NULL)
 		return -EINVAL;
 
 	pTcon = cifs_sb->tcon;
-	if(pTcon == NULL)
+	if (pTcon == NULL)
 		return -EINVAL;
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 
-	if(full_path == NULL) {
+	if (full_path == NULL) {
 		return -ENOMEM;
 	}
 
@@ -480,9 +478,9 @@ ffirst_retry:
 		&cifsFile->netfid, &cifsFile->srch_inf,
 		cifs_sb->mnt_cifs_flags & 
 			CIFS_MOUNT_MAP_SPECIAL_CHR, CIFS_DIR_SEP(cifs_sb));
-	if(rc == 0)
+	if (rc == 0)
 		cifsFile->invalidHandle = FALSE;
-	if((rc == -EOPNOTSUPP) && 
+	if ((rc == -EOPNOTSUPP) && 
 		(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
 		goto ffirst_retry;
@@ -498,7 +496,7 @@ static int cifs_unicode_bytelen(char *str)
 	__le16 * ustr = (__le16 *)str;
 
 	for(len=0;len <= PATH_MAX;len++) {
-		if(ustr[len] == 0)
+		if (ustr[len] == 0)
 			return len << 1;
 	}
 	cFYI(1,("Unicode string longer than PATH_MAX found"));
@@ -510,7 +508,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 	char * new_entry;
 	FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
 
-	if(level == SMB_FIND_FILE_INFO_STANDARD) {
+	if (level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pfData;
 		pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
 
@@ -520,12 +518,12 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
 	cFYI(1,("new entry %p old entry %p",new_entry,old_entry));
 	/* validate that new_entry is not past end of SMB */
-	if(new_entry >= end_of_smb) {
+	if (new_entry >= end_of_smb) {
 		cERROR(1,
 		      ("search entry %p began after end of SMB %p old entry %p",
 			new_entry, end_of_smb, old_entry)); 
 		return NULL;
-	} else if(((level == SMB_FIND_FILE_INFO_STANDARD) &&
+	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) ||
 		  ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
@@ -546,39 +544,39 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 	char * filename = NULL;
 	int len = 0; 
 
-	if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
+	if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
 		FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
-		if(cfile->srch_inf.unicode) {
+		if (cfile->srch_inf.unicode) {
 			len = cifs_unicode_bytelen(filename);
 		} else {
 			/* BB should we make this strnlen of PATH_MAX? */
 			len = strnlen(filename, 5);
 		}
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO * pFindData = 
 			(FILE_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 
+	} else if (cfile->srch_inf.info_level == 
 			SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
 		FILE_FULL_DIRECTORY_INFO * pFindData = 
 			(FILE_FULL_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level ==
+	} else if (cfile->srch_inf.info_level ==
 			SMB_FIND_FILE_ID_FULL_DIR_INFO) {
 		SEARCH_ID_FULL_DIR_INFO * pFindData = 
 			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 
+	} else if (cfile->srch_inf.info_level == 
 			SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
 		FILE_BOTH_DIRECTORY_INFO * pFindData = 
 			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
+	} else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
 		FIND_FILE_STANDARD_INFO * pFindData =
 			(FIND_FILE_STANDARD_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
@@ -587,25 +585,25 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 		cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
 	}
 
-	if(filename) {
-		if(cfile->srch_inf.unicode) {
+	if (filename) {
+		if (cfile->srch_inf.unicode) {
 			__le16 *ufilename = (__le16 *)filename;
-			if(len == 2) {
+			if (len == 2) {
 				/* check for . */
-				if(ufilename[0] == UNICODE_DOT)
+				if (ufilename[0] == UNICODE_DOT)
 					rc = 1;
-			} else if(len == 4) {
+			} else if (len == 4) {
 				/* check for .. */
-				if((ufilename[0] == UNICODE_DOT)
+				if ((ufilename[0] == UNICODE_DOT)
 				   &&(ufilename[1] == UNICODE_DOT))
 					rc = 2;
 			}
 		} else /* ASCII */ {
-			if(len == 1) {
-				if(filename[0] == '.') 
+			if (len == 1) {
+				if (filename[0] == '.') 
 					rc = 1;
-			} else if(len == 2) {
-				if((filename[0] == '.') && (filename[1] == '.')) 
+			} else if (len == 2) {
+				if((filename[0] == '.') && (filename[1] == '.'))
 					rc = 2;
 			}
 		}
@@ -618,20 +616,10 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
    whether we can use the cached search results from the previous search */
 static int is_dir_changed(struct file * file)
 {
-	struct inode * inode;
-	struct cifsInodeInfo *cifsInfo;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
 
-	if(file->f_path.dentry == NULL)
-		return 0;
-
-	inode = file->f_path.dentry->d_inode;
-
-	if(inode == NULL)
-		return 0;
-
-	cifsInfo = CIFS_I(inode);
-
-	if(cifsInfo->time == 0)
+	if (cifsInfo->time == 0)
 		return 1; /* directory was changed, perhaps due to unlink */
 	else
 		return 0;
@@ -654,7 +642,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	struct cifsFileInfo * cifsFile = file->private_data;
 	/* check if index in the buffer */
 	
-	if((cifsFile == NULL) || (ppCurrentEntry == NULL) || 
+	if ((cifsFile == NULL) || (ppCurrentEntry == NULL) || 
 	   (num_to_ret == NULL))
 		return -ENOENT;
 	
@@ -672,7 +660,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 #ifdef CONFIG_CIFS_DEBUG2
 	dump_cifs_file_struct(file, "In fce ");
 #endif
-	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
+	if (((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
 	     is_dir_changed(file)) || 
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
@@ -681,9 +669,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		CIFSFindClose(xid, pTcon, cifsFile->netfid);
 		kfree(cifsFile->search_resume_name);
 		cifsFile->search_resume_name = NULL;
-		if(cifsFile->srch_inf.ntwrk_buf_start) {
+		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1,("freeing SMB ff cache buf on search rewind"));
-			if(cifsFile->srch_inf.smallBuf)
+			if (cifsFile->srch_inf.smallBuf)
 				cifs_small_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
 			else
@@ -691,7 +679,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 						ntwrk_buf_start);
 		}
 		rc = initiate_cifs_search(xid,file);
-		if(rc) {
+		if (rc) {
 			cFYI(1,("error %d reinitiating a search on rewind",rc));
 			return rc;
 		}
@@ -702,10 +690,10 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	 	cFYI(1,("calling findnext2"));
 		rc = CIFSFindNext(xid,pTcon,cifsFile->netfid, 
 				  &cifsFile->srch_inf);
-		if(rc)
+		if (rc)
 			return -ENOENT;
 	}
-	if(index_to_find < cifsFile->srch_inf.index_of_last_entry) {
+	if (index_to_find < cifsFile->srch_inf.index_of_last_entry) {
 		/* we found the buffer that contains the entry */
 		/* scan and find it */
 		int i;
@@ -851,9 +839,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
 		return -ENOENT;
 
-	if(file->f_path.dentry == NULL)
-		return -ENOENT;
-
 	rc = cifs_entry_is_dot(pfindEntry,pCifsF);
 	/* skip . and .. since we added them first */
 	if(rc != 0) 
@@ -997,11 +982,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	xid = GetXid();
 
-	if(file->f_path.dentry == NULL) {
-		FreeXid(xid);
-		return -EIO;
-	}
-
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 	if(pTcon == NULL)
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 5d0527133266..fcb88fa8d2f2 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -16,6 +16,7 @@
 #include <asm/uaccess.h>
 #include <linux/string.h>
 #include <linux/list.h>
+#include <linux/sched.h>
 
 #include <linux/coda.h>
 #include <linux/coda_linux.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 614175a3b02e..dbff1bd4fb96 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -62,9 +62,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 int coda_init_inodecache(void)
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index a5b5e631ba61..5faacdb1a479 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -16,7 +16,7 @@
 
 #include <asm/system.h>
 #include <linux/signal.h>
-
+#include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
diff --git a/fs/compat.c b/fs/compat.c
index 040a8be38a48..4db6216e5266 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -15,6 +15,7 @@
  *  published by the Free Software Foundation.
  */
 
+#include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
 #include <linux/errno.h>
@@ -24,10 +25,8 @@
 #include <linux/namei.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
-#include <linux/ioctl32.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <linux/sockios.h>	/* for SIOCDEVPRIVATE */
 #include <linux/smb.h>
 #include <linux/smb_mount.h>
 #include <linux/ncp_mount.h>
@@ -45,13 +44,13 @@
 #include <linux/personality.h>
 #include <linux/rwsem.h>
 #include <linux/tsacct_kern.h>
+#include <linux/security.h>
 #include <linux/highmem.h>
+#include <linux/signal.h>
 #include <linux/poll.h>
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 
-#include <net/sock.h>		/* siocdevprivate_ioctl */
-
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/ioctls.h>
@@ -79,30 +78,57 @@ int compat_printk(const char *fmt, ...)
  */
 asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t)
 {
-	struct timeval tv[2];
+	struct timespec tv[2];
 
 	if (t) {
 		if (get_user(tv[0].tv_sec, &t->actime) ||
 		    get_user(tv[1].tv_sec, &t->modtime))
 			return -EFAULT;
-		tv[0].tv_usec = 0;
-		tv[1].tv_usec = 0;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
+}
+
+asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
+{
+	struct timespec tv[2];
+
+	if  (t) {
+		if (get_compat_timespec(&tv[0], &t[0]) ||
+		    get_compat_timespec(&tv[1], &t[1]))
+			return -EFAULT;
+
+		if ((tv[0].tv_nsec == UTIME_OMIT || tv[0].tv_nsec == UTIME_NOW)
+		    && tv[0].tv_sec != 0)
+			return -EINVAL;
+		if ((tv[1].tv_nsec == UTIME_OMIT || tv[1].tv_nsec == UTIME_NOW)
+		    && tv[1].tv_sec != 0)
+			return -EINVAL;
+
+		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
+			return 0;
 	}
-	return do_utimes(AT_FDCWD, filename, t ? tv : NULL);
+	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }
 
 asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
 {
-	struct timeval tv[2];
+	struct timespec tv[2];
 
 	if (t) {
 		if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
-		    get_user(tv[0].tv_usec, &t[0].tv_usec) ||
+		    get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
 		    get_user(tv[1].tv_sec, &t[1].tv_sec) ||
-		    get_user(tv[1].tv_usec, &t[1].tv_usec))
+		    get_user(tv[1].tv_nsec, &t[1].tv_usec))
 			return -EFAULT;
+		if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
+		    tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
+			return -EINVAL;
+		tv[0].tv_nsec *= 1000;
+		tv[1].tv_nsec *= 1000;
 	}
-	return do_utimes(dfd, filename, t ? tv : NULL);
+	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }
 
 asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t)
@@ -312,162 +338,6 @@ out:
 	return error;
 }
 
-/* ioctl32 stuff, used by sparc64, parisc, s390x, ppc64, x86_64, MIPS */
-
-#define IOCTL_HASHSIZE 256
-static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
-
-static inline unsigned long ioctl32_hash(unsigned long cmd)
-{
-	return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE;
-}
-
-static void ioctl32_insert_translation(struct ioctl_trans *trans)
-{
-	unsigned long hash;
-	struct ioctl_trans *t;
-
-	hash = ioctl32_hash (trans->cmd);
-	if (!ioctl32_hash_table[hash])
-		ioctl32_hash_table[hash] = trans;
-	else {
-		t = ioctl32_hash_table[hash];
-		while (t->next)
-			t = t->next;
-		trans->next = NULL;
-		t->next = trans;
-	}
-}
-
-static int __init init_sys32_ioctl(void)
-{
-	int i;
-
-	for (i = 0; i < ioctl_table_size; i++) {
-		if (ioctl_start[i].next != 0) { 
-			printk("ioctl translation %d bad\n",i); 
-			return -1;
-		}
-
-		ioctl32_insert_translation(&ioctl_start[i]);
-	}
-	return 0;
-}
-
-__initcall(init_sys32_ioctl);
-
-static void compat_ioctl_error(struct file *filp, unsigned int fd,
-		unsigned int cmd, unsigned long arg)
-{
-	char buf[10];
-	char *fn = "?";
-	char *path;
-
-	/* find the name of the device. */
-	path = (char *)__get_free_page(GFP_KERNEL);
-	if (path) {
-		fn = d_path(filp->f_path.dentry, filp->f_path.mnt, path, PAGE_SIZE);
-		if (IS_ERR(fn))
-			fn = "?";
-	}
-
-	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
-	if (!isprint(buf[1]))
-		sprintf(buf, "%02x", buf[1]);
-	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
-			"cmd(%08x){%s} arg(%08x) on %s\n",
-			current->comm, current->pid,
-			(int)fd, (unsigned int)cmd, buf,
-			(unsigned int)arg, fn);
-
-	if (path)
-		free_page((unsigned long)path);
-}
-
-asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
-				unsigned long arg)
-{
-	struct file *filp;
-	int error = -EBADF;
-	struct ioctl_trans *t;
-	int fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (!filp)
-		goto out;
-
-	/* RED-PEN how should LSM module know it's handling 32bit? */
-	error = security_file_ioctl(filp, cmd, arg);
-	if (error)
-		goto out_fput;
-
-	/*
-	 * To allow the compat_ioctl handlers to be self contained
-	 * we need to check the common ioctls here first.
-	 * Just handle them with the standard handlers below.
-	 */
-	switch (cmd) {
-	case FIOCLEX:
-	case FIONCLEX:
-	case FIONBIO:
-	case FIOASYNC:
-	case FIOQSIZE:
-		break;
-
-	case FIBMAP:
-	case FIGETBSZ:
-	case FIONREAD:
-		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
-			break;
-		/*FALL THROUGH*/
-
-	default:
-		if (filp->f_op && filp->f_op->compat_ioctl) {
-			error = filp->f_op->compat_ioctl(filp, cmd, arg);
-			if (error != -ENOIOCTLCMD)
-				goto out_fput;
-		}
-
-		if (!filp->f_op ||
-		    (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
-			goto do_ioctl;
-		break;
-	}
-
-	for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) {
-		if (t->cmd == cmd)
-			goto found_handler;
-	}
-
-	if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) &&
-	    cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
-		error = siocdevprivate_ioctl(fd, cmd, arg);
-	} else {
-		static int count;
-
-		if (++count <= 50)
-			compat_ioctl_error(filp, fd, cmd, arg);
-		error = -EINVAL;
-	}
-
-	goto out_fput;
-
- found_handler:
-	if (t->handler) {
-		lock_kernel();
-		error = t->handler(fd, cmd, arg, filp);
-		unlock_kernel();
-		goto out_fput;
-	}
-
- do_ioctl:
-	error = vfs_ioctl(filp, fd, cmd, arg);
- out_fput:
-	fput_light(filp, fput_needed);
- out:
-	return error;
-}
-
 static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
 {
 	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
@@ -901,8 +771,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 }
 
 #define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-#define COMPAT_ROUND_UP(x) (((x)+sizeof(compat_long_t)-1) & \
-				~(sizeof(compat_long_t)-1))
 
 struct compat_old_linux_dirent {
 	compat_ulong_t	d_ino;
@@ -990,7 +858,7 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	struct compat_linux_dirent __user * dirent;
 	struct compat_getdents_callback *buf = __buf;
 	compat_ulong_t d_ino;
-	int reclen = COMPAT_ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
+	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -1065,7 +933,6 @@ out:
 }
 
 #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
-#define COMPAT_ROUND_UP64(x) (((x)+sizeof(u64)-1) & ~(sizeof(u64)-1))
 
 struct compat_getdents_callback64 {
 	struct linux_dirent64 __user *current_dir;
@@ -1080,7 +947,7 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
 	struct linux_dirent64 __user *dirent;
 	struct compat_getdents_callback64 *buf = __buf;
 	int jj = NAME_OFFSET(dirent);
-	int reclen = COMPAT_ROUND_UP64(jj + namlen + 1);
+	int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
 	u64 off;
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
@@ -1593,8 +1460,6 @@ out_ret:
 
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
-#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
-
 /*
  * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
  * 64-bit unsigned longs.
@@ -1603,7 +1468,7 @@ static
 int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 			unsigned long *fdset)
 {
-	nr = ROUND_UP(nr, __COMPAT_NFDBITS);
+	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
 	if (ufdset) {
 		unsigned long odd;
 
@@ -1637,7 +1502,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 		      unsigned long *fdset)
 {
 	unsigned long odd;
-	nr = ROUND_UP(nr, __COMPAT_NFDBITS);
+	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
 
 	if (!ufdset)
 		return 0;
@@ -1679,9 +1544,10 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
 {
 	fd_set_bits fds;
-	char *bits;
+	void *bits;
 	int size, max_fds, ret = -EINVAL;
 	struct fdtable *fdt;
+	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
 
 	if (n < 0)
 		goto out_nofds;
@@ -1699,11 +1565,14 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	 * since we used fdset we need to allocate memory in units of
 	 * long-words.
 	 */
-	ret = -ENOMEM;
 	size = FDS_BYTES(n);
-	bits = kmalloc(6 * size, GFP_KERNEL);
-	if (!bits)
-		goto out_nofds;
+	bits = stack_fds;
+	if (size > sizeof(stack_fds) / 6) {
+		bits = kmalloc(6 * size, GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!bits)
+			goto out_nofds;
+	}
 	fds.in      = (unsigned long *)  bits;
 	fds.out     = (unsigned long *) (bits +   size);
 	fds.ex      = (unsigned long *) (bits + 2*size);
@@ -1735,7 +1604,8 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	    compat_set_fd_set(n, exp, fds.res_ex))
 		ret = -EFAULT;
 out:
-	kfree(bits);
+	if (bits != stack_fds)
+		kfree(bits);
 out_nofds:
 	return ret;
 }
@@ -1759,7 +1629,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
 			timeout = -1;	/* infinite */
 		else {
-			timeout = ROUND_UP(tv.tv_usec, 1000000/HZ);
+			timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
 			timeout += tv.tv_sec * HZ;
 		}
 	}
@@ -1827,7 +1697,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 	do {
 		if (tsp) {
 			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
-				timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
+				timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
 				timeout += ts.tv_sec * (unsigned long)HZ;
 				ts.tv_sec = 0;
 				ts.tv_nsec = 0;
@@ -1923,7 +1793,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		/* We assume that ts.tv_sec is always lower than
 		   the number of seconds that can be expressed in
 		   an s64. Otherwise the compiler bitches at us */
-		timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
+		timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
 		timeout += ts.tv_sec * HZ;
 	}
 
@@ -2335,3 +2205,46 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
 #endif /* TIF_RESTORE_SIGMASK */
 
 #endif /* CONFIG_EPOLL */
+
+#ifdef CONFIG_SIGNALFD
+
+asmlinkage long compat_sys_signalfd(int ufd,
+				    const compat_sigset_t __user *sigmask,
+				    compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t tmp;
+	sigset_t __user *ksigmask;
+
+	if (sigsetsize != sizeof(compat_sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		return -EFAULT;
+	sigset_from_compat(&tmp, &ss32);
+	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
+	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
+		return -EFAULT;
+
+	return sys_signalfd(ufd, ksigmask, sizeof(sigset_t));
+}
+
+#endif /* CONFIG_SIGNALFD */
+
+#ifdef CONFIG_TIMERFD
+
+asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags,
+				   const struct compat_itimerspec __user *utmr)
+{
+	struct itimerspec t;
+	struct itimerspec __user *ut;
+
+	if (get_compat_itimerspec(&t, utmr))
+		return -EFAULT;
+	ut = compat_alloc_user_space(sizeof(*ut));
+	if (copy_to_user(ut, &t, sizeof(t)))
+		return -EFAULT;
+
+	return sys_timerfd(ufd, clockid, flags, ut);
+}
+
+#endif /* CONFIG_TIMERFD */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c68b055fa26e..6b44cdc96fac 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -17,7 +17,6 @@
 #include <linux/compiler.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
@@ -58,7 +57,6 @@
 #include <linux/serial.h>
 #include <linux/if_tun.h>
 #include <linux/ctype.h>
-#include <linux/ioctl32.h>
 #include <linux/syscalls.h>
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
@@ -66,7 +64,6 @@
 #include <linux/atalk.h>
 #include <linux/blktrace_api.h>
 
-#include <net/sock.h>          /* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/rfcomm.h>
@@ -475,7 +472,7 @@ static int bond_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 	};
 }
 
-int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+static int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	struct ifreq __user *u_ifreq64;
 	struct ifreq32 __user *u_ifreq32 = compat_ptr(arg);
@@ -687,8 +684,10 @@ static int hdio_getgeo(unsigned int fd, unsigned int cmd, unsigned long arg)
 	if (!err) {
 		err = copy_to_user (ugeo, &geo, 4);
 		err |= __put_user (geo.start, &ugeo->start);
+		if (err)
+			err = -EFAULT;
 	}
-	return err ? -EFAULT : 0;
+	return err;
 }
 
 static int hdio_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
@@ -1195,6 +1194,7 @@ static int vt_check(struct file *file)
 {
 	struct tty_struct *tty;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct vc_data *vc;
 	
 	if (file->f_op->ioctl != tty_ioctl)
 		return -EINVAL;
@@ -1205,12 +1205,16 @@ static int vt_check(struct file *file)
 	                                                
 	if (tty->driver->ioctl != vt_ioctl)
 		return -EINVAL;
-	
+
+	vc = (struct vc_data *)tty->driver_data;
+	if (!vc_cons_allocated(vc->vc_num)) 	/* impossible? */
+		return -ENOIOCTLCMD;
+
 	/*
 	 * To have permissions to do most of the vt ioctls, we either have
-	 * to be the owner of the tty, or super-user.
+	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
-	if (current->signal->tty == tty || capable(CAP_SYS_ADMIN))
+	if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
 		return 1;
 	return 0;                                                    
 }
@@ -1311,16 +1315,28 @@ static int do_unimap_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg,
 	struct unimapdesc32 tmp;
 	struct unimapdesc32 __user *user_ud = compat_ptr(arg);
 	int perm = vt_check(file);
-	
-	if (perm < 0) return perm;
+	struct vc_data *vc;
+
+	if (perm < 0)
+		return perm;
 	if (copy_from_user(&tmp, user_ud, sizeof tmp))
 		return -EFAULT;
+	if (tmp.entries)
+		if (!access_ok(VERIFY_WRITE, compat_ptr(tmp.entries),
+				tmp.entry_ct*sizeof(struct unipair)))
+			return -EFAULT;
+	vc = ((struct tty_struct *)file->private_data)->driver_data;
 	switch (cmd) {
 	case PIO_UNIMAP:
-		if (!perm) return -EPERM;
-		return con_set_unimap(vc_cons[fg_console].d, tmp.entry_ct, compat_ptr(tmp.entries));
+		if (!perm)
+			return -EPERM;
+		return con_set_unimap(vc, tmp.entry_ct,
+						compat_ptr(tmp.entries));
 	case GIO_UNIMAP:
-		return con_get_unimap(vc_cons[fg_console].d, tmp.entry_ct, &(user_ud->entry_ct), compat_ptr(tmp.entries));
+		if (!perm && fg_console != vc->vc_num)
+			return -EPERM;
+		return con_get_unimap(vc, tmp.entry_ct, &(user_ud->entry_ct),
+						compat_ptr(tmp.entries));
 	}
 	return 0;
 }
@@ -2385,6 +2401,16 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, cmd, (unsigned long)tn);
 }
 
+
+typedef int (*ioctl_trans_handler_t)(unsigned int, unsigned int,
+					unsigned long, struct file *);
+
+struct ioctl_trans {
+	unsigned long cmd;
+	ioctl_trans_handler_t handler;
+	struct ioctl_trans *next;
+};
+
 #define HANDLE_IOCTL(cmd,handler) \
 	{ (cmd), (ioctl_trans_handler_t)(handler) },
 
@@ -2396,9 +2422,844 @@ lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 #define ULONG_IOCTL(cmd) \
 	{ (cmd), (ioctl_trans_handler_t)sys_ioctl },
 
-
-struct ioctl_trans ioctl_start[] = {
-#include <linux/compat_ioctl.h>
+/* ioctl should not be warned about even if it's not implemented.
+   Valid reasons to use this:
+   - It is implemented with ->compat_ioctl on some device, but programs
+   call it on others too.
+   - The ioctl is not implemented in the native kernel, but programs
+   call it commonly anyways.
+   Most other reasons are not valid. */
+#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
+
+static struct ioctl_trans ioctl_start[] = {
+/* compatible ioctls first */
+COMPATIBLE_IOCTL(0x4B50)   /* KDGHWCLK - not in the kernel, but don't complain */
+COMPATIBLE_IOCTL(0x4B51)   /* KDSHWCLK - not in the kernel, but don't complain */
+
+/* Big T */
+COMPATIBLE_IOCTL(TCGETA)
+COMPATIBLE_IOCTL(TCSETA)
+COMPATIBLE_IOCTL(TCSETAW)
+COMPATIBLE_IOCTL(TCSETAF)
+COMPATIBLE_IOCTL(TCSBRK)
+ULONG_IOCTL(TCSBRKP)
+COMPATIBLE_IOCTL(TCXONC)
+COMPATIBLE_IOCTL(TCFLSH)
+COMPATIBLE_IOCTL(TCGETS)
+COMPATIBLE_IOCTL(TCSETS)
+COMPATIBLE_IOCTL(TCSETSW)
+COMPATIBLE_IOCTL(TCSETSF)
+COMPATIBLE_IOCTL(TIOCLINUX)
+COMPATIBLE_IOCTL(TIOCSBRK)
+COMPATIBLE_IOCTL(TIOCCBRK)
+ULONG_IOCTL(TIOCMIWAIT)
+COMPATIBLE_IOCTL(TIOCGICOUNT)
+/* Little t */
+COMPATIBLE_IOCTL(TIOCGETD)
+COMPATIBLE_IOCTL(TIOCSETD)
+COMPATIBLE_IOCTL(TIOCEXCL)
+COMPATIBLE_IOCTL(TIOCNXCL)
+COMPATIBLE_IOCTL(TIOCCONS)
+COMPATIBLE_IOCTL(TIOCGSOFTCAR)
+COMPATIBLE_IOCTL(TIOCSSOFTCAR)
+COMPATIBLE_IOCTL(TIOCSWINSZ)
+COMPATIBLE_IOCTL(TIOCGWINSZ)
+COMPATIBLE_IOCTL(TIOCMGET)
+COMPATIBLE_IOCTL(TIOCMBIC)
+COMPATIBLE_IOCTL(TIOCMBIS)
+COMPATIBLE_IOCTL(TIOCMSET)
+COMPATIBLE_IOCTL(TIOCPKT)
+COMPATIBLE_IOCTL(TIOCNOTTY)
+COMPATIBLE_IOCTL(TIOCSTI)
+COMPATIBLE_IOCTL(TIOCOUTQ)
+COMPATIBLE_IOCTL(TIOCSPGRP)
+COMPATIBLE_IOCTL(TIOCGPGRP)
+ULONG_IOCTL(TIOCSCTTY)
+COMPATIBLE_IOCTL(TIOCGPTN)
+COMPATIBLE_IOCTL(TIOCSPTLCK)
+COMPATIBLE_IOCTL(TIOCSERGETLSR)
+/* Little f */
+COMPATIBLE_IOCTL(FIOCLEX)
+COMPATIBLE_IOCTL(FIONCLEX)
+COMPATIBLE_IOCTL(FIOASYNC)
+COMPATIBLE_IOCTL(FIONBIO)
+COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
+/* 0x00 */
+COMPATIBLE_IOCTL(FIBMAP)
+COMPATIBLE_IOCTL(FIGETBSZ)
+/* 0x03 -- HD/IDE ioctl's used by hdparm and friends.
+ *         Some need translations, these do not.
+ */
+COMPATIBLE_IOCTL(HDIO_GET_IDENTITY)
+COMPATIBLE_IOCTL(HDIO_DRIVE_TASK)
+COMPATIBLE_IOCTL(HDIO_DRIVE_CMD)
+ULONG_IOCTL(HDIO_SET_MULTCOUNT)
+ULONG_IOCTL(HDIO_SET_UNMASKINTR)
+ULONG_IOCTL(HDIO_SET_KEEPSETTINGS)
+ULONG_IOCTL(HDIO_SET_32BIT)
+ULONG_IOCTL(HDIO_SET_NOWERR)
+ULONG_IOCTL(HDIO_SET_DMA)
+ULONG_IOCTL(HDIO_SET_PIO_MODE)
+ULONG_IOCTL(HDIO_SET_NICE)
+ULONG_IOCTL(HDIO_SET_WCACHE)
+ULONG_IOCTL(HDIO_SET_ACOUSTIC)
+ULONG_IOCTL(HDIO_SET_BUSSTATE)
+ULONG_IOCTL(HDIO_SET_ADDRESS)
+COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
+/* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
+COMPATIBLE_IOCTL(0x330)
+/* 0x02 -- Floppy ioctls */
+COMPATIBLE_IOCTL(FDMSGON)
+COMPATIBLE_IOCTL(FDMSGOFF)
+COMPATIBLE_IOCTL(FDSETEMSGTRESH)
+COMPATIBLE_IOCTL(FDFLUSH)
+COMPATIBLE_IOCTL(FDWERRORCLR)
+COMPATIBLE_IOCTL(FDSETMAXERRS)
+COMPATIBLE_IOCTL(FDGETMAXERRS)
+COMPATIBLE_IOCTL(FDGETDRVTYP)
+COMPATIBLE_IOCTL(FDEJECT)
+COMPATIBLE_IOCTL(FDCLRPRM)
+COMPATIBLE_IOCTL(FDFMTBEG)
+COMPATIBLE_IOCTL(FDFMTEND)
+COMPATIBLE_IOCTL(FDRESET)
+COMPATIBLE_IOCTL(FDTWADDLE)
+COMPATIBLE_IOCTL(FDFMTTRK)
+COMPATIBLE_IOCTL(FDRAWCMD)
+/* 0x12 */
+#ifdef CONFIG_BLOCK
+COMPATIBLE_IOCTL(BLKRASET)
+COMPATIBLE_IOCTL(BLKROSET)
+COMPATIBLE_IOCTL(BLKROGET)
+COMPATIBLE_IOCTL(BLKRRPART)
+COMPATIBLE_IOCTL(BLKFLSBUF)
+COMPATIBLE_IOCTL(BLKSECTSET)
+COMPATIBLE_IOCTL(BLKSSZGET)
+COMPATIBLE_IOCTL(BLKTRACESTART)
+COMPATIBLE_IOCTL(BLKTRACESTOP)
+COMPATIBLE_IOCTL(BLKTRACESETUP)
+COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
+ULONG_IOCTL(BLKRASET)
+ULONG_IOCTL(BLKFRASET)
+#endif
+/* RAID */
+COMPATIBLE_IOCTL(RAID_VERSION)
+COMPATIBLE_IOCTL(GET_ARRAY_INFO)
+COMPATIBLE_IOCTL(GET_DISK_INFO)
+COMPATIBLE_IOCTL(PRINT_RAID_DEBUG)
+COMPATIBLE_IOCTL(RAID_AUTORUN)
+COMPATIBLE_IOCTL(CLEAR_ARRAY)
+COMPATIBLE_IOCTL(ADD_NEW_DISK)
+ULONG_IOCTL(HOT_REMOVE_DISK)
+COMPATIBLE_IOCTL(SET_ARRAY_INFO)
+COMPATIBLE_IOCTL(SET_DISK_INFO)
+COMPATIBLE_IOCTL(WRITE_RAID_INFO)
+COMPATIBLE_IOCTL(UNPROTECT_ARRAY)
+COMPATIBLE_IOCTL(PROTECT_ARRAY)
+ULONG_IOCTL(HOT_ADD_DISK)
+ULONG_IOCTL(SET_DISK_FAULTY)
+COMPATIBLE_IOCTL(RUN_ARRAY)
+COMPATIBLE_IOCTL(STOP_ARRAY)
+COMPATIBLE_IOCTL(STOP_ARRAY_RO)
+COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
+COMPATIBLE_IOCTL(GET_BITMAP_FILE)
+ULONG_IOCTL(SET_BITMAP_FILE)
+/* DM */
+COMPATIBLE_IOCTL(DM_VERSION_32)
+COMPATIBLE_IOCTL(DM_REMOVE_ALL_32)
+COMPATIBLE_IOCTL(DM_LIST_DEVICES_32)
+COMPATIBLE_IOCTL(DM_DEV_CREATE_32)
+COMPATIBLE_IOCTL(DM_DEV_REMOVE_32)
+COMPATIBLE_IOCTL(DM_DEV_RENAME_32)
+COMPATIBLE_IOCTL(DM_DEV_SUSPEND_32)
+COMPATIBLE_IOCTL(DM_DEV_STATUS_32)
+COMPATIBLE_IOCTL(DM_DEV_WAIT_32)
+COMPATIBLE_IOCTL(DM_TABLE_LOAD_32)
+COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32)
+COMPATIBLE_IOCTL(DM_TABLE_DEPS_32)
+COMPATIBLE_IOCTL(DM_TABLE_STATUS_32)
+COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32)
+COMPATIBLE_IOCTL(DM_TARGET_MSG_32)
+COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY_32)
+COMPATIBLE_IOCTL(DM_VERSION)
+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
+COMPATIBLE_IOCTL(DM_DEV_CREATE)
+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
+COMPATIBLE_IOCTL(DM_DEV_RENAME)
+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
+COMPATIBLE_IOCTL(DM_DEV_STATUS)
+COMPATIBLE_IOCTL(DM_DEV_WAIT)
+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
+COMPATIBLE_IOCTL(DM_LIST_VERSIONS)
+COMPATIBLE_IOCTL(DM_TARGET_MSG)
+COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY)
+/* Big K */
+COMPATIBLE_IOCTL(PIO_FONT)
+COMPATIBLE_IOCTL(GIO_FONT)
+ULONG_IOCTL(KDSIGACCEPT)
+COMPATIBLE_IOCTL(KDGETKEYCODE)
+COMPATIBLE_IOCTL(KDSETKEYCODE)
+ULONG_IOCTL(KIOCSOUND)
+ULONG_IOCTL(KDMKTONE)
+COMPATIBLE_IOCTL(KDGKBTYPE)
+ULONG_IOCTL(KDSETMODE)
+COMPATIBLE_IOCTL(KDGETMODE)
+ULONG_IOCTL(KDSKBMODE)
+COMPATIBLE_IOCTL(KDGKBMODE)
+ULONG_IOCTL(KDSKBMETA)
+COMPATIBLE_IOCTL(KDGKBMETA)
+COMPATIBLE_IOCTL(KDGKBENT)
+COMPATIBLE_IOCTL(KDSKBENT)
+COMPATIBLE_IOCTL(KDGKBSENT)
+COMPATIBLE_IOCTL(KDSKBSENT)
+COMPATIBLE_IOCTL(KDGKBDIACR)
+COMPATIBLE_IOCTL(KDSKBDIACR)
+COMPATIBLE_IOCTL(KDKBDREP)
+COMPATIBLE_IOCTL(KDGKBLED)
+ULONG_IOCTL(KDSKBLED)
+COMPATIBLE_IOCTL(KDGETLED)
+ULONG_IOCTL(KDSETLED)
+COMPATIBLE_IOCTL(GIO_SCRNMAP)
+COMPATIBLE_IOCTL(PIO_SCRNMAP)
+COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
+COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
+COMPATIBLE_IOCTL(PIO_FONTRESET)
+COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
+/* Big S */
+COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
+COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
+COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK)
+COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY)
+COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
+COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
+COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
+COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
+/* Big T */
+COMPATIBLE_IOCTL(TUNSETNOCSUM)
+COMPATIBLE_IOCTL(TUNSETDEBUG)
+COMPATIBLE_IOCTL(TUNSETPERSIST)
+COMPATIBLE_IOCTL(TUNSETOWNER)
+/* Big V */
+COMPATIBLE_IOCTL(VT_SETMODE)
+COMPATIBLE_IOCTL(VT_GETMODE)
+COMPATIBLE_IOCTL(VT_GETSTATE)
+COMPATIBLE_IOCTL(VT_OPENQRY)
+ULONG_IOCTL(VT_ACTIVATE)
+ULONG_IOCTL(VT_WAITACTIVE)
+ULONG_IOCTL(VT_RELDISP)
+ULONG_IOCTL(VT_DISALLOCATE)
+COMPATIBLE_IOCTL(VT_RESIZE)
+COMPATIBLE_IOCTL(VT_RESIZEX)
+COMPATIBLE_IOCTL(VT_LOCKSWITCH)
+COMPATIBLE_IOCTL(VT_UNLOCKSWITCH)
+COMPATIBLE_IOCTL(VT_GETHIFONTMASK)
+/* Little p (/dev/rtc, /dev/envctrl, etc.) */
+COMPATIBLE_IOCTL(RTC_AIE_ON)
+COMPATIBLE_IOCTL(RTC_AIE_OFF)
+COMPATIBLE_IOCTL(RTC_UIE_ON)
+COMPATIBLE_IOCTL(RTC_UIE_OFF)
+COMPATIBLE_IOCTL(RTC_PIE_ON)
+COMPATIBLE_IOCTL(RTC_PIE_OFF)
+COMPATIBLE_IOCTL(RTC_WIE_ON)
+COMPATIBLE_IOCTL(RTC_WIE_OFF)
+COMPATIBLE_IOCTL(RTC_ALM_SET)
+COMPATIBLE_IOCTL(RTC_ALM_READ)
+COMPATIBLE_IOCTL(RTC_RD_TIME)
+COMPATIBLE_IOCTL(RTC_SET_TIME)
+COMPATIBLE_IOCTL(RTC_WKALM_SET)
+COMPATIBLE_IOCTL(RTC_WKALM_RD)
+/*
+ * These two are only for the sbus rtc driver, but
+ * hwclock tries them on every rtc device first when
+ * running on sparc.  On other architectures the entries
+ * are useless but harmless.
+ */
+COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */
+COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
+/* Little m */
+COMPATIBLE_IOCTL(MTIOCTOP)
+/* Socket level stuff */
+COMPATIBLE_IOCTL(FIOQSIZE)
+COMPATIBLE_IOCTL(FIOSETOWN)
+COMPATIBLE_IOCTL(SIOCSPGRP)
+COMPATIBLE_IOCTL(FIOGETOWN)
+COMPATIBLE_IOCTL(SIOCGPGRP)
+COMPATIBLE_IOCTL(SIOCATMARK)
+COMPATIBLE_IOCTL(SIOCSIFLINK)
+COMPATIBLE_IOCTL(SIOCSIFENCAP)
+COMPATIBLE_IOCTL(SIOCGIFENCAP)
+COMPATIBLE_IOCTL(SIOCSIFNAME)
+COMPATIBLE_IOCTL(SIOCSARP)
+COMPATIBLE_IOCTL(SIOCGARP)
+COMPATIBLE_IOCTL(SIOCDARP)
+COMPATIBLE_IOCTL(SIOCSRARP)
+COMPATIBLE_IOCTL(SIOCGRARP)
+COMPATIBLE_IOCTL(SIOCDRARP)
+COMPATIBLE_IOCTL(SIOCADDDLCI)
+COMPATIBLE_IOCTL(SIOCDELDLCI)
+COMPATIBLE_IOCTL(SIOCGMIIPHY)
+COMPATIBLE_IOCTL(SIOCGMIIREG)
+COMPATIBLE_IOCTL(SIOCSMIIREG)
+COMPATIBLE_IOCTL(SIOCGIFVLAN)
+COMPATIBLE_IOCTL(SIOCSIFVLAN)
+COMPATIBLE_IOCTL(SIOCBRADDBR)
+COMPATIBLE_IOCTL(SIOCBRDELBR)
+/* SG stuff */
+COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
+COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
+COMPATIBLE_IOCTL(SG_EMULATED_HOST)
+ULONG_IOCTL(SG_SET_TRANSFORM)
+COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
+COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
+COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
+COMPATIBLE_IOCTL(SG_GET_SCSI_ID)
+COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA)
+COMPATIBLE_IOCTL(SG_GET_LOW_DMA)
+COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID)
+COMPATIBLE_IOCTL(SG_GET_PACK_ID)
+COMPATIBLE_IOCTL(SG_GET_NUM_WAITING)
+COMPATIBLE_IOCTL(SG_SET_DEBUG)
+COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE)
+COMPATIBLE_IOCTL(SG_GET_COMMAND_Q)
+COMPATIBLE_IOCTL(SG_SET_COMMAND_Q)
+COMPATIBLE_IOCTL(SG_GET_VERSION_NUM)
+COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN)
+COMPATIBLE_IOCTL(SG_SCSI_RESET)
+COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
+COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
+COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
+/* PPP stuff */
+COMPATIBLE_IOCTL(PPPIOCGFLAGS)
+COMPATIBLE_IOCTL(PPPIOCSFLAGS)
+COMPATIBLE_IOCTL(PPPIOCGASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCSASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCGUNIT)
+COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCGMRU)
+COMPATIBLE_IOCTL(PPPIOCSMRU)
+COMPATIBLE_IOCTL(PPPIOCSMAXCID)
+COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP)
+COMPATIBLE_IOCTL(PPPIOCXFERUNIT)
+/* PPPIOCSCOMPRESS is translated */
+COMPATIBLE_IOCTL(PPPIOCGNPMODE)
+COMPATIBLE_IOCTL(PPPIOCSNPMODE)
+COMPATIBLE_IOCTL(PPPIOCGDEBUG)
+COMPATIBLE_IOCTL(PPPIOCSDEBUG)
+/* PPPIOCSPASS is translated */
+/* PPPIOCSACTIVE is translated */
+/* PPPIOCGIDLE is translated */
+COMPATIBLE_IOCTL(PPPIOCNEWUNIT)
+COMPATIBLE_IOCTL(PPPIOCATTACH)
+COMPATIBLE_IOCTL(PPPIOCDETACH)
+COMPATIBLE_IOCTL(PPPIOCSMRRU)
+COMPATIBLE_IOCTL(PPPIOCCONNECT)
+COMPATIBLE_IOCTL(PPPIOCDISCONN)
+COMPATIBLE_IOCTL(PPPIOCATTCHAN)
+COMPATIBLE_IOCTL(PPPIOCGCHAN)
+/* PPPOX */
+COMPATIBLE_IOCTL(PPPOEIOCSFWD)
+COMPATIBLE_IOCTL(PPPOEIOCDFWD)
+/* LP */
+COMPATIBLE_IOCTL(LPGETSTATUS)
+/* ppdev */
+COMPATIBLE_IOCTL(PPSETMODE)
+COMPATIBLE_IOCTL(PPRSTATUS)
+COMPATIBLE_IOCTL(PPRCONTROL)
+COMPATIBLE_IOCTL(PPWCONTROL)
+COMPATIBLE_IOCTL(PPFCONTROL)
+COMPATIBLE_IOCTL(PPRDATA)
+COMPATIBLE_IOCTL(PPWDATA)
+COMPATIBLE_IOCTL(PPCLAIM)
+COMPATIBLE_IOCTL(PPRELEASE)
+COMPATIBLE_IOCTL(PPYIELD)
+COMPATIBLE_IOCTL(PPEXCL)
+COMPATIBLE_IOCTL(PPDATADIR)
+COMPATIBLE_IOCTL(PPNEGOT)
+COMPATIBLE_IOCTL(PPWCTLONIRQ)
+COMPATIBLE_IOCTL(PPCLRIRQ)
+COMPATIBLE_IOCTL(PPSETPHASE)
+COMPATIBLE_IOCTL(PPGETMODES)
+COMPATIBLE_IOCTL(PPGETMODE)
+COMPATIBLE_IOCTL(PPGETPHASE)
+COMPATIBLE_IOCTL(PPGETFLAGS)
+COMPATIBLE_IOCTL(PPSETFLAGS)
+/* CDROM stuff */
+COMPATIBLE_IOCTL(CDROMPAUSE)
+COMPATIBLE_IOCTL(CDROMRESUME)
+COMPATIBLE_IOCTL(CDROMPLAYMSF)
+COMPATIBLE_IOCTL(CDROMPLAYTRKIND)
+COMPATIBLE_IOCTL(CDROMREADTOCHDR)
+COMPATIBLE_IOCTL(CDROMREADTOCENTRY)
+COMPATIBLE_IOCTL(CDROMSTOP)
+COMPATIBLE_IOCTL(CDROMSTART)
+COMPATIBLE_IOCTL(CDROMEJECT)
+COMPATIBLE_IOCTL(CDROMVOLCTRL)
+COMPATIBLE_IOCTL(CDROMSUBCHNL)
+ULONG_IOCTL(CDROMEJECT_SW)
+COMPATIBLE_IOCTL(CDROMMULTISESSION)
+COMPATIBLE_IOCTL(CDROM_GET_MCN)
+COMPATIBLE_IOCTL(CDROMRESET)
+COMPATIBLE_IOCTL(CDROMVOLREAD)
+COMPATIBLE_IOCTL(CDROMSEEK)
+COMPATIBLE_IOCTL(CDROMPLAYBLK)
+COMPATIBLE_IOCTL(CDROMCLOSETRAY)
+ULONG_IOCTL(CDROM_SET_OPTIONS)
+ULONG_IOCTL(CDROM_CLEAR_OPTIONS)
+ULONG_IOCTL(CDROM_SELECT_SPEED)
+ULONG_IOCTL(CDROM_SELECT_DISC)
+ULONG_IOCTL(CDROM_MEDIA_CHANGED)
+ULONG_IOCTL(CDROM_DRIVE_STATUS)
+COMPATIBLE_IOCTL(CDROM_DISC_STATUS)
+COMPATIBLE_IOCTL(CDROM_CHANGER_NSLOTS)
+ULONG_IOCTL(CDROM_LOCKDOOR)
+ULONG_IOCTL(CDROM_DEBUG)
+COMPATIBLE_IOCTL(CDROM_GET_CAPABILITY)
+/* Ignore cdrom.h about these next 5 ioctls, they absolutely do
+ * not take a struct cdrom_read, instead they take a struct cdrom_msf
+ * which is compatible.
+ */
+COMPATIBLE_IOCTL(CDROMREADMODE2)
+COMPATIBLE_IOCTL(CDROMREADMODE1)
+COMPATIBLE_IOCTL(CDROMREADRAW)
+COMPATIBLE_IOCTL(CDROMREADCOOKED)
+COMPATIBLE_IOCTL(CDROMREADALL)
+/* DVD ioctls */
+COMPATIBLE_IOCTL(DVD_READ_STRUCT)
+COMPATIBLE_IOCTL(DVD_WRITE_STRUCT)
+COMPATIBLE_IOCTL(DVD_AUTH)
+/* pktcdvd */
+COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
+/* Big A */
+/* sparc only */
+/* Big Q for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE)
+COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS)
+COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL)
+COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND)
+COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL)
+COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE)
+/* Big T for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE)
+COMPATIBLE_IOCTL(SNDCTL_TMR_START)
+COMPATIBLE_IOCTL(SNDCTL_TMR_STOP)
+COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE)
+COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO)
+COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE)
+COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME)
+COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT)
+/* Little m for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME)
+COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE)
+COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD)
+/* Big P for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_DSP_RESET)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED)
+COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS)
+COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER)
+COMPATIBLE_IOCTL(SNDCTL_DSP_POST)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE)
+COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR)
+/* SNDCTL_DSP_MAPINBUF,  XXX needs translation */
+/* SNDCTL_DSP_MAPOUTBUF,  XXX needs translation */
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO)
+COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX)
+COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY)
+COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS)
+COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER)
+/* Big C for sound/OSS */
+COMPATIBLE_IOCTL(SNDCTL_COPR_RESET)
+COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE)
+COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA)
+COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RUN)
+COMPATIBLE_IOCTL(SNDCTL_COPR_HALT)
+COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG)
+COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG)
+/* Big M for sound/OSS */
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
+/* SOUND_MIXER_READ_ENHANCE,  same value as READ_MUTE */
+/* SOUND_MIXER_READ_LOUD,  same value as READ_MUTE */
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS)
+COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
+COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
+/* SOUND_MIXER_WRITE_ENHANCE,  same value as WRITE_MUTE */
+/* SOUND_MIXER_WRITE_LOUD,  same value as WRITE_MUTE */
+COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC)
+COMPATIBLE_IOCTL(SOUND_MIXER_INFO)
+COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO)
+COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS)
+COMPATIBLE_IOCTL(SOUND_MIXER_AGC)
+COMPATIBLE_IOCTL(SOUND_MIXER_3DSE)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4)
+COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
+COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
+COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
+COMPATIBLE_IOCTL(OSS_GETVERSION)
+/* AUTOFS */
+ULONG_IOCTL(AUTOFS_IOC_READY)
+ULONG_IOCTL(AUTOFS_IOC_FAIL)
+COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
+COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
+COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
+COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
+COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
+COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST)
+COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST)
+COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
+/* Raw devices */
+COMPATIBLE_IOCTL(RAW_SETBIND)
+COMPATIBLE_IOCTL(RAW_GETBIND)
+/* SMB ioctls which do not need any translations */
+COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
+/* Little a */
+COMPATIBLE_IOCTL(ATMSIGD_CTRL)
+COMPATIBLE_IOCTL(ATMARPD_CTRL)
+COMPATIBLE_IOCTL(ATMLEC_CTRL)
+COMPATIBLE_IOCTL(ATMLEC_MCAST)
+COMPATIBLE_IOCTL(ATMLEC_DATA)
+COMPATIBLE_IOCTL(ATM_SETSC)
+COMPATIBLE_IOCTL(SIOCSIFATMTCP)
+COMPATIBLE_IOCTL(SIOCMKCLIP)
+COMPATIBLE_IOCTL(ATMARP_MKIP)
+COMPATIBLE_IOCTL(ATMARP_SETENTRY)
+COMPATIBLE_IOCTL(ATMARP_ENCAP)
+COMPATIBLE_IOCTL(ATMTCP_CREATE)
+COMPATIBLE_IOCTL(ATMTCP_REMOVE)
+COMPATIBLE_IOCTL(ATMMPC_CTRL)
+COMPATIBLE_IOCTL(ATMMPC_DATA)
+/* Watchdog */
+COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
+COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
+COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS)
+COMPATIBLE_IOCTL(WDIOC_GETTEMP)
+COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
+COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
+COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
+/* Big R */
+COMPATIBLE_IOCTL(RNDGETENTCNT)
+COMPATIBLE_IOCTL(RNDADDTOENTCNT)
+COMPATIBLE_IOCTL(RNDGETPOOL)
+COMPATIBLE_IOCTL(RNDADDENTROPY)
+COMPATIBLE_IOCTL(RNDZAPENTCNT)
+COMPATIBLE_IOCTL(RNDCLEARPOOL)
+/* Bluetooth */
+COMPATIBLE_IOCTL(HCIDEVUP)
+COMPATIBLE_IOCTL(HCIDEVDOWN)
+COMPATIBLE_IOCTL(HCIDEVRESET)
+COMPATIBLE_IOCTL(HCIDEVRESTAT)
+COMPATIBLE_IOCTL(HCIGETDEVLIST)
+COMPATIBLE_IOCTL(HCIGETDEVINFO)
+COMPATIBLE_IOCTL(HCIGETCONNLIST)
+COMPATIBLE_IOCTL(HCIGETCONNINFO)
+COMPATIBLE_IOCTL(HCISETRAW)
+COMPATIBLE_IOCTL(HCISETSCAN)
+COMPATIBLE_IOCTL(HCISETAUTH)
+COMPATIBLE_IOCTL(HCISETENCRYPT)
+COMPATIBLE_IOCTL(HCISETPTYPE)
+COMPATIBLE_IOCTL(HCISETLINKPOL)
+COMPATIBLE_IOCTL(HCISETLINKMODE)
+COMPATIBLE_IOCTL(HCISETACLMTU)
+COMPATIBLE_IOCTL(HCISETSCOMTU)
+COMPATIBLE_IOCTL(HCIINQUIRY)
+COMPATIBLE_IOCTL(HCIUARTSETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
+COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
+COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
+COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
+COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
+COMPATIBLE_IOCTL(BNEPCONNADD)
+COMPATIBLE_IOCTL(BNEPCONNDEL)
+COMPATIBLE_IOCTL(BNEPGETCONNLIST)
+COMPATIBLE_IOCTL(BNEPGETCONNINFO)
+COMPATIBLE_IOCTL(CMTPCONNADD)
+COMPATIBLE_IOCTL(CMTPCONNDEL)
+COMPATIBLE_IOCTL(CMTPGETCONNLIST)
+COMPATIBLE_IOCTL(CMTPGETCONNINFO)
+COMPATIBLE_IOCTL(HIDPCONNADD)
+COMPATIBLE_IOCTL(HIDPCONNDEL)
+COMPATIBLE_IOCTL(HIDPGETCONNLIST)
+COMPATIBLE_IOCTL(HIDPGETCONNINFO)
+/* CAPI */
+COMPATIBLE_IOCTL(CAPI_REGISTER)
+COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
+COMPATIBLE_IOCTL(CAPI_GET_VERSION)
+COMPATIBLE_IOCTL(CAPI_GET_SERIAL)
+COMPATIBLE_IOCTL(CAPI_GET_PROFILE)
+COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD)
+COMPATIBLE_IOCTL(CAPI_GET_ERRCODE)
+COMPATIBLE_IOCTL(CAPI_INSTALLED)
+COMPATIBLE_IOCTL(CAPI_GET_FLAGS)
+COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
+COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
+COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
+COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
+/* Siemens Gigaset */
+COMPATIBLE_IOCTL(GIGASET_REDIR)
+COMPATIBLE_IOCTL(GIGASET_CONFIG)
+COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
+COMPATIBLE_IOCTL(GIGASET_VERSION)
+/* Misc. */
+COMPATIBLE_IOCTL(0x41545900)		/* ATYIO_CLKR */
+COMPATIBLE_IOCTL(0x41545901)		/* ATYIO_CLKW */
+COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
+COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
+COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
+COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
+/* USB */
+COMPATIBLE_IOCTL(USBDEVFS_RESETEP)
+COMPATIBLE_IOCTL(USBDEVFS_SETINTERFACE)
+COMPATIBLE_IOCTL(USBDEVFS_SETCONFIGURATION)
+COMPATIBLE_IOCTL(USBDEVFS_GETDRIVER)
+COMPATIBLE_IOCTL(USBDEVFS_DISCARDURB)
+COMPATIBLE_IOCTL(USBDEVFS_CLAIMINTERFACE)
+COMPATIBLE_IOCTL(USBDEVFS_RELEASEINTERFACE)
+COMPATIBLE_IOCTL(USBDEVFS_CONNECTINFO)
+COMPATIBLE_IOCTL(USBDEVFS_HUB_PORTINFO)
+COMPATIBLE_IOCTL(USBDEVFS_RESET)
+COMPATIBLE_IOCTL(USBDEVFS_SUBMITURB32)
+COMPATIBLE_IOCTL(USBDEVFS_REAPURB32)
+COMPATIBLE_IOCTL(USBDEVFS_REAPURBNDELAY32)
+COMPATIBLE_IOCTL(USBDEVFS_CLEAR_HALT)
+/* MTD */
+COMPATIBLE_IOCTL(MEMGETINFO)
+COMPATIBLE_IOCTL(MEMERASE)
+COMPATIBLE_IOCTL(MEMLOCK)
+COMPATIBLE_IOCTL(MEMUNLOCK)
+COMPATIBLE_IOCTL(MEMGETREGIONCOUNT)
+COMPATIBLE_IOCTL(MEMGETREGIONINFO)
+COMPATIBLE_IOCTL(MEMGETBADBLOCK)
+COMPATIBLE_IOCTL(MEMSETBADBLOCK)
+/* NBD */
+ULONG_IOCTL(NBD_SET_SOCK)
+ULONG_IOCTL(NBD_SET_BLKSIZE)
+ULONG_IOCTL(NBD_SET_SIZE)
+COMPATIBLE_IOCTL(NBD_DO_IT)
+COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
+COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
+COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
+ULONG_IOCTL(NBD_SET_SIZE_BLOCKS)
+COMPATIBLE_IOCTL(NBD_DISCONNECT)
+/* i2c */
+COMPATIBLE_IOCTL(I2C_SLAVE)
+COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
+COMPATIBLE_IOCTL(I2C_TENBIT)
+COMPATIBLE_IOCTL(I2C_PEC)
+COMPATIBLE_IOCTL(I2C_RETRIES)
+COMPATIBLE_IOCTL(I2C_TIMEOUT)
+/* wireless */
+COMPATIBLE_IOCTL(SIOCSIWCOMMIT)
+COMPATIBLE_IOCTL(SIOCGIWNAME)
+COMPATIBLE_IOCTL(SIOCSIWNWID)
+COMPATIBLE_IOCTL(SIOCGIWNWID)
+COMPATIBLE_IOCTL(SIOCSIWFREQ)
+COMPATIBLE_IOCTL(SIOCGIWFREQ)
+COMPATIBLE_IOCTL(SIOCSIWMODE)
+COMPATIBLE_IOCTL(SIOCGIWMODE)
+COMPATIBLE_IOCTL(SIOCSIWSENS)
+COMPATIBLE_IOCTL(SIOCGIWSENS)
+COMPATIBLE_IOCTL(SIOCSIWRANGE)
+COMPATIBLE_IOCTL(SIOCSIWPRIV)
+COMPATIBLE_IOCTL(SIOCGIWPRIV)
+COMPATIBLE_IOCTL(SIOCSIWSTATS)
+COMPATIBLE_IOCTL(SIOCGIWSTATS)
+COMPATIBLE_IOCTL(SIOCSIWAP)
+COMPATIBLE_IOCTL(SIOCGIWAP)
+COMPATIBLE_IOCTL(SIOCSIWSCAN)
+COMPATIBLE_IOCTL(SIOCSIWRATE)
+COMPATIBLE_IOCTL(SIOCGIWRATE)
+COMPATIBLE_IOCTL(SIOCSIWRTS)
+COMPATIBLE_IOCTL(SIOCGIWRTS)
+COMPATIBLE_IOCTL(SIOCSIWFRAG)
+COMPATIBLE_IOCTL(SIOCGIWFRAG)
+COMPATIBLE_IOCTL(SIOCSIWTXPOW)
+COMPATIBLE_IOCTL(SIOCGIWTXPOW)
+COMPATIBLE_IOCTL(SIOCSIWRETRY)
+COMPATIBLE_IOCTL(SIOCGIWRETRY)
+COMPATIBLE_IOCTL(SIOCSIWPOWER)
+COMPATIBLE_IOCTL(SIOCGIWPOWER)
+/* hiddev */
+COMPATIBLE_IOCTL(HIDIOCGVERSION)
+COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
+COMPATIBLE_IOCTL(HIDIOCGDEVINFO)
+COMPATIBLE_IOCTL(HIDIOCGSTRING)
+COMPATIBLE_IOCTL(HIDIOCINITREPORT)
+COMPATIBLE_IOCTL(HIDIOCGREPORT)
+COMPATIBLE_IOCTL(HIDIOCSREPORT)
+COMPATIBLE_IOCTL(HIDIOCGREPORTINFO)
+COMPATIBLE_IOCTL(HIDIOCGFIELDINFO)
+COMPATIBLE_IOCTL(HIDIOCGUSAGE)
+COMPATIBLE_IOCTL(HIDIOCSUSAGE)
+COMPATIBLE_IOCTL(HIDIOCGUCODE)
+COMPATIBLE_IOCTL(HIDIOCGFLAG)
+COMPATIBLE_IOCTL(HIDIOCSFLAG)
+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
+COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
+/* dvb */
+COMPATIBLE_IOCTL(AUDIO_STOP)
+COMPATIBLE_IOCTL(AUDIO_PLAY)
+COMPATIBLE_IOCTL(AUDIO_PAUSE)
+COMPATIBLE_IOCTL(AUDIO_CONTINUE)
+COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
+COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
+COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
+COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
+COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
+COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
+COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
+COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
+COMPATIBLE_IOCTL(AUDIO_SET_ID)
+COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
+COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
+COMPATIBLE_IOCTL(AUDIO_SET_EXT_ID)
+COMPATIBLE_IOCTL(AUDIO_SET_ATTRIBUTES)
+COMPATIBLE_IOCTL(AUDIO_SET_KARAOKE)
+COMPATIBLE_IOCTL(DMX_START)
+COMPATIBLE_IOCTL(DMX_STOP)
+COMPATIBLE_IOCTL(DMX_SET_FILTER)
+COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
+COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
+COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
+COMPATIBLE_IOCTL(DMX_GET_CAPS)
+COMPATIBLE_IOCTL(DMX_SET_SOURCE)
+COMPATIBLE_IOCTL(DMX_GET_STC)
+COMPATIBLE_IOCTL(FE_GET_INFO)
+COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD)
+COMPATIBLE_IOCTL(FE_DISEQC_SEND_MASTER_CMD)
+COMPATIBLE_IOCTL(FE_DISEQC_RECV_SLAVE_REPLY)
+COMPATIBLE_IOCTL(FE_DISEQC_SEND_BURST)
+COMPATIBLE_IOCTL(FE_SET_TONE)
+COMPATIBLE_IOCTL(FE_SET_VOLTAGE)
+COMPATIBLE_IOCTL(FE_ENABLE_HIGH_LNB_VOLTAGE)
+COMPATIBLE_IOCTL(FE_READ_STATUS)
+COMPATIBLE_IOCTL(FE_READ_BER)
+COMPATIBLE_IOCTL(FE_READ_SIGNAL_STRENGTH)
+COMPATIBLE_IOCTL(FE_READ_SNR)
+COMPATIBLE_IOCTL(FE_READ_UNCORRECTED_BLOCKS)
+COMPATIBLE_IOCTL(FE_SET_FRONTEND)
+COMPATIBLE_IOCTL(FE_GET_FRONTEND)
+COMPATIBLE_IOCTL(FE_GET_EVENT)
+COMPATIBLE_IOCTL(FE_DISHNETWORK_SEND_LEGACY_CMD)
+COMPATIBLE_IOCTL(VIDEO_STOP)
+COMPATIBLE_IOCTL(VIDEO_PLAY)
+COMPATIBLE_IOCTL(VIDEO_FREEZE)
+COMPATIBLE_IOCTL(VIDEO_CONTINUE)
+COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
+COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
+COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
+COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
+COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
+COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
+COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
+COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
+COMPATIBLE_IOCTL(VIDEO_SET_ID)
+COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
+COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
+COMPATIBLE_IOCTL(VIDEO_SET_SYSTEM)
+COMPATIBLE_IOCTL(VIDEO_SET_HIGHLIGHT)
+COMPATIBLE_IOCTL(VIDEO_SET_SPU)
+COMPATIBLE_IOCTL(VIDEO_GET_NAVI)
+COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
+COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
+COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
+
+/* now things that need handlers */
 HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
 HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
 #ifdef CONFIG_NET
@@ -2594,6 +3455,8 @@ HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
+/* Not implemented in the native kernel */
+IGNORE_IOCTL(SIOCGIFCOUNT)
 HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
@@ -2617,6 +3480,170 @@ COMPATIBLE_IOCTL(LPRESET)
 /*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/
 COMPATIBLE_IOCTL(LPGETFLAGS)
 HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans)
+
+/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
+   but we don't want warnings on other file systems. So declare
+   them as compatible here. */
+#define VFAT_IOCTL_READDIR_BOTH32       _IOR('r', 1, struct compat_dirent[2])
+#define VFAT_IOCTL_READDIR_SHORT32      _IOR('r', 2, struct compat_dirent[2])
+
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
+IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
 };
 
-int ioctl_table_size = ARRAY_SIZE(ioctl_start);
+#define IOCTL_HASHSIZE 256
+static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
+
+static inline unsigned long ioctl32_hash(unsigned long cmd)
+{
+	return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE;
+}
+
+static void compat_ioctl_error(struct file *filp, unsigned int fd,
+		unsigned int cmd, unsigned long arg)
+{
+	char buf[10];
+	char *fn = "?";
+	char *path;
+
+	/* find the name of the device. */
+	path = (char *)__get_free_page(GFP_KERNEL);
+	if (path) {
+		fn = d_path(filp->f_path.dentry, filp->f_path.mnt, path, PAGE_SIZE);
+		if (IS_ERR(fn))
+			fn = "?";
+	}
+
+	 sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
+	if (!isprint(buf[1]))
+		sprintf(buf, "%02x", buf[1]);
+	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+			"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
+			current->comm, current->pid,
+			(int)fd, (unsigned int)cmd, buf,
+			(cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
+			(unsigned int)arg, fn);
+
+	if (path)
+		free_page((unsigned long)path);
+}
+
+asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
+				unsigned long arg)
+{
+	struct file *filp;
+	int error = -EBADF;
+	struct ioctl_trans *t;
+	int fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (!filp)
+		goto out;
+
+	/* RED-PEN how should LSM module know it's handling 32bit? */
+	error = security_file_ioctl(filp, cmd, arg);
+	if (error)
+		goto out_fput;
+
+	/*
+	 * To allow the compat_ioctl handlers to be self contained
+	 * we need to check the common ioctls here first.
+	 * Just handle them with the standard handlers below.
+	 */
+	switch (cmd) {
+	case FIOCLEX:
+	case FIONCLEX:
+	case FIONBIO:
+	case FIOASYNC:
+	case FIOQSIZE:
+		break;
+
+	case FIBMAP:
+	case FIGETBSZ:
+	case FIONREAD:
+		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
+			break;
+		/*FALL THROUGH*/
+
+	default:
+		if (filp->f_op && filp->f_op->compat_ioctl) {
+			error = filp->f_op->compat_ioctl(filp, cmd, arg);
+			if (error != -ENOIOCTLCMD)
+				goto out_fput;
+		}
+
+		if (!filp->f_op ||
+		    (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
+			goto do_ioctl;
+		break;
+	}
+
+	for (t = ioctl32_hash_table[ioctl32_hash(cmd)]; t; t = t->next) {
+		if (t->cmd == cmd)
+			goto found_handler;
+	}
+
+#ifdef CONFIG_NET
+	if (S_ISSOCK(filp->f_path.dentry->d_inode->i_mode) &&
+	    cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
+		error = siocdevprivate_ioctl(fd, cmd, arg);
+	} else
+#endif
+	{
+		static int count;
+
+		if (++count <= 50)
+			compat_ioctl_error(filp, fd, cmd, arg);
+		error = -EINVAL;
+	}
+
+	goto out_fput;
+
+ found_handler:
+	if (t->handler) {
+		lock_kernel();
+		error = t->handler(fd, cmd, arg, filp);
+		unlock_kernel();
+		goto out_fput;
+	}
+
+ do_ioctl:
+	error = vfs_ioctl(filp, fd, cmd, arg);
+ out_fput:
+	fput_light(filp, fput_needed);
+ out:
+	return error;
+}
+
+static void ioctl32_insert_translation(struct ioctl_trans *trans)
+{
+	unsigned long hash;
+	struct ioctl_trans *t;
+
+	hash = ioctl32_hash (trans->cmd);
+	if (!ioctl32_hash_table[hash])
+		ioctl32_hash_table[hash] = trans;
+	else {
+		t = ioctl32_hash_table[hash];
+		while (t->next)
+			t = t->next;
+		trans->next = NULL;
+		t->next = trans;
+	}
+}
+
+static int __init init_sys32_ioctl(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ioctl_start); i++) {
+		if (ioctl_start[i].next != 0) {
+			printk("ioctl translation %d bad\n",i);
+			return -1;
+		}
+
+		ioctl32_insert_translation(&ioctl_start[i]);
+	}
+	return 0;
+}
+__initcall(init_sys32_ioctl);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index d98be5e01328..3527c7c6def8 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -77,36 +77,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
 	return ret;
 }
 
-
-/**
- *	flush_read_buffer - push buffer to userspace.
- *	@buffer:	data buffer for file.
- *	@userbuf:	user-passed buffer.
- *	@count:		number of bytes requested.
- *	@ppos:		file position.
- *
- *	Copy the buffer we filled in fill_read_buffer() to userspace.
- *	This is done at the reader's leisure, copying and advancing
- *	the amount they specify each time.
- *	This may be called continuously until the buffer is empty.
- */
-static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
-			     size_t count, loff_t * ppos)
-{
-	int error;
-
-	if (*ppos > buffer->count)
-		return 0;
-
-	if (count > (buffer->count - *ppos))
-		count = buffer->count - *ppos;
-
-	error = copy_to_user(buf,buffer->page + *ppos,count);
-	if (!error)
-		*ppos += count;
-	return error ? -EFAULT : count;
-}
-
 /**
  *	configfs_read_file - read an attribute.
  *	@file:	file pointer.
@@ -139,7 +109,8 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
 	}
 	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
 		 __FUNCTION__, count, *ppos, buffer->page);
-	retval = flush_read_buffer(buffer,buf,count,ppos);
+	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
+					 buffer->count);
 out:
 	up(&buffer->sem);
 	return retval;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 2ec9beac17cf..ddc003a9d214 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
+#include <linux/sched.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 6f573004cd7d..b00d962de833 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	kset_set_kset_s(&config_subsys, kernel_subsys);
+	kobj_set_kset_s(&config_subsys, kernel_subsys);
 	err = subsystem_register(&config_subsys);
 	if (err) {
 		kmem_cache_destroy(configfs_dir_cachep);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index facd0c89be8f..3d194a2be3f5 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -180,7 +180,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_mapping_page(mapping, blocknr + i, NULL);
+			page = read_mapping_page_async(mapping, blocknr + i,
+									NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/dcache.c b/fs/dcache.c
index d68631f18df1..0e73aa0a0e8b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -21,7 +21,6 @@
 #include <linux/fsnotify.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/hash.h>
 #include <linux/cache.h>
 #include <linux/module.h>
@@ -121,6 +120,28 @@ static void dentry_iput(struct dentry * dentry)
 	}
 }
 
+/**
+ * d_kill - kill dentry and return parent
+ * @dentry: dentry to kill
+ *
+ * Called with dcache_lock and d_lock, releases both.  The dentry must
+ * already be unhashed and removed from the LRU.
+ *
+ * If this is the root of the dentry tree, return NULL.
+ */
+static struct dentry *d_kill(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	list_del(&dentry->d_u.d_child);
+	dentry_stat.nr_dentry--;	/* For d_free, below */
+	/*drops the locks, at that point nobody can reach this dentry */
+	dentry_iput(dentry);
+	parent = dentry->d_parent;
+	d_free(dentry);
+	return dentry == parent ? NULL : parent;
+}
+
 /* 
  * This is dput
  *
@@ -189,28 +210,17 @@ repeat:
 
 unhash_it:
 	__d_drop(dentry);
-
-kill_it: {
-		struct dentry *parent;
-
-		/* If dentry was on d_lru list
-		 * delete it from there
-		 */
-  		if (!list_empty(&dentry->d_lru)) {
-  			list_del(&dentry->d_lru);
-  			dentry_stat.nr_unused--;
-  		}
-  		list_del(&dentry->d_u.d_child);
-		dentry_stat.nr_dentry--;	/* For d_free, below */
-		/*drops the locks, at that point nobody can reach this dentry */
-		dentry_iput(dentry);
-		parent = dentry->d_parent;
-		d_free(dentry);
-		if (dentry == parent)
-			return;
-		dentry = parent;
-		goto repeat;
+kill_it:
+	/* If dentry was on d_lru list
+	 * delete it from there
+	 */
+	if (!list_empty(&dentry->d_lru)) {
+		list_del(&dentry->d_lru);
+		dentry_stat.nr_unused--;
 	}
+	dentry = d_kill(dentry);
+	if (dentry)
+		goto repeat;
 }
 
 /**
@@ -371,22 +381,40 @@ restart:
  * Throw away a dentry - free the inode, dput the parent.  This requires that
  * the LRU list has already been removed.
  *
+ * If prune_parents is true, try to prune ancestors as well.
+ *
  * Called with dcache_lock, drops it and then regains.
  * Called with dentry->d_lock held, drops it.
  */
-static void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry * dentry, int prune_parents)
 {
-	struct dentry * parent;
-
 	__d_drop(dentry);
-	list_del(&dentry->d_u.d_child);
-	dentry_stat.nr_dentry--;	/* For d_free, below */
-	dentry_iput(dentry);
-	parent = dentry->d_parent;
-	d_free(dentry);
-	if (parent != dentry)
-		dput(parent);
+	dentry = d_kill(dentry);
+	if (!prune_parents) {
+		dput(dentry);
+		spin_lock(&dcache_lock);
+		return;
+	}
+
+	/*
+	 * Prune ancestors.  Locking is simpler than in dput(),
+	 * because dcache_lock needs to be taken anyway.
+	 */
 	spin_lock(&dcache_lock);
+	while (dentry) {
+		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
+			return;
+
+		if (dentry->d_op && dentry->d_op->d_delete)
+			dentry->d_op->d_delete(dentry);
+		if (!list_empty(&dentry->d_lru)) {
+			list_del(&dentry->d_lru);
+			dentry_stat.nr_unused--;
+		}
+		__d_drop(dentry);
+		dentry = d_kill(dentry);
+		spin_lock(&dcache_lock);
+	}
 }
 
 /**
@@ -394,6 +422,7 @@ static void prune_one_dentry(struct dentry * dentry)
  * @count: number of entries to try and free
  * @sb: if given, ignore dentries for other superblocks
  *         which are being unmounted.
+ * @prune_parents: if true, try to prune ancestors as well in one go
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -404,7 +433,7 @@ static void prune_one_dentry(struct dentry * dentry)
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count, struct super_block *sb)
+static void prune_dcache(int count, struct super_block *sb, int prune_parents)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
@@ -464,7 +493,7 @@ static void prune_dcache(int count, struct super_block *sb)
 		 * without taking the s_umount lock (I already hold it).
 		 */
 		if (sb && dentry->d_sb == sb) {
-			prune_one_dentry(dentry);
+			prune_one_dentry(dentry, prune_parents);
 			continue;
 		}
 		/*
@@ -479,7 +508,7 @@ static void prune_dcache(int count, struct super_block *sb)
 		s_umount = &dentry->d_sb->s_umount;
 		if (down_read_trylock(s_umount)) {
 			if (dentry->d_sb->s_root != NULL) {
-				prune_one_dentry(dentry);
+				prune_one_dentry(dentry, prune_parents);
 				up_read(s_umount);
 				continue;
 			}
@@ -550,7 +579,7 @@ repeat:
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		prune_one_dentry(dentry);
+		prune_one_dentry(dentry, 1);
 		cond_resched_lock(&dcache_lock);
 		goto repeat;
 	}
@@ -829,7 +858,7 @@ void shrink_dcache_parent(struct dentry * parent)
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found, parent->d_sb);
+		prune_dcache(found, parent->d_sb, 1);
 }
 
 /*
@@ -849,7 +878,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr, NULL);
+		prune_dcache(nr, NULL, 1);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
@@ -1823,6 +1852,16 @@ char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
 	struct vfsmount *rootmnt;
 	struct dentry *root;
 
+	/*
+	 * We have various synthetic filesystems that never get mounted.  On
+	 * these filesystems dentries are never used for lookup purposes, and
+	 * thus don't need to be hashed.  They also don't need a name until a
+	 * user wants to identify the object in /proc/pid/fd/.  The little hack
+	 * below allows us to generate a name for these objects on demand:
+	 */
+	if (dentry->d_op && dentry->d_op->d_dname)
+		return dentry->d_op->d_dname(dentry, buf, buflen);
+
 	read_lock(&current->fs->lock);
 	rootmnt = mntget(current->fs->rootmnt);
 	root = dget(current->fs->root);
@@ -1836,6 +1875,27 @@ char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
 }
 
 /*
+ * Helper function for dentry_operations.d_dname() members
+ */
+char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
+			const char *fmt, ...)
+{
+	va_list args;
+	char temp[64];
+	int sz;
+
+	va_start(args, fmt);
+	sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
+	va_end(args);
+
+	if (sz > sizeof(temp) || sz > buflen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	buffer += buflen - sz;
+	return memcpy(buffer, temp, sz);
+}
+
+/*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
  * returns the length of the buffer filled (which
@@ -2052,12 +2112,8 @@ static void __init dcache_init(unsigned long mempages)
 	 * but it is probably not worth it because of the cache nature
 	 * of the dcache. 
 	 */
-	dentry_cache = kmem_cache_create("dentry_cache",
-					 sizeof(struct dentry),
-					 0,
-					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
-					 NULL, NULL);
+	dentry_cache = KMEM_CACHE(dentry,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
 	
 	set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7b324cfebcb1..ec8896b264de 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -374,7 +374,7 @@ static int __init debugfs_init(void)
 {
 	int retval;
 
-	kset_set_kset_s(&debug_subsys, kernel_subsys);
+	kobj_set_kset_s(&debug_subsys, kernel_subsys);
 	retval = subsystem_register(&debug_subsys);
 	if (retval)
 		return retval;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 643e57b622bd..06ef9a255c76 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -19,6 +19,7 @@
 #include <linux/tty.h>
 #include <linux/devpts_fs.h>
 #include <linux/parser.h>
+#include <linux/fsnotify.h>
 
 #define DEVPTS_SUPER_MAGIC 0x1cd1
 
@@ -178,8 +179,10 @@ int devpts_pty_new(struct tty_struct *tty)
 	inode->i_private = tty;
 
 	dentry = get_node(number);
-	if (!IS_ERR(dentry) && !dentry->d_inode)
+	if (!IS_ERR(dentry) && !dentry->d_inode) {
 		d_instantiate(dentry, inode);
+		fsnotify_create(devpts_root->d_inode, dentry);
+	}
 
 	mutex_unlock(&devpts_root->d_inode->i_mutex);
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d9d0833444f5..8593f3dfd299 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -439,7 +439,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
  * Wait on and process all in-flight BIOs.  This must only be called once
  * all bios have been issued so that the refcount can only decrease.
  * This just waits for all bios to make it through dio_bio_complete.  IO
- * errors are propogated through dio->io_error and should be propogated via
+ * errors are propagated through dio->io_error and should be propagated via
  * dio_complete().
  */
 static void dio_await_completion(struct dio *dio)
@@ -867,7 +867,6 @@ static int do_direct_IO(struct dio *dio)
 do_holes:
 			/* Handle holes */
 			if (!buffer_mapped(map_bh)) {
-				char *kaddr;
 				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
@@ -888,11 +887,8 @@ do_holes:
 					page_cache_release(page);
 					goto out;
 				}
-				kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + (block_in_page << blkbits),
-						0, 1 << blkbits);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
+				zero_user_page(page, block_in_page << blkbits,
+						1 << blkbits, KM_USER0);
 				dio->block_in_file++;
 				block_in_page++;
 				goto next_block;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 6fa7b0d5c043..69a94690e493 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,36 +3,19 @@ menu "Distributed Lock Manager"
 
 config DLM
 	tristate "Distributed Lock Manager (DLM)"
-	depends on SYSFS && (IPV6 || IPV6=n)
+	depends on IPV6 || IPV6=n
 	select CONFIGFS_FS
-	select IP_SCTP if DLM_SCTP
+	select IP_SCTP
 	help
-	  A general purpose distributed lock manager for kernel or userspace
-	  applications.
-
-choice
-	prompt "Select DLM communications protocol"
-	depends on DLM
-	default DLM_TCP
-	help
-	  The DLM Can use TCP or SCTP for it's network communications.
-	  SCTP supports multi-homed operations whereas TCP doesn't.
-	  However, SCTP seems to have stability problems at the moment.
-
-config DLM_TCP
-	bool "TCP/IP"
-
-config DLM_SCTP
-	bool "SCTP"
-
-endchoice
+	A general purpose distributed lock manager for kernel or userspace
+	applications.
 
 config DLM_DEBUG
 	bool "DLM debugging"
 	depends on DLM
 	help
-	  Under the debugfs mount point, the name of each lockspace will
-	  appear as a file in the "dlm" directory.  The output is the
-	  list of resource and locks the local node knows about.
+	Under the debugfs mount point, the name of each lockspace will
+	appear as a file in the "dlm" directory.  The output is the
+	list of resource and locks the local node knows about.
 
 endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 65388944eba0..604cf7dc5f39 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -8,14 +8,12 @@ dlm-y :=			ast.o \
 				member.o \
 				memory.o \
 				midcomms.o \
+				lowcomms.o \
 				rcom.o \
 				recover.o \
 				recoverd.o \
 				requestqueue.o \
 				user.o \
-				util.o
+				util.o 
 dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
 
-dlm-$(CONFIG_DLM_TCP)   += lowcomms-tcp.o
-
-dlm-$(CONFIG_DLM_SCTP)  += lowcomms-sctp.o
-\ No newline at end of file
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index f91d39cb1e0b..6308122890ca 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
 #include "dlm_internal.h"
 #include "lock.h"
 #include "user.h"
+#include "ast.h"
 
 #define WAKE_ASTS  0
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8665c88e5af2..822abdcd1434 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -89,6 +89,7 @@ struct cluster {
 	unsigned int cl_toss_secs;
 	unsigned int cl_scan_secs;
 	unsigned int cl_log_debug;
+	unsigned int cl_protocol;
 };
 
 enum {
@@ -101,6 +102,7 @@ enum {
 	CLUSTER_ATTR_TOSS_SECS,
 	CLUSTER_ATTR_SCAN_SECS,
 	CLUSTER_ATTR_LOG_DEBUG,
+	CLUSTER_ATTR_PROTOCOL,
 };
 
 struct cluster_attribute {
@@ -159,6 +161,7 @@ CLUSTER_ATTR(recover_timer, 1);
 CLUSTER_ATTR(toss_secs, 1);
 CLUSTER_ATTR(scan_secs, 1);
 CLUSTER_ATTR(log_debug, 0);
+CLUSTER_ATTR(protocol, 0);
 
 static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -170,6 +173,7 @@ static struct configfs_attribute *cluster_attrs[] = {
 	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
 	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
 	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
+	[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
 	NULL,
 };
 
@@ -904,6 +908,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_SCAN_SECS          5
 #define DEFAULT_LOG_DEBUG          0
+#define DEFAULT_PROTOCOL           0
 
 struct dlm_config_info dlm_config = {
 	.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -914,6 +919,7 @@ struct dlm_config_info dlm_config = {
 	.ci_recover_timer = DEFAULT_RECOVER_TIMER,
 	.ci_toss_secs = DEFAULT_TOSS_SECS,
 	.ci_scan_secs = DEFAULT_SCAN_SECS,
-	.ci_log_debug = DEFAULT_LOG_DEBUG
+	.ci_log_debug = DEFAULT_LOG_DEBUG,
+	.ci_protocol = DEFAULT_PROTOCOL
 };
 
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 1e978611a96e..967cc3d72e5e 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -26,6 +26,7 @@ struct dlm_config_info {
 	int ci_toss_secs;
 	int ci_scan_secs;
 	int ci_log_debug;
+	int ci_protocol;
 };
 
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 61d93201e1b2..30994d68f6a0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -210,6 +210,9 @@ struct dlm_args {
 #define DLM_IFL_MSTCPY		0x00010000
 #define DLM_IFL_RESEND		0x00020000
 #define DLM_IFL_DEAD		0x00040000
+#define DLM_IFL_OVERLAP_UNLOCK  0x00080000
+#define DLM_IFL_OVERLAP_CANCEL  0x00100000
+#define DLM_IFL_ENDOFLIFE	0x00200000
 #define DLM_IFL_USER		0x00000001
 #define DLM_IFL_ORPHAN		0x00000002
 
@@ -230,8 +233,8 @@ struct dlm_lkb {
 	int8_t			lkb_grmode;	/* granted lock mode */
 	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
-
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
+	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
@@ -339,6 +342,7 @@ struct dlm_header {
 #define DLM_MSG_LOOKUP		11
 #define DLM_MSG_REMOVE		12
 #define DLM_MSG_LOOKUP_REPLY	13
+#define DLM_MSG_PURGE		14
 
 struct dlm_message {
 	struct dlm_header	m_header;
@@ -440,6 +444,9 @@ struct dlm_ls {
 	struct mutex		ls_waiters_mutex;
 	struct list_head	ls_waiters;	/* lkbs needing a reply */
 
+	struct mutex		ls_orphans_mutex;
+	struct list_head	ls_orphans;
+
 	struct list_head	ls_nodes;	/* current nodes in ls */
 	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
 	int			ls_num_nodes;	/* number of nodes in ls */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e725005fafd0..d8d6e729f96b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -85,6 +85,7 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 				    struct dlm_message *ms);
 static int receive_extralen(struct dlm_message *ms);
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
 
 /*
  * Lock compatibilty matrix - thanks Steve
@@ -223,6 +224,16 @@ static inline int is_demoted(struct dlm_lkb *lkb)
 	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
 }
 
+static inline int is_altmode(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
+}
+
+static inline int is_granted(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_status == DLM_LKSTS_GRANTED);
+}
+
 static inline int is_remote(struct dlm_rsb *r)
 {
 	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
@@ -254,6 +265,22 @@ static inline int down_conversion(struct dlm_lkb *lkb)
 	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
 }
 
+static inline int is_overlap_unlock(struct dlm_lkb *lkb)
+{
+	return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
+}
+
+static inline int is_overlap_cancel(struct dlm_lkb *lkb)
+{
+	return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
+}
+
+static inline int is_overlap(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
+				  DLM_IFL_OVERLAP_CANCEL));
+}
+
 static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 {
 	if (is_master_copy(lkb))
@@ -267,6 +294,12 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	dlm_add_ast(lkb, AST_COMP);
 }
 
+static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	queue_cast(r, lkb,
+		   is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
+}
+
 static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
 	if (is_master_copy(lkb))
@@ -547,6 +580,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	lkb->lkb_grmode = DLM_LOCK_IV;
 	kref_init(&lkb->lkb_ref);
 	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
 
 	get_random_bytes(&bucket, sizeof(bucket));
 	bucket &= (ls->ls_lkbtbl_size - 1);
@@ -556,7 +590,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	/* counter can roll over so we must verify lkid is not in use */
 
 	while (lkid == 0) {
-		lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+		lkid = (bucket << 16) | ls->ls_lkbtbl[bucket].counter++;
 
 		list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
 				    lkb_idtbl_list) {
@@ -577,8 +611,8 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 
 static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
 {
-	uint16_t bucket = lkid & 0xFFFF;
 	struct dlm_lkb *lkb;
+	uint16_t bucket = (lkid >> 16);
 
 	list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
 		if (lkb->lkb_id == lkid)
@@ -590,7 +624,7 @@ static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
 static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
 {
 	struct dlm_lkb *lkb;
-	uint16_t bucket = lkid & 0xFFFF;
+	uint16_t bucket = (lkid >> 16);
 
 	if (bucket >= ls->ls_lkbtbl_size)
 		return -EBADSLT;
@@ -620,7 +654,7 @@ static void kill_lkb(struct kref *kref)
 
 static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
-	uint16_t bucket = lkb->lkb_id & 0xFFFF;
+	uint16_t bucket = (lkb->lkb_id >> 16);
 
 	write_lock(&ls->ls_lkbtbl[bucket].lock);
 	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
@@ -735,23 +769,75 @@ static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
 	unhold_lkb(lkb);
 }
 
+static int msg_reply_type(int mstype)
+{
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+		return DLM_MSG_REQUEST_REPLY;
+	case DLM_MSG_CONVERT:
+		return DLM_MSG_CONVERT_REPLY;
+	case DLM_MSG_UNLOCK:
+		return DLM_MSG_UNLOCK_REPLY;
+	case DLM_MSG_CANCEL:
+		return DLM_MSG_CANCEL_REPLY;
+	case DLM_MSG_LOOKUP:
+		return DLM_MSG_LOOKUP_REPLY;
+	}
+	return -1;
+}
+
 /* add/remove lkb from global waiters list of lkb's waiting for
    a reply from a remote node */
 
-static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error = 0;
 
 	mutex_lock(&ls->ls_waiters_mutex);
-	if (lkb->lkb_wait_type) {
-		log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+
+	if (is_overlap_unlock(lkb) ||
+	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
+		switch (mstype) {
+		case DLM_MSG_UNLOCK:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			break;
+		case DLM_MSG_CANCEL:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			break;
+		default:
+			error = -EBUSY;
+			goto out;
+		}
+		lkb->lkb_wait_count++;
+		hold_lkb(lkb);
+
+		log_debug(ls, "add overlap %x cur %d new %d count %d flags %x",
+			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
+			  lkb->lkb_wait_count, lkb->lkb_flags);
 		goto out;
 	}
+
+	DLM_ASSERT(!lkb->lkb_wait_count,
+		   dlm_print_lkb(lkb);
+		   printk("wait_count %d\n", lkb->lkb_wait_count););
+
+	lkb->lkb_wait_count++;
 	lkb->lkb_wait_type = mstype;
-	kref_get(&lkb->lkb_ref);
+	hold_lkb(lkb);
 	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
  out:
+	if (error)
+		log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s",
+			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
+			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
 	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
 }
 
 /* We clear the RESEND flag because we might be taking an lkb off the waiters
@@ -759,34 +845,85 @@ static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
    request reply on the requestqueue) between dlm_recover_waiters_pre() which
    set RESEND and dlm_recover_waiters_post() */
 
-static int _remove_from_waiters(struct dlm_lkb *lkb)
+static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 {
-	int error = 0;
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int overlap_done = 0;
 
-	if (!lkb->lkb_wait_type) {
-		log_print("remove_from_waiters error");
-		error = -EINVAL;
-		goto out;
+	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		overlap_done = 1;
+		goto out_del;
+	}
+
+	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		overlap_done = 1;
+		goto out_del;
+	}
+
+	/* N.B. type of reply may not always correspond to type of original
+	   msg due to lookup->request optimization, verify others? */
+
+	if (lkb->lkb_wait_type) {
+		lkb->lkb_wait_type = 0;
+		goto out_del;
+	}
+
+	log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d",
+		  lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type);
+	return -1;
+
+ out_del:
+	/* the force-unlock/cancel has completed and we haven't recvd a reply
+	   to the op that was in progress prior to the unlock/cancel; we
+	   give up on any reply to the earlier op.  FIXME: not sure when/how
+	   this would happen */
+
+	if (overlap_done && lkb->lkb_wait_type) {
+		log_error(ls, "remove_from_waiters %x reply %d give up on %d",
+			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
+		lkb->lkb_wait_count--;
+		lkb->lkb_wait_type = 0;
 	}
-	lkb->lkb_wait_type = 0;
+
+	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
+
 	lkb->lkb_flags &= ~DLM_IFL_RESEND;
-	list_del(&lkb->lkb_wait_reply);
+	lkb->lkb_wait_count--;
+	if (!lkb->lkb_wait_count)
+		list_del_init(&lkb->lkb_wait_reply);
 	unhold_lkb(lkb);
- out:
-	return error;
+	return 0;
 }
 
-static int remove_from_waiters(struct dlm_lkb *lkb)
+static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error;
 
 	mutex_lock(&ls->ls_waiters_mutex);
-	error = _remove_from_waiters(lkb);
+	error = _remove_from_waiters(lkb, mstype);
 	mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
 
+/* Handles situations where we might be processing a "fake" or "stub" reply in
+   which we can't try to take waiters_mutex again. */
+
+static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error;
+
+	if (ms != &ls->ls_stub_ms)
+		mutex_lock(&ls->ls_waiters_mutex);
+	error = _remove_from_waiters(lkb, ms->m_type);
+	if (ms != &ls->ls_stub_ms)
+		mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
 static void dir_remove(struct dlm_rsb *r)
 {
 	int to_nodeid;
@@ -988,8 +1125,14 @@ static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	_remove_lock(r, lkb);
 }
 
-static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+/* returns: 0 did nothing
+	    1 moved lock to granted
+	   -1 removed lock */
+
+static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
+	int rv = 0;
+
 	lkb->lkb_rqmode = DLM_LOCK_IV;
 
 	switch (lkb->lkb_status) {
@@ -997,6 +1140,7 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		break;
 	case DLM_LKSTS_CONVERT:
 		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		rv = 1;
 		break;
 	case DLM_LKSTS_WAITING:
 		del_lkb(r, lkb);
@@ -1004,15 +1148,17 @@ static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		/* this unhold undoes the original ref from create_lkb()
 		   so this leads to the lkb being freed */
 		unhold_lkb(lkb);
+		rv = -1;
 		break;
 	default:
 		log_print("invalid status for revert %d", lkb->lkb_status);
 	}
+	return rv;
 }
 
-static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
-	revert_lock(r, lkb);
+	return revert_lock(r, lkb);
 }
 
 static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -1055,6 +1201,50 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		queue_cast(r, lkb, 0);
 }
 
+/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
+   change the granted/requested modes.  We're munging things accordingly in
+   the process copy.
+   CONVDEADLK: our grmode may have been forced down to NL to resolve a
+   conversion deadlock
+   ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
+   compatible with other granted locks */
+
+static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
+		log_print("munge_demoted %x invalid reply type %d",
+			  lkb->lkb_id, ms->m_type);
+		return;
+	}
+
+	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
+		log_print("munge_demoted %x invalid modes gr %d rq %d",
+			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
+		return;
+	}
+
+	lkb->lkb_grmode = DLM_LOCK_NL;
+}
+
+static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
+	    ms->m_type != DLM_MSG_GRANT) {
+		log_print("munge_altmode %x invalid reply type %d",
+			  lkb->lkb_id, ms->m_type);
+		return;
+	}
+
+	if (lkb->lkb_exflags & DLM_LKF_ALTPR)
+		lkb->lkb_rqmode = DLM_LOCK_PR;
+	else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
+		lkb->lkb_rqmode = DLM_LOCK_CW;
+	else {
+		log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
+		dlm_print_lkb(lkb);
+	}
+}
+
 static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
 {
 	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
@@ -1499,7 +1689,7 @@ static void process_lookup_list(struct dlm_rsb *r)
 	struct dlm_lkb *lkb, *safe;
 
 	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
-		list_del(&lkb->lkb_rsb_lookup);
+		list_del_init(&lkb->lkb_rsb_lookup);
 		_request_lock(r, lkb);
 		schedule();
 	}
@@ -1530,7 +1720,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
 		if (!list_empty(&r->res_lookup)) {
 			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
 					 lkb_rsb_lookup);
-			list_del(&lkb->lkb_rsb_lookup);
+			list_del_init(&lkb->lkb_rsb_lookup);
 			r->res_first_lkid = lkb->lkb_id;
 			_request_lock(r, lkb);
 		} else
@@ -1614,6 +1804,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
  		      DLM_LKF_FORCEUNLOCK))
 		return -EINVAL;
 
+	if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
+		return -EINVAL;
+
 	args->flags = flags;
 	args->astparam = (long) astarg;
 	return 0;
@@ -1638,6 +1831,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 
 		if (lkb->lkb_wait_type)
 			goto out;
+
+		if (is_overlap(lkb))
+			goto out;
 	}
 
 	lkb->lkb_exflags = args->flags;
@@ -1654,35 +1850,126 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	return rv;
 }
 
+/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
+   for success */
+
+/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
+   because there may be a lookup in progress and it's valid to do
+   cancel/unlockf on it */
+
 static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 {
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int rv = -EINVAL;
 
-	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
+		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+		dlm_print_lkb(lkb);
 		goto out;
+	}
 
-	if (args->flags & DLM_LKF_FORCEUNLOCK)
-		goto out_ok;
+	/* an lkb may still exist even though the lock is EOL'ed due to a
+	   cancel, unlock or failed noqueue request; an app can't use these
+	   locks; return same error as if the lkid had not been found at all */
 
-	if (args->flags & DLM_LKF_CANCEL &&
-	    lkb->lkb_status == DLM_LKSTS_GRANTED)
+	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+		rv = -ENOENT;
 		goto out;
+	}
 
-	if (!(args->flags & DLM_LKF_CANCEL) &&
-	    lkb->lkb_status != DLM_LKSTS_GRANTED)
-		goto out;
+	/* an lkb may be waiting for an rsb lookup to complete where the
+	   lookup was initiated by another lock */
+
+	if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
+		if (!list_empty(&lkb->lkb_rsb_lookup)) {
+			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
+			list_del_init(&lkb->lkb_rsb_lookup);
+			queue_cast(lkb->lkb_resource, lkb,
+				   args->flags & DLM_LKF_CANCEL ?
+				   -DLM_ECANCEL : -DLM_EUNLOCK);
+			unhold_lkb(lkb); /* undoes create_lkb() */
+			rv = -EBUSY;
+			goto out;
+		}
+	}
+
+	/* cancel not allowed with another cancel/unlock in progress */
+
+	if (args->flags & DLM_LKF_CANCEL) {
+		if (lkb->lkb_exflags & DLM_LKF_CANCEL)
+			goto out;
+
+		if (is_overlap(lkb))
+			goto out;
+
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			rv = -EBUSY;
+			goto out;
+		}
+
+		switch (lkb->lkb_wait_type) {
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
+			rv = -EBUSY;
+			goto out;
+		case DLM_MSG_UNLOCK:
+		case DLM_MSG_CANCEL:
+			goto out;
+		}
+		/* add_to_waiters() will set OVERLAP_CANCEL */
+		goto out_ok;
+	}
+
+	/* do we need to allow a force-unlock if there's a normal unlock
+	   already in progress?  in what conditions could the normal unlock
+	   fail such that we'd want to send a force-unlock to be sure? */
+
+	if (args->flags & DLM_LKF_FORCEUNLOCK) {
+		if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
+			goto out;
+
+		if (is_overlap_unlock(lkb))
+			goto out;
 
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			rv = -EBUSY;
+			goto out;
+		}
+
+		switch (lkb->lkb_wait_type) {
+		case DLM_MSG_LOOKUP:
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
+			rv = -EBUSY;
+			goto out;
+		case DLM_MSG_UNLOCK:
+			goto out;
+		}
+		/* add_to_waiters() will set OVERLAP_UNLOCK */
+		goto out_ok;
+	}
+
+	/* normal unlock not allowed if there's any op in progress */
 	rv = -EBUSY;
-	if (lkb->lkb_wait_type)
+	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
 		goto out;
 
  out_ok:
-	lkb->lkb_exflags = args->flags;
+	/* an overlapping op shouldn't blow away exflags from other op */
+	lkb->lkb_exflags |= args->flags;
 	lkb->lkb_sbflags = 0;
 	lkb->lkb_astparam = args->astparam;
-
 	rv = 0;
  out:
+	if (rv)
+		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
+			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
+			  args->flags, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
 	return rv;
 }
 
@@ -1732,9 +2019,24 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		goto out;
 	}
 
-	if (can_be_queued(lkb)) {
-		if (is_demoted(lkb))
+	/* is_demoted() means the can_be_granted() above set the grmode
+	   to NL, and left us on the granted queue.  This auto-demotion
+	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
+	   now grantable.  We have to try to grant other converting locks
+	   before we try again to grant this one. */
+
+	if (is_demoted(lkb)) {
+		grant_pending_convert(r, DLM_LOCK_IV);
+		if (_can_be_granted(r, lkb, 1)) {
+			grant_lock(r, lkb);
+			queue_cast(r, lkb, 0);
 			grant_pending_locks(r);
+			goto out;
+		}
+		/* else fall through and move to convert queue */
+	}
+
+	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
@@ -1759,17 +2061,19 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	return -DLM_EUNLOCK;
 }
 
-/* FIXME: if revert_lock() finds that the lkb is granted, we should
-   skip the queue_cast(ECANCEL).  It indicates that the request/convert
-   completed (and queued a normal ast) just before the cancel; we don't
-   want to clobber the sb_result for the normal ast with ECANCEL. */
+/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
  
 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
-	revert_lock(r, lkb);
-	queue_cast(r, lkb, -DLM_ECANCEL);
-	grant_pending_locks(r);
-	return -DLM_ECANCEL;
+	int error;
+
+	error = revert_lock(r, lkb);
+	if (error) {
+		queue_cast(r, lkb, -DLM_ECANCEL);
+		grant_pending_locks(r);
+		return -DLM_ECANCEL;
+	}
+	return 0;
 }
 
 /*
@@ -2035,6 +2339,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
 
 	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
 		error = 0;
+	if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
+		error = 0;
  out_put:
 	dlm_put_lkb(lkb);
  out:
@@ -2065,31 +2371,14 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
  * receive_lookup_reply		send_lookup_reply
  */
 
-static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
-			  int to_nodeid, int mstype,
-			  struct dlm_message **ms_ret,
-			  struct dlm_mhandle **mh_ret)
+static int _create_message(struct dlm_ls *ls, int mb_len,
+			   int to_nodeid, int mstype,
+			   struct dlm_message **ms_ret,
+			   struct dlm_mhandle **mh_ret)
 {
 	struct dlm_message *ms;
 	struct dlm_mhandle *mh;
 	char *mb;
-	int mb_len = sizeof(struct dlm_message);
-
-	switch (mstype) {
-	case DLM_MSG_REQUEST:
-	case DLM_MSG_LOOKUP:
-	case DLM_MSG_REMOVE:
-		mb_len += r->res_length;
-		break;
-	case DLM_MSG_CONVERT:
-	case DLM_MSG_UNLOCK:
-	case DLM_MSG_REQUEST_REPLY:
-	case DLM_MSG_CONVERT_REPLY:
-	case DLM_MSG_GRANT:
-		if (lkb && lkb->lkb_lvbptr)
-			mb_len += r->res_ls->ls_lvblen;
-		break;
-	}
 
 	/* get_buffer gives us a message handle (mh) that we need to
 	   pass into lowcomms_commit and a message buffer (mb) that we
@@ -2104,7 +2393,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	ms = (struct dlm_message *) mb;
 
 	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
-	ms->m_header.h_lockspace = r->res_ls->ls_global_id;
+	ms->m_header.h_lockspace = ls->ls_global_id;
 	ms->m_header.h_nodeid = dlm_our_nodeid();
 	ms->m_header.h_length = mb_len;
 	ms->m_header.h_cmd = DLM_MSG;
@@ -2116,6 +2405,33 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	return 0;
 }
 
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  int to_nodeid, int mstype,
+			  struct dlm_message **ms_ret,
+			  struct dlm_mhandle **mh_ret)
+{
+	int mb_len = sizeof(struct dlm_message);
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_REMOVE:
+		mb_len += r->res_length;
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (lkb && lkb->lkb_lvbptr)
+			mb_len += r->res_ls->ls_lvblen;
+		break;
+	}
+
+	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
+			       ms_ret, mh_ret);
+}
+
 /* further lowcomms enhancements or alternate implementations may make
    the return value from this function useful at some point */
 
@@ -2176,7 +2492,9 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
 	struct dlm_mhandle *mh;
 	int to_nodeid, error;
 
-	add_to_waiters(lkb, mstype);
+	error = add_to_waiters(lkb, mstype);
+	if (error)
+		return error;
 
 	to_nodeid = r->res_nodeid;
 
@@ -2192,7 +2510,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
 	return 0;
 
  fail:
-	remove_from_waiters(lkb);
+	remove_from_waiters(lkb, msg_reply_type(mstype));
 	return error;
 }
 
@@ -2209,7 +2527,8 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 
 	/* down conversions go without a reply from the master */
 	if (!error && down_conversion(lkb)) {
-		remove_from_waiters(lkb);
+		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+		r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
 		r->res_ls->ls_stub_ms.m_result = 0;
 		r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
@@ -2280,7 +2599,9 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	struct dlm_mhandle *mh;
 	int to_nodeid, error;
 
-	add_to_waiters(lkb, DLM_MSG_LOOKUP);
+	error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+	if (error)
+		return error;
 
 	to_nodeid = dlm_dir_nodeid(r);
 
@@ -2296,7 +2617,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	return 0;
 
  fail:
-	remove_from_waiters(lkb);
+	remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
 	return error;
 }
 
@@ -2656,6 +2977,8 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
 	lock_rsb(r);
 
 	receive_flags_reply(lkb, ms);
+	if (is_altmode(lkb))
+		munge_altmode(lkb, ms);
 	grant_lock_pc(r, lkb, ms);
 	queue_cast(r, lkb, 0);
 
@@ -2736,11 +3059,16 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
 	dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
 }
 
+static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	do_purge(ls, ms->m_nodeid, ms->m_pid);
+}
+
 static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 {
 	struct dlm_lkb *lkb;
 	struct dlm_rsb *r;
-	int error, mstype;
+	int error, mstype, result;
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
@@ -2749,20 +3077,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	mstype = lkb->lkb_wait_type;
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_request_reply not on waiters");
-		goto out;
-	}
-
-	/* this is the value returned from do_request() on the master */
-	error = ms->m_result;
-
 	r = lkb->lkb_resource;
 	hold_rsb(r);
 	lock_rsb(r);
 
+	mstype = lkb->lkb_wait_type;
+	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
+	if (error)
+		goto out;
+
 	/* Optimization: the dir node was also the master, so it took our
 	   lookup as a request and sent request reply instead of lookup reply */
 	if (mstype == DLM_MSG_LOOKUP) {
@@ -2770,14 +3093,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		lkb->lkb_nodeid = r->res_nodeid;
 	}
 
-	switch (error) {
+	/* this is the value returned from do_request() on the master */
+	result = ms->m_result;
+
+	switch (result) {
 	case -EAGAIN:
-		/* request would block (be queued) on remote master;
-		   the unhold undoes the original ref from create_lkb()
-		   so it leads to the lkb being freed */
+		/* request would block (be queued) on remote master */
 		queue_cast(r, lkb, -EAGAIN);
 		confirm_master(r, -EAGAIN);
-		unhold_lkb(lkb);
+		unhold_lkb(lkb); /* undoes create_lkb() */
 		break;
 
 	case -EINPROGRESS:
@@ -2785,41 +3109,64 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		/* request was queued or granted on remote master */
 		receive_flags_reply(lkb, ms);
 		lkb->lkb_remid = ms->m_lkid;
-		if (error)
+		if (is_altmode(lkb))
+			munge_altmode(lkb, ms);
+		if (result)
 			add_lkb(r, lkb, DLM_LKSTS_WAITING);
 		else {
 			grant_lock_pc(r, lkb, ms);
 			queue_cast(r, lkb, 0);
 		}
-		confirm_master(r, error);
+		confirm_master(r, result);
 		break;
 
 	case -EBADR:
 	case -ENOTBLK:
 		/* find_rsb failed to find rsb or rsb wasn't master */
+		log_debug(ls, "receive_request_reply %x %x master diff %d %d",
+			  lkb->lkb_id, lkb->lkb_flags, r->res_nodeid, result);
 		r->res_nodeid = -1;
 		lkb->lkb_nodeid = -1;
-		_request_lock(r, lkb);
+
+		if (is_overlap(lkb)) {
+			/* we'll ignore error in cancel/unlock reply */
+			queue_cast_overlap(r, lkb);
+			unhold_lkb(lkb); /* undoes create_lkb() */
+		} else
+			_request_lock(r, lkb);
 		break;
 
 	default:
-		log_error(ls, "receive_request_reply error %d", error);
+		log_error(ls, "receive_request_reply %x error %d",
+			  lkb->lkb_id, result);
 	}
 
+	if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
+		log_debug(ls, "receive_request_reply %x result %d unlock",
+			  lkb->lkb_id, result);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		send_unlock(r, lkb);
+	} else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
+		log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		send_cancel(r, lkb);
+	} else {
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+	}
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
- out:
 	dlm_put_lkb(lkb);
 }
 
 static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 				    struct dlm_message *ms)
 {
-	int error = ms->m_result;
-
 	/* this is the value returned from do_convert() on the master */
-
-	switch (error) {
+	switch (ms->m_result) {
 	case -EAGAIN:
 		/* convert would block (be queued) on remote master */
 		queue_cast(r, lkb, -EAGAIN);
@@ -2827,6 +3174,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 
 	case -EINPROGRESS:
 		/* convert was queued on remote master */
+		receive_flags_reply(lkb, ms);
+		if (is_demoted(lkb))
+			munge_demoted(lkb, ms);
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
 		break;
@@ -2834,24 +3184,33 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	case 0:
 		/* convert was granted on remote master */
 		receive_flags_reply(lkb, ms);
+		if (is_demoted(lkb))
+			munge_demoted(lkb, ms);
 		grant_lock_pc(r, lkb, ms);
 		queue_cast(r, lkb, 0);
 		break;
 
 	default:
-		log_error(r->res_ls, "receive_convert_reply error %d", error);
+		log_error(r->res_ls, "receive_convert_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
 	}
 }
 
 static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
+	int error;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
-	__receive_convert_reply(r, lkb, ms);
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
 
+	__receive_convert_reply(r, lkb, ms);
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 }
@@ -2868,37 +3227,38 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_convert_reply not on waiters");
-		goto out;
-	}
-
 	_receive_convert_reply(lkb, ms);
- out:
 	dlm_put_lkb(lkb);
 }
 
 static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
-	int error = ms->m_result;
+	int error;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
 	/* this is the value returned from do_unlock() on the master */
 
-	switch (error) {
+	switch (ms->m_result) {
 	case -DLM_EUNLOCK:
 		receive_flags_reply(lkb, ms);
 		remove_lock_pc(r, lkb);
 		queue_cast(r, lkb, -DLM_EUNLOCK);
 		break;
+	case -ENOENT:
+		break;
 	default:
-		log_error(r->res_ls, "receive_unlock_reply error %d", error);
+		log_error(r->res_ls, "receive_unlock_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
 	}
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 }
@@ -2915,37 +3275,39 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_unlock_reply not on waiters");
-		goto out;
-	}
-
 	_receive_unlock_reply(lkb, ms);
- out:
 	dlm_put_lkb(lkb);
 }
 
 static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 {
 	struct dlm_rsb *r = lkb->lkb_resource;
-	int error = ms->m_result;
+	int error;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
+	/* stub reply can happen with waiters_mutex held */
+	error = remove_from_waiters_ms(lkb, ms);
+	if (error)
+		goto out;
+
 	/* this is the value returned from do_cancel() on the master */
 
-	switch (error) {
+	switch (ms->m_result) {
 	case -DLM_ECANCEL:
 		receive_flags_reply(lkb, ms);
 		revert_lock_pc(r, lkb);
-		queue_cast(r, lkb, -DLM_ECANCEL);
+		if (ms->m_result)
+			queue_cast(r, lkb, -DLM_ECANCEL);
+		break;
+	case 0:
 		break;
 	default:
-		log_error(r->res_ls, "receive_cancel_reply error %d", error);
+		log_error(r->res_ls, "receive_cancel_reply %x error %d",
+			  lkb->lkb_id, ms->m_result);
 	}
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 }
@@ -2962,14 +3324,7 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	}
 	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_cancel_reply not on waiters");
-		goto out;
-	}
-
 	_receive_cancel_reply(lkb, ms);
- out:
 	dlm_put_lkb(lkb);
 }
 
@@ -2985,20 +3340,17 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		return;
 	}
 
-	error = remove_from_waiters(lkb);
-	if (error) {
-		log_error(ls, "receive_lookup_reply not on waiters");
-		goto out;
-	}
-
-	/* this is the value returned by dlm_dir_lookup on dir node
+	/* ms->m_result is the value returned by dlm_dir_lookup on dir node
 	   FIXME: will a non-zero error ever be returned? */
-	error = ms->m_result;
 
 	r = lkb->lkb_resource;
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
+	if (error)
+		goto out;
+
 	ret_nodeid = ms->m_nodeid;
 	if (ret_nodeid == dlm_our_nodeid()) {
 		r->res_nodeid = 0;
@@ -3009,14 +3361,22 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		r->res_nodeid = ret_nodeid;
 	}
 
+	if (is_overlap(lkb)) {
+		log_debug(ls, "receive_lookup_reply %x unlock %x",
+			  lkb->lkb_id, lkb->lkb_flags);
+		queue_cast_overlap(r, lkb);
+		unhold_lkb(lkb); /* undoes create_lkb() */
+		goto out_list;
+	}
+
 	_request_lock(r, lkb);
 
+ out_list:
 	if (!ret_nodeid)
 		process_lookup_list(r);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
- out:
 	dlm_put_lkb(lkb);
 }
 
@@ -3133,6 +3493,12 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 		receive_lookup_reply(ls, ms);
 		break;
 
+	/* other messages */
+
+	case DLM_MSG_PURGE:
+		receive_purge(ls, ms);
+		break;
+
 	default:
 		log_error(ls, "unknown message type %d", ms->m_type);
 	}
@@ -3153,9 +3519,9 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
+		ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
 		ls->ls_stub_ms.m_result = -EINPROGRESS;
 		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-		_remove_from_waiters(lkb);
 		_receive_convert_reply(lkb, &ls->ls_stub_ms);
 
 		/* Same special case as in receive_rcom_lock_args() */
@@ -3227,18 +3593,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
+			ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
 			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-			_remove_from_waiters(lkb);
 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
 
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
+			ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
 			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
-			_remove_from_waiters(lkb);
 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
@@ -3252,37 +3618,47 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 	mutex_unlock(&ls->ls_waiters_mutex);
 }
 
-static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
-	int rv = 0;
+	int found = 0;
 
 	mutex_lock(&ls->ls_waiters_mutex);
 	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
 		if (lkb->lkb_flags & DLM_IFL_RESEND) {
-			rv = lkb->lkb_wait_type;
-			_remove_from_waiters(lkb);
-			lkb->lkb_flags &= ~DLM_IFL_RESEND;
+			hold_lkb(lkb);
+			found = 1;
 			break;
 		}
 	}
 	mutex_unlock(&ls->ls_waiters_mutex);
 
-	if (!rv)
+	if (!found)
 		lkb = NULL;
-	*lkb_ret = lkb;
-	return rv;
+	return lkb;
 }
 
 /* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
    master or dir-node for r.  Processing the lkb may result in it being placed
    back on waiters. */
 
+/* We do this after normal locking has been enabled and any saved messages
+   (in requestqueue) have been processed.  We should be confident that at
+   this point we won't get or process a reply to any of these waiting
+   operations.  But, new ops may be coming in on the rsbs/locks here from
+   userspace or remotely. */
+
+/* there may have been an overlap unlock/cancel prior to recovery or after
+   recovery.  if before, the lkb may still have a pos wait_count; if after, the
+   overlap flag would just have been set and nothing new sent.  we can be
+   confident here than any replies to either the initial op or overlap ops
+   prior to recovery have been received. */
+
 int dlm_recover_waiters_post(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 	struct dlm_rsb *r;
-	int error = 0, mstype;
+	int error = 0, mstype, err, oc, ou;
 
 	while (1) {
 		if (dlm_locking_stopped(ls)) {
@@ -3291,48 +3667,78 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
 			break;
 		}
 
-		mstype = remove_resend_waiter(ls, &lkb);
-		if (!mstype)
+		lkb = find_resend_waiter(ls);
+		if (!lkb)
 			break;
 
 		r = lkb->lkb_resource;
+		hold_rsb(r);
+		lock_rsb(r);
+
+		mstype = lkb->lkb_wait_type;
+		oc = is_overlap_cancel(lkb);
+		ou = is_overlap_unlock(lkb);
+		err = 0;
 
 		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
 			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
 
-		switch (mstype) {
-
-		case DLM_MSG_LOOKUP:
-			hold_rsb(r);
-			lock_rsb(r);
-			_request_lock(r, lkb);
-			if (is_master(r))
-				confirm_master(r, 0);
-			unlock_rsb(r);
-			put_rsb(r);
-			break;
-
-		case DLM_MSG_REQUEST:
-			hold_rsb(r);
-			lock_rsb(r);
-			_request_lock(r, lkb);
-			if (is_master(r))
-				confirm_master(r, 0);
-			unlock_rsb(r);
-			put_rsb(r);
-			break;
-
-		case DLM_MSG_CONVERT:
-			hold_rsb(r);
-			lock_rsb(r);
-			_convert_lock(r, lkb);
-			unlock_rsb(r);
-			put_rsb(r);
-			break;
-
-		default:
-			log_error(ls, "recover_waiters_post type %d", mstype);
+		/* At this point we assume that we won't get a reply to any
+		   previous op or overlap op on this lock.  First, do a big
+		   remove_from_waiters() for all previous ops. */
+
+		lkb->lkb_flags &= ~DLM_IFL_RESEND;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_wait_type = 0;
+		lkb->lkb_wait_count = 0;
+		mutex_lock(&ls->ls_waiters_mutex);
+		list_del_init(&lkb->lkb_wait_reply);
+		mutex_unlock(&ls->ls_waiters_mutex);
+		unhold_lkb(lkb); /* for waiters list */
+
+		if (oc || ou) {
+			/* do an unlock or cancel instead of resending */
+			switch (mstype) {
+			case DLM_MSG_LOOKUP:
+			case DLM_MSG_REQUEST:
+				queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
+							-DLM_ECANCEL);
+				unhold_lkb(lkb); /* undoes create_lkb() */
+				break;
+			case DLM_MSG_CONVERT:
+				if (oc) {
+					queue_cast(r, lkb, -DLM_ECANCEL);
+				} else {
+					lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
+					_unlock_lock(r, lkb);
+				}
+				break;
+			default:
+				err = 1;
+			}
+		} else {
+			switch (mstype) {
+			case DLM_MSG_LOOKUP:
+			case DLM_MSG_REQUEST:
+				_request_lock(r, lkb);
+				if (is_master(r))
+					confirm_master(r, 0);
+				break;
+			case DLM_MSG_CONVERT:
+				_convert_lock(r, lkb);
+				break;
+			default:
+				err = 1;
+			}
 		}
+
+		if (err)
+			log_error(ls, "recover_waiters_post %x %d %x %d %d",
+			  	  lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou);
+		unlock_rsb(r);
+		put_rsb(r);
+		dlm_put_lkb(lkb);
 	}
 
 	return error;
@@ -3684,7 +4090,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 
 	/* add this new lkb to the per-process list of locks */
 	spin_lock(&ua->proc->locks_spin);
-	kref_get(&lkb->lkb_ref);
+	hold_lkb(lkb);
 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
 	spin_unlock(&ua->proc->locks_spin);
  out:
@@ -3774,6 +4180,9 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 	if (error == -DLM_EUNLOCK)
 		error = 0;
+	/* from validate_unlock_args() */
+	if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
+		error = 0;
 	if (error)
 		goto out_put;
 
@@ -3786,6 +4195,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	dlm_put_lkb(lkb);
  out:
 	unlock_recovery(ls);
+	kfree(ua_tmp);
 	return error;
 }
 
@@ -3815,33 +4225,37 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 	if (error == -DLM_ECANCEL)
 		error = 0;
-	if (error)
-		goto out_put;
-
-	/* this lkb was removed from the WAITING queue */
-	if (lkb->lkb_grmode == DLM_LOCK_IV) {
-		spin_lock(&ua->proc->locks_spin);
-		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
-		spin_unlock(&ua->proc->locks_spin);
-	}
+	/* from validate_unlock_args() */
+	if (error == -EBUSY)
+		error = 0;
  out_put:
 	dlm_put_lkb(lkb);
  out:
 	unlock_recovery(ls);
+	kfree(ua_tmp);
 	return error;
 }
 
+/* lkb's that are removed from the waiters list by revert are just left on the
+   orphans list with the granted orphan locks, to be freed by purge */
+
 static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
 	struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	struct dlm_args args;
+	int error;
 
-	if (ua->lksb.sb_lvbptr)
-		kfree(ua->lksb.sb_lvbptr);
-	kfree(ua);
-	lkb->lkb_astparam = (long)NULL;
+	hold_lkb(lkb);
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
+	mutex_unlock(&ls->ls_orphans_mutex);
 
-	/* TODO: propogate to master if needed */
-	return 0;
+	set_unlock_args(0, ua, &args);
+
+	error = cancel_lock(ls, lkb, &args);
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	return error;
 }
 
 /* The force flag allows the unlock to go ahead even if the lkb isn't granted.
@@ -3853,10 +4267,6 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 	struct dlm_args args;
 	int error;
 
-	/* FIXME: we need to handle the case where the lkb is in limbo
-	   while the rsb is being looked up, currently we assert in
-	   _unlock_lock/is_remote because rsb nodeid is -1. */
-
 	set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
 
 	error = unlock_lock(ls, lkb, &args);
@@ -3865,6 +4275,31 @@ static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
 	return error;
 }
 
+/* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
+   (which does lock_rsb) due to deadlock with receiving a message that does
+   lock_rsb followed by dlm_user_add_ast() */
+
+static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
+				     struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb = NULL;
+
+	mutex_lock(&ls->ls_clear_proc_locks);
+	if (list_empty(&proc->locks))
+		goto out;
+
+	lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
+	list_del_init(&lkb->lkb_ownqueue);
+
+	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
+		lkb->lkb_flags |= DLM_IFL_ORPHAN;
+	else
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+ out:
+	mutex_unlock(&ls->ls_clear_proc_locks);
+	return lkb;
+}
+
 /* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
    1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
    which we clear here. */
@@ -3880,18 +4315,15 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 	struct dlm_lkb *lkb, *safe;
 
 	lock_recovery(ls);
-	mutex_lock(&ls->ls_clear_proc_locks);
 
-	list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
-		list_del_init(&lkb->lkb_ownqueue);
-
-		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
-			lkb->lkb_flags |= DLM_IFL_ORPHAN;
+	while (1) {
+		lkb = del_proc_lock(ls, proc);
+		if (!lkb)
+			break;
+		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
 			orphan_proc_lock(ls, lkb);
-		} else {
-			lkb->lkb_flags |= DLM_IFL_DEAD;
+		else
 			unlock_proc_lock(ls, lkb);
-		}
 
 		/* this removes the reference for the proc->locks list
 		   added by dlm_user_request, it may result in the lkb
@@ -3900,6 +4332,8 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 		dlm_put_lkb(lkb);
 	}
 
+	mutex_lock(&ls->ls_clear_proc_locks);
+
 	/* in-progress unlocks */
 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
 		list_del_init(&lkb->lkb_ownqueue);
@@ -3916,3 +4350,92 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 	unlock_recovery(ls);
 }
 
+static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	while (1) {
+		lkb = NULL;
+		spin_lock(&proc->locks_spin);
+		if (!list_empty(&proc->locks)) {
+			lkb = list_entry(proc->locks.next, struct dlm_lkb,
+					 lkb_ownqueue);
+			list_del_init(&lkb->lkb_ownqueue);
+		}
+		spin_unlock(&proc->locks_spin);
+
+		if (!lkb)
+			break;
+
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		unlock_proc_lock(ls, lkb);
+		dlm_put_lkb(lkb); /* ref from proc->locks list */
+	}
+
+	spin_lock(&proc->locks_spin);
+	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
+		list_del_init(&lkb->lkb_ownqueue);
+		lkb->lkb_flags |= DLM_IFL_DEAD;
+		dlm_put_lkb(lkb);
+	}
+	spin_unlock(&proc->locks_spin);
+
+	spin_lock(&proc->asts_spin);
+	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+		list_del(&lkb->lkb_astqueue);
+		dlm_put_lkb(lkb);
+	}
+	spin_unlock(&proc->asts_spin);
+}
+
+/* pid of 0 means purge all orphans */
+
+static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	mutex_lock(&ls->ls_orphans_mutex);
+	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
+		if (pid && lkb->lkb_ownpid != pid)
+			continue;
+		unlock_proc_lock(ls, lkb);
+		list_del_init(&lkb->lkb_ownqueue);
+		dlm_put_lkb(lkb);
+	}
+	mutex_unlock(&ls->ls_orphans_mutex);
+}
+
+static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int error;
+
+	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
+				DLM_MSG_PURGE, &ms, &mh);
+	if (error)
+		return error;
+	ms->m_nodeid = nodeid;
+	ms->m_pid = pid;
+
+	return send_message(mh, ms);
+}
+
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+		   int nodeid, int pid)
+{
+	int error = 0;
+
+	if (nodeid != dlm_our_nodeid()) {
+		error = send_purge(ls, nodeid, pid);
+	} else {
+		lock_recovery(ls);
+		if (pid == current->pid)
+			purge_proc_locks(ls, proc);
+		else
+			do_purge(ls, nodeid, pid);
+		unlock_recovery(ls);
+	}
+	return error;
+}
+
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 0843a3073ec3..64fc4ec40668 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -41,6 +41,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	uint32_t flags, uint32_t lkid, char *lvb_in);
 int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
 	uint32_t flags, uint32_t lkid);
+int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
+	int nodeid, int pid);
 void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
 
 static inline int is_master(struct dlm_rsb *r)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f40817b53c6f..a677b2a5eed4 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -167,7 +167,6 @@ static struct kobj_type dlm_ktype = {
 };
 
 static struct kset dlm_kset = {
-	.subsys = &kernel_subsys,
 	.kobj   = {.name = "dlm",},
 	.ktype  = &dlm_ktype,
 };
@@ -218,6 +217,7 @@ int dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
+	kobj_set_kset_s(&dlm_kset, kernel_subsys);
 	error = kset_register(&dlm_kset);
 	if (error)
 		printk("dlm_lockspace_init: cannot register kset %d\n", error);
@@ -459,6 +459,8 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 
 	INIT_LIST_HEAD(&ls->ls_waiters);
 	mutex_init(&ls->ls_waiters_mutex);
+	INIT_LIST_HEAD(&ls->ls_orphans);
+	mutex_init(&ls->ls_orphans_mutex);
 
 	INIT_LIST_HEAD(&ls->ls_nodes);
 	INIT_LIST_HEAD(&ls->ls_nodes_gone);
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
deleted file mode 100644
index dc83a9d979b5..000000000000
--- a/fs/dlm/lowcomms-sctp.c
+++ /dev/null
@@ -1,1210 +0,0 @@
-/******************************************************************************
-*******************************************************************************
-**
-**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
-**
-**  This copyrighted material is made available to anyone wishing to use,
-**  modify, copy, or redistribute it subject to the terms and conditions
-**  of the GNU General Public License v.2.
-**
-*******************************************************************************
-******************************************************************************/
-
-/*
- * lowcomms.c
- *
- * This is the "low-level" comms layer.
- *
- * It is responsible for sending/receiving messages
- * from other nodes in the cluster.
- *
- * Cluster nodes are referred to by their nodeids. nodeids are
- * simply 32 bit numbers to the locking module - if they need to
- * be expanded for the cluster infrastructure then that is it's
- * responsibility. It is this layer's
- * responsibility to resolve these into IP address or
- * whatever it needs for inter-node communication.
- *
- * The comms level is two kernel threads that deal mainly with
- * the receiving of messages from other nodes and passing them
- * up to the mid-level comms layer (which understands the
- * message format) for execution by the locking core, and
- * a send thread which does all the setting up of connections
- * to remote nodes and the sending of data. Threads are not allowed
- * to send their own data because it may cause them to wait in times
- * of high load. Also, this way, the sending thread can collect together
- * messages bound for one node and send them in one block.
- *
- * I don't see any problem with the recv thread executing the locking
- * code on behalf of remote processes as the locking code is
- * short, efficient and never (well, hardly ever) waits.
- *
- */
-
-#include <asm/ioctls.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include <net/sctp/user.h>
-#include <linux/pagemap.h>
-#include <linux/socket.h>
-#include <linux/idr.h>
-
-#include "dlm_internal.h"
-#include "lowcomms.h"
-#include "config.h"
-#include "midcomms.h"
-
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
-static int			dlm_local_count;
-static int			dlm_local_nodeid;
-
-/* One of these per connected node */
-
-#define NI_INIT_PENDING 1
-#define NI_WRITE_PENDING 2
-
-struct nodeinfo {
-	spinlock_t		lock;
-	sctp_assoc_t		assoc_id;
-	unsigned long		flags;
-	struct list_head	write_list; /* nodes with pending writes */
-	struct list_head	writequeue; /* outgoing writequeue_entries */
-	spinlock_t		writequeue_lock;
-	int			nodeid;
-	struct work_struct      swork; /* Send workqueue */
-	struct work_struct      lwork; /* Locking workqueue */
-};
-
-static DEFINE_IDR(nodeinfo_idr);
-static DECLARE_RWSEM(nodeinfo_lock);
-static int max_nodeid;
-
-struct cbuf {
-	unsigned int base;
-	unsigned int len;
-	unsigned int mask;
-};
-
-/* Just the one of these, now. But this struct keeps
-   the connection-specific variables together */
-
-#define CF_READ_PENDING 1
-
-struct connection {
-	struct socket           *sock;
-	unsigned long		flags;
-	struct page             *rx_page;
-	atomic_t		waiting_requests;
-	struct cbuf		cb;
-	int                     eagain_flag;
-	struct work_struct      work; /* Send workqueue */
-};
-
-/* An entry waiting to be sent */
-
-struct writequeue_entry {
-	struct list_head	list;
-	struct page             *page;
-	int			offset;
-	int			len;
-	int			end;
-	int			users;
-	struct nodeinfo         *ni;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
-	cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
-	return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
-	cb->base = cb->len = 0;
-	cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
-	cb->len  -= n;
-	cb->base += n;
-	cb->base &= cb->mask;
-}
-
-/* List of nodes which have writes pending */
-static LIST_HEAD(write_nodes);
-static DEFINE_SPINLOCK(write_nodes_lock);
-
-
-/* Maximum number of incoming messages to process before
- * doing a schedule()
- */
-#define MAX_RX_MSG_COUNT 25
-
-/* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
-static struct workqueue_struct *lock_workqueue;
-
-/* The SCTP connection */
-static struct connection sctp_con;
-
-static void process_send_sockets(struct work_struct *work);
-static void process_recv_sockets(struct work_struct *work);
-static void process_lock_request(struct work_struct *work);
-
-static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
-{
-	struct sockaddr_storage addr;
-	int error;
-
-	if (!dlm_local_count)
-		return -1;
-
-	error = dlm_nodeid_to_addr(nodeid, &addr);
-	if (error)
-		return error;
-
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
-		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
-		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
-	} else {
-		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
-		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
-		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
-		       sizeof(in6->sin6_addr));
-	}
-
-	return 0;
-}
-
-/* If alloc is 0 here we will not attempt to allocate a new
-   nodeinfo struct */
-static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
-{
-	struct nodeinfo *ni;
-	int r;
-	int n;
-
-	down_read(&nodeinfo_lock);
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	up_read(&nodeinfo_lock);
-
-	if (ni || !alloc)
-		return ni;
-
-	down_write(&nodeinfo_lock);
-
-	ni = idr_find(&nodeinfo_idr, nodeid);
-	if (ni)
-		goto out_up;
-
-	r = idr_pre_get(&nodeinfo_idr, alloc);
-	if (!r)
-		goto out_up;
-
-	ni = kmalloc(sizeof(struct nodeinfo), alloc);
-	if (!ni)
-		goto out_up;
-
-	r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
-	if (r) {
-		kfree(ni);
-		ni = NULL;
-		goto out_up;
-	}
-	if (n != nodeid) {
-		idr_remove(&nodeinfo_idr, n);
-		kfree(ni);
-		ni = NULL;
-		goto out_up;
-	}
-	memset(ni, 0, sizeof(struct nodeinfo));
-	spin_lock_init(&ni->lock);
-	INIT_LIST_HEAD(&ni->writequeue);
-	spin_lock_init(&ni->writequeue_lock);
-	INIT_WORK(&ni->lwork, process_lock_request);
-	INIT_WORK(&ni->swork, process_send_sockets);
-	ni->nodeid = nodeid;
-
-	if (nodeid > max_nodeid)
-		max_nodeid = nodeid;
-out_up:
-	up_write(&nodeinfo_lock);
-
-	return ni;
-}
-
-/* Don't call this too often... */
-static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (ni && ni->assoc_id == assoc)
-			return ni;
-	}
-	return NULL;
-}
-
-/* Data or notification available on socket */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
-{
-	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
-		queue_work(recv_workqueue, &sctp_con.work);
-}
-
-
-/* Add the port number to an IP6 or 4 sockaddr and return the address length.
-   Also padd out the struct with zeros to make comparisons meaningful */
-
-static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
-			  int *addr_len)
-{
-	struct sockaddr_in *local4_addr;
-	struct sockaddr_in6 *local6_addr;
-
-	if (!dlm_local_count)
-		return;
-
-	if (!port) {
-		if (dlm_local_addr[0]->ss_family == AF_INET) {
-			local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
-			port = be16_to_cpu(local4_addr->sin_port);
-		} else {
-			local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
-			port = be16_to_cpu(local6_addr->sin6_port);
-		}
-	}
-
-	saddr->ss_family = dlm_local_addr[0]->ss_family;
-	if (dlm_local_addr[0]->ss_family == AF_INET) {
-		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
-		in4_addr->sin_port = cpu_to_be16(port);
-		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
-		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
-		       sizeof(struct sockaddr_in));
-		*addr_len = sizeof(struct sockaddr_in);
-	} else {
-		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
-		in6_addr->sin6_port = cpu_to_be16(port);
-		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
-		       sizeof(struct sockaddr_in6));
-		*addr_len = sizeof(struct sockaddr_in6);
-	}
-}
-
-/* Close the connection and tidy up */
-static void close_connection(void)
-{
-	if (sctp_con.sock) {
-		sock_release(sctp_con.sock);
-		sctp_con.sock = NULL;
-	}
-
-	if (sctp_con.rx_page) {
-		__free_page(sctp_con.rx_page);
-		sctp_con.rx_page = NULL;
-	}
-}
-
-/* We only send shutdown messages to nodes that are not part of the cluster */
-static void send_shutdown(sctp_assoc_t associd)
-{
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-
-	outmessage.msg_name = NULL;
-	outmessage.msg_namelen = 0;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-
-	sinfo->sinfo_flags |= MSG_EOF;
-	sinfo->sinfo_assoc_id = associd;
-
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
-
-	if (ret != 0)
-		log_print("send EOF to node failed: %d", ret);
-}
-
-
-/* INIT failed but we don't know which node...
-   restart INIT on all pending nodes */
-static void init_failed(void)
-{
-	int i;
-	struct nodeinfo *ni;
-
-	for (i=1; i<=max_nodeid; i++) {
-		ni = nodeid2nodeinfo(i, 0);
-		if (!ni)
-			continue;
-
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-			ni->assoc_id = 0;
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-	}
-}
-
-/* Something happened to an association */
-static void process_sctp_notification(struct msghdr *msg, char *buf)
-{
-	union sctp_notification *sn = (union sctp_notification *)buf;
-
-	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
-		switch (sn->sn_assoc_change.sac_state) {
-
-		case SCTP_COMM_UP:
-		case SCTP_RESTART:
-		{
-			/* Check that the new node is in the lockspace */
-			struct sctp_prim prim;
-			mm_segment_t fs;
-			int nodeid;
-			int prim_len, ret;
-			int addr_len;
-			struct nodeinfo *ni;
-
-			/* This seems to happen when we received a connection
-			 * too early... or something...  anyway, it happens but
-			 * we always seem to get a real message too, see
-			 * receive_from_sock */
-
-			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
-				log_print("COMM_UP for invalid assoc ID %d",
-					  (int)sn->sn_assoc_change.sac_assoc_id);
-				init_failed();
-				return;
-			}
-			memset(&prim, 0, sizeof(struct sctp_prim));
-			prim_len = sizeof(struct sctp_prim);
-			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			fs = get_fs();
-			set_fs(get_ds());
-			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
-							     IPPROTO_SCTP,
-							     SCTP_PRIMARY_ADDR,
-							     (char*)&prim,
-							     &prim_len);
-			set_fs(fs);
-			if (ret < 0) {
-				struct nodeinfo *ni;
-
-				log_print("getsockopt/sctp_primary_addr on "
-					  "new assoc %d failed : %d",
-					  (int)sn->sn_assoc_change.sac_assoc_id,
-					  ret);
-
-				/* Retry INIT later */
-				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-				if (ni)
-					clear_bit(NI_INIT_PENDING, &ni->flags);
-				return;
-			}
-			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
-			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
-				log_print("reject connect from unknown addr");
-				send_shutdown(prim.ssp_assoc_id);
-				return;
-			}
-
-			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-			if (!ni)
-				return;
-
-			/* Save the assoc ID */
-			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
-			log_print("got new/restarted association %d nodeid %d",
-				  (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
-
-			/* Send any pending writes */
-			clear_bit(NI_INIT_PENDING, &ni->flags);
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-		break;
-
-		case SCTP_COMM_LOST:
-		case SCTP_SHUTDOWN_COMP:
-		{
-			struct nodeinfo *ni;
-
-			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
-			if (ni) {
-				spin_lock(&ni->lock);
-				ni->assoc_id = 0;
-				spin_unlock(&ni->lock);
-			}
-		}
-		break;
-
-		/* We don't know which INIT failed, so clear the PENDING flags
-		 * on them all.  if assoc_id is zero then it will then try
-		 * again */
-
-		case SCTP_CANT_STR_ASSOC:
-		{
-			log_print("Can't start SCTP association - retrying");
-			init_failed();
-		}
-		break;
-
-		default:
-			log_print("unexpected SCTP assoc change id=%d state=%d",
-				  (int)sn->sn_assoc_change.sac_assoc_id,
-				  sn->sn_assoc_change.sac_state);
-		}
-	}
-}
-
-/* Data received from remote end */
-static int receive_from_sock(void)
-{
-	int ret = 0;
-	struct msghdr msg;
-	struct kvec iov[2];
-	unsigned len;
-	int r;
-	struct sctp_sndrcvinfo *sinfo;
-	struct cmsghdr *cmsg;
-	struct nodeinfo *ni;
-
-	/* These two are marginally too big for stack allocation, but this
-	 * function is (currently) only called by dlm_recvd so static should be
-	 * OK.
-	 */
-	static struct sockaddr_storage msgname;
-	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-
-	if (sctp_con.sock == NULL)
-		goto out;
-
-	if (sctp_con.rx_page == NULL) {
-		/*
-		 * This doesn't need to be atomic, but I think it should
-		 * improve performance if it is.
-		 */
-		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
-		if (sctp_con.rx_page == NULL)
-			goto out_resched;
-		cbuf_init(&sctp_con.cb, PAGE_CACHE_SIZE);
-	}
-
-	memset(&incmsg, 0, sizeof(incmsg));
-	memset(&msgname, 0, sizeof(msgname));
-
-	msg.msg_name = &msgname;
-	msg.msg_namelen = sizeof(msgname);
-	msg.msg_flags = 0;
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	msg.msg_iovlen = 1;
-
-	/* I don't see why this circular buffer stuff is necessary for SCTP
-	 * which is a packet-based protocol, but the whole thing breaks under
-	 * load without it! The overhead is minimal (and is in the TCP lowcomms
-	 * anyway, of course) so I'll leave it in until I can figure out what's
-	 * really happening.
-	 */
-
-	/*
-	 * iov[0] is the bit of the circular buffer between the current end
-	 * point (cb.base + cb.len) and the end of the buffer.
-	 */
-	iov[0].iov_len = sctp_con.cb.base - cbuf_data(&sctp_con.cb);
-	iov[0].iov_base = page_address(sctp_con.rx_page) +
-		cbuf_data(&sctp_con.cb);
-	iov[1].iov_len = 0;
-
-	/*
-	 * iov[1] is the bit of the circular buffer between the start of the
-	 * buffer and the start of the currently used section (cb.base)
-	 */
-	if (cbuf_data(&sctp_con.cb) >= sctp_con.cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&sctp_con.cb);
-		iov[1].iov_len = sctp_con.cb.base;
-		iov[1].iov_base = page_address(sctp_con.rx_page);
-		msg.msg_iovlen = 2;
-	}
-	len = iov[0].iov_len + iov[1].iov_len;
-
-	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
-				 MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		goto out_close;
-
-	msg.msg_control = incmsg;
-	msg.msg_controllen = sizeof(incmsg);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	sinfo = CMSG_DATA(cmsg);
-
-	if (msg.msg_flags & MSG_NOTIFICATION) {
-		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
-		return 0;
-	}
-
-	/* Is this a new association ? */
-	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
-	if (ni) {
-		ni->assoc_id = sinfo->sinfo_assoc_id;
-		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
-
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-				queue_work(send_workqueue, &ni->swork);
-			}
-		}
-	}
-
-	/* INIT sends a message with length of 1 - ignore it */
-	if (r == 1)
-		return 0;
-
-	cbuf_add(&sctp_con.cb, ret);
-	// PJC: TODO: Add to node's workqueue....can we ??
-	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
-					  page_address(sctp_con.rx_page),
-					  sctp_con.cb.base, sctp_con.cb.len,
-					  PAGE_CACHE_SIZE);
-	if (ret < 0)
-		goto out_close;
-	cbuf_eat(&sctp_con.cb, ret);
-
-out:
-	ret = 0;
-	goto out_ret;
-
-out_resched:
-	lowcomms_data_ready(sctp_con.sock->sk, 0);
-	ret = 0;
-	cond_resched();
-	goto out_ret;
-
-out_close:
-	if (ret != -EAGAIN)
-		log_print("error reading from sctp socket: %d", ret);
-out_ret:
-	return ret;
-}
-
-/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
-static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
-{
-	mm_segment_t fs;
-	int result = 0;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	if (num == 1)
-		result = sctp_con.sock->ops->bind(sctp_con.sock,
-						  (struct sockaddr *) addr,
-						  addr_len);
-	else
-		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
-							SCTP_SOCKOPT_BINDX_ADD,
-							(char *)addr, addr_len);
-	set_fs(fs);
-
-	if (result < 0)
-		log_print("Can't bind to port %d addr number %d",
-			  dlm_config.ci_tcp_port, num);
-
-	return result;
-}
-
-static void init_local(void)
-{
-	struct sockaddr_storage sas, *addr;
-	int i;
-
-	dlm_local_nodeid = dlm_our_nodeid();
-
-	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
-		if (dlm_our_addr(&sas, i))
-			break;
-
-		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
-		if (!addr)
-			break;
-		memcpy(addr, &sas, sizeof(*addr));
-		dlm_local_addr[dlm_local_count++] = addr;
-	}
-}
-
-/* Initialise SCTP socket and bind to all interfaces */
-static int init_sock(void)
-{
-	mm_segment_t fs;
-	struct socket *sock = NULL;
-	struct sockaddr_storage localaddr;
-	struct sctp_event_subscribe subscribe;
-	int result = -EINVAL, num = 1, i, addr_len;
-
-	if (!dlm_local_count) {
-		init_local();
-		if (!dlm_local_count) {
-			log_print("no local IP address has been set");
-			goto out;
-		}
-	}
-
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
-				  IPPROTO_SCTP, &sock);
-	if (result < 0) {
-		log_print("Can't create comms socket, check SCTP is loaded");
-		goto out;
-	}
-
-	/* Listen for events */
-	memset(&subscribe, 0, sizeof(subscribe));
-	subscribe.sctp_data_io_event = 1;
-	subscribe.sctp_association_event = 1;
-	subscribe.sctp_send_failure_event = 1;
-	subscribe.sctp_shutdown_event = 1;
-	subscribe.sctp_partial_delivery_event = 1;
-
-	fs = get_fs();
-	set_fs(get_ds());
-	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
-				       (char *)&subscribe, sizeof(subscribe));
-	set_fs(fs);
-
-	if (result < 0) {
-		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
-			  result);
-		goto create_delsock;
-	}
-
-	/* Init con struct */
-	sock->sk->sk_user_data = &sctp_con;
-	sctp_con.sock = sock;
-	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
-
-	/* Bind to all interfaces. */
-	for (i = 0; i < dlm_local_count; i++) {
-		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
-		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
-
-		result = add_bind_addr(&localaddr, addr_len, num);
-		if (result)
-			goto create_delsock;
-		++num;
-	}
-
-	result = sock->ops->listen(sock, 5);
-	if (result < 0) {
-		log_print("Can't set socket listening");
-		goto create_delsock;
-	}
-
-	return 0;
-
-create_delsock:
-	sock_release(sock);
-	sctp_con.sock = NULL;
-out:
-	return result;
-}
-
-
-static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
-{
-	struct writequeue_entry *entry;
-
-	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
-	if (!entry)
-		return NULL;
-
-	entry->page = alloc_page(allocation);
-	if (!entry->page) {
-		kfree(entry);
-		return NULL;
-	}
-
-	entry->offset = 0;
-	entry->len = 0;
-	entry->end = 0;
-	entry->users = 0;
-
-	return entry;
-}
-
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
-{
-	struct writequeue_entry *e;
-	int offset = 0;
-	int users = 0;
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, allocation);
-	if (!ni)
-		return NULL;
-
-	spin_lock(&ni->writequeue_lock);
-	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
-	if ((&e->list == &ni->writequeue) ||
-	    (PAGE_CACHE_SIZE - e->end < len)) {
-		e = NULL;
-	} else {
-		offset = e->end;
-		e->end += len;
-		users = e->users++;
-	}
-	spin_unlock(&ni->writequeue_lock);
-
-	if (e) {
-	got_one:
-		if (users == 0)
-			kmap(e->page);
-		*ppc = page_address(e->page) + offset;
-		return e;
-	}
-
-	e = new_writequeue_entry(allocation);
-	if (e) {
-		spin_lock(&ni->writequeue_lock);
-		offset = e->end;
-		e->end += len;
-		e->ni = ni;
-		users = e->users++;
-		list_add_tail(&e->list, &ni->writequeue);
-		spin_unlock(&ni->writequeue_lock);
-		goto got_one;
-	}
-	return NULL;
-}
-
-void dlm_lowcomms_commit_buffer(void *arg)
-{
-	struct writequeue_entry *e = (struct writequeue_entry *) arg;
-	int users;
-	struct nodeinfo *ni = e->ni;
-
-	spin_lock(&ni->writequeue_lock);
-	users = --e->users;
-	if (users)
-		goto out;
-	e->len = e->end - e->offset;
-	kunmap(e->page);
-	spin_unlock(&ni->writequeue_lock);
-
-	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-		spin_lock_bh(&write_nodes_lock);
-		list_add_tail(&ni->write_list, &write_nodes);
-		spin_unlock_bh(&write_nodes_lock);
-
-		queue_work(send_workqueue, &ni->swork);
-	}
-	return;
-
-out:
-	spin_unlock(&ni->writequeue_lock);
-	return;
-}
-
-static void free_entry(struct writequeue_entry *e)
-{
-	__free_page(e->page);
-	kfree(e);
-}
-
-/* Initiate an SCTP association. In theory we could just use sendmsg() on
-   the first IP address and it should work, but this allows us to set up the
-   association before sending any valuable data that we can't afford to lose.
-   It also keeps the send path clean as it can now always use the association ID */
-static void initiate_association(int nodeid)
-{
-	struct sockaddr_storage rem_addr;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct msghdr outmessage;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int ret;
-	int addrlen;
-	char buf[1];
-	struct kvec iov[1];
-	struct nodeinfo *ni;
-
-	log_print("Initiating association with node %d", nodeid);
-
-	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
-	if (!ni)
-		return;
-
-	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
-		log_print("no address for nodeid %d", nodeid);
-		return;
-	}
-
-	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
-
-	outmessage.msg_name = &rem_addr;
-	outmessage.msg_namelen = addrlen;
-	outmessage.msg_control = outcmsg;
-	outmessage.msg_controllen = sizeof(outcmsg);
-	outmessage.msg_flags = MSG_EOR;
-
-	iov[0].iov_base = buf;
-	iov[0].iov_len = 1;
-
-	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
-	   we can afford to lose */
-	cmsg = CMSG_FIRSTHDR(&outmessage);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-
-	outmessage.msg_controllen = cmsg->cmsg_len;
-	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
-	if (ret < 0) {
-		log_print("send INIT to node failed: %d", ret);
-		/* Try again later */
-		clear_bit(NI_INIT_PENDING, &ni->flags);
-	}
-}
-
-/* Send a message */
-static void send_to_sock(struct nodeinfo *ni)
-{
-	int ret = 0;
-	struct writequeue_entry *e;
-	int len, offset;
-	struct msghdr outmsg;
-	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	struct kvec iov;
-
-	/* See if we need to init an association before we start
-	   sending precious messages */
-	spin_lock(&ni->lock);
-	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-		return;
-	}
-	spin_unlock(&ni->lock);
-
-	outmsg.msg_name = NULL; /* We use assoc_id */
-	outmsg.msg_namelen = 0;
-	outmsg.msg_control = outcmsg;
-	outmsg.msg_controllen = sizeof(outcmsg);
-	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
-
-	cmsg = CMSG_FIRSTHDR(&outmsg);
-	cmsg->cmsg_level = IPPROTO_SCTP;
-	cmsg->cmsg_type = SCTP_SNDRCV;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
-	sinfo = CMSG_DATA(cmsg);
-	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
-	sinfo->sinfo_assoc_id = ni->assoc_id;
-	outmsg.msg_controllen = cmsg->cmsg_len;
-
-	spin_lock(&ni->writequeue_lock);
-	for (;;) {
-		if (list_empty(&ni->writequeue))
-			break;
-		e = list_entry(ni->writequeue.next, struct writequeue_entry,
-			       list);
-		len = e->len;
-		offset = e->offset;
-		BUG_ON(len == 0 && e->users == 0);
-		spin_unlock(&ni->writequeue_lock);
-		kmap(e->page);
-
-		ret = 0;
-		if (len) {
-			iov.iov_base = page_address(e->page)+offset;
-			iov.iov_len = len;
-
-			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
-					     len);
-			if (ret == -EAGAIN) {
-				sctp_con.eagain_flag = 1;
-				goto out;
-			} else if (ret < 0)
-				goto send_error;
-		} else {
-			/* Don't starve people filling buffers */
-			cond_resched();
-		}
-
-		spin_lock(&ni->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			kunmap(e->page);
-			free_entry(e);
-			continue;
-		}
-	}
-	spin_unlock(&ni->writequeue_lock);
-out:
-	return;
-
-send_error:
-	log_print("Error sending to node %d %d", ni->nodeid, ret);
-	spin_lock(&ni->lock);
-	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
-		ni->assoc_id = 0;
-		spin_unlock(&ni->lock);
-		initiate_association(ni->nodeid);
-	} else
-		spin_unlock(&ni->lock);
-
-	return;
-}
-
-/* Try to send any messages that are pending */
-static void process_output_queue(void)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock_bh(&write_nodes_lock);
-	list_for_each_safe(list, temp, &write_nodes) {
-		struct nodeinfo *ni =
-			list_entry(list, struct nodeinfo, write_list);
-		clear_bit(NI_WRITE_PENDING, &ni->flags);
-		list_del(&ni->write_list);
-
-		spin_unlock_bh(&write_nodes_lock);
-
-		send_to_sock(ni);
-		spin_lock_bh(&write_nodes_lock);
-	}
-	spin_unlock_bh(&write_nodes_lock);
-}
-
-/* Called after we've had -EAGAIN and been woken up */
-static void refill_write_queue(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-
-		if (ni) {
-			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
-				spin_lock_bh(&write_nodes_lock);
-				list_add_tail(&ni->write_list, &write_nodes);
-				spin_unlock_bh(&write_nodes_lock);
-			}
-		}
-	}
-}
-
-static void clean_one_writequeue(struct nodeinfo *ni)
-{
-	struct list_head *list;
-	struct list_head *temp;
-
-	spin_lock(&ni->writequeue_lock);
-	list_for_each_safe(list, temp, &ni->writequeue) {
-		struct writequeue_entry *e =
-			list_entry(list, struct writequeue_entry, list);
-		list_del(&e->list);
-		free_entry(e);
-	}
-	spin_unlock(&ni->writequeue_lock);
-}
-
-static void clean_writequeues(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni)
-			clean_one_writequeue(ni);
-	}
-}
-
-
-static void dealloc_nodeinfo(void)
-{
-	int i;
-
-	for (i=1; i<=max_nodeid; i++) {
-		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
-		if (ni) {
-			idr_remove(&nodeinfo_idr, i);
-			kfree(ni);
-		}
-	}
-}
-
-int dlm_lowcomms_close(int nodeid)
-{
-	struct nodeinfo *ni;
-
-	ni = nodeid2nodeinfo(nodeid, 0);
-	if (!ni)
-		return -1;
-
-	spin_lock(&ni->lock);
-	if (ni->assoc_id) {
-		ni->assoc_id = 0;
-		/* Don't send shutdown here, sctp will just queue it
-		   till the node comes back up! */
-	}
-	spin_unlock(&ni->lock);
-
-	clean_one_writequeue(ni);
-	clear_bit(NI_INIT_PENDING, &ni->flags);
-	return 0;
-}
-
-// PJC: The work queue function for receiving.
-static void process_recv_sockets(struct work_struct *work)
-{
-	if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
-		int ret;
-		int count = 0;
-
-		do {
-			ret = receive_from_sock();
-
-			/* Don't starve out everyone else */
-			if (++count >= MAX_RX_MSG_COUNT) {
-				cond_resched();
-				count = 0;
-			}
-		} while (!kthread_should_stop() && ret >=0);
-	}
-	cond_resched();
-}
-
-// PJC: the work queue function for sending
-static void process_send_sockets(struct work_struct *work)
-{
-	if (sctp_con.eagain_flag) {
-		sctp_con.eagain_flag = 0;
-		refill_write_queue();
-	}
-	process_output_queue();
-}
-
-// PJC: Process lock requests from a particular node.
-// TODO: can we optimise this out on UP ??
-static void process_lock_request(struct work_struct *work)
-{
-}
-
-static void daemons_stop(void)
-{
-	destroy_workqueue(recv_workqueue);
-	destroy_workqueue(send_workqueue);
-	destroy_workqueue(lock_workqueue);
-}
-
-static int daemons_start(void)
-{
-	int error;
-	recv_workqueue = create_workqueue("dlm_recv");
-	error = IS_ERR(recv_workqueue);
-	if (error) {
-		log_print("can't start dlm_recv %d", error);
-		return error;
-	}
-
-	send_workqueue = create_singlethread_workqueue("dlm_send");
-	error = IS_ERR(send_workqueue);
-	if (error) {
-		log_print("can't start dlm_send %d", error);
-		destroy_workqueue(recv_workqueue);
-		return error;
-	}
-
-	lock_workqueue = create_workqueue("dlm_rlock");
-	error = IS_ERR(lock_workqueue);
-	if (error) {
-		log_print("can't start dlm_rlock %d", error);
-		destroy_workqueue(send_workqueue);
-		destroy_workqueue(recv_workqueue);
-		return error;
-	}
-
-	return 0;
-}
-
-/*
- * This is quite likely to sleep...
- */
-int dlm_lowcomms_start(void)
-{
-	int error;
-
-	INIT_WORK(&sctp_con.work, process_recv_sockets);
-
-	error = init_sock();
-	if (error)
-		goto fail_sock;
-	error = daemons_start();
-	if (error)
-		goto fail_sock;
-	return 0;
-
-fail_sock:
-	close_connection();
-	return error;
-}
-
-void dlm_lowcomms_stop(void)
-{
-	int i;
-
-	sctp_con.flags = 0x7;
-	daemons_stop();
-	clean_writequeues();
-	close_connection();
-	dealloc_nodeinfo();
-	max_nodeid = 0;
-
-	dlm_local_count = 0;
-	dlm_local_nodeid = 0;
-
-	for (i = 0; i < dlm_local_count; i++)
-		kfree(dlm_local_addr[i]);
-}
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms.c
index 07e0a122c32f..27970a58d29b 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms.c
@@ -36,30 +36,36 @@
  * of high load. Also, this way, the sending thread can collect together
  * messages bound for one node and send them in one block.
  *
- * I don't see any problem with the recv thread executing the locking
- * code on behalf of remote processes as the locking code is
- * short, efficient and never waits.
+ * lowcomms will choose to use wither TCP or SCTP as its transport layer
+ * depending on the configuration variable 'protocol'. This should be set
+ * to 0 (default) for TCP or 1 for SCTP. It shouldbe configured using a
+ * cluster-wide mechanism as it must be the same on all nodes of the cluster
+ * for the DLM to function.
  *
  */
 
-
 #include <asm/ioctls.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/pagemap.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/sctp.h>
+#include <net/sctp/user.h>
 
 #include "dlm_internal.h"
 #include "lowcomms.h"
 #include "midcomms.h"
 #include "config.h"
 
+#define NEEDED_RMEM (4*1024*1024)
+
 struct cbuf {
 	unsigned int base;
 	unsigned int len;
 	unsigned int mask;
 };
 
-#define NODE_INCREMENT 32
 static void cbuf_add(struct cbuf *cb, int n)
 {
 	cb->len += n;
@@ -88,28 +94,25 @@ static bool cbuf_empty(struct cbuf *cb)
 	return cb->len == 0;
 }
 
-/* Maximum number of incoming messages to process before
-   doing a cond_resched()
-*/
-#define MAX_RX_MSG_COUNT 25
-
 struct connection {
 	struct socket *sock;	/* NULL if not connected */
 	uint32_t nodeid;	/* So we know who we are in the list */
 	struct mutex sock_mutex;
-	unsigned long flags;	/* bit 1,2 = We are on the read/write lists */
+	unsigned long flags;
 #define CF_READ_PENDING 1
 #define CF_WRITE_PENDING 2
 #define CF_CONNECT_PENDING 3
-#define CF_IS_OTHERCON 4
+#define CF_INIT_PENDING 4
+#define CF_IS_OTHERCON 5
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
-	struct list_head listenlist;  /* List of allocated listening sockets */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
+	void (*connect_action) (struct connection *);	/* What to do to connect */
 	struct page *rx_page;
 	struct cbuf cb;
 	int retries;
 #define MAX_CONNECT_RETRIES 3
+	int sctp_assoc;
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
@@ -127,68 +130,136 @@ struct writequeue_entry {
 	struct connection *con;
 };
 
-static struct sockaddr_storage dlm_local_addr;
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int dlm_local_count;
 
 /* Work queues */
 static struct workqueue_struct *recv_workqueue;
 static struct workqueue_struct *send_workqueue;
 
-/* An array of pointers to connections, indexed by NODEID */
-static struct connection **connections;
+static DEFINE_IDR(connections_idr);
 static DECLARE_MUTEX(connections_lock);
+static int max_nodeid;
 static struct kmem_cache *con_cache;
-static int conn_array_size;
 
 static void process_recv_sockets(struct work_struct *work);
 static void process_send_sockets(struct work_struct *work);
 
-static struct connection *nodeid2con(int nodeid, gfp_t allocation)
+/*
+ * If 'allocation' is zero then we don't attempt to create a new
+ * connection structure for this node.
+ */
+static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 {
 	struct connection *con = NULL;
+	int r;
+	int n;
 
-	down(&connections_lock);
-	if (nodeid >= conn_array_size) {
-		int new_size = nodeid + NODE_INCREMENT;
-		struct connection **new_conns;
+	con = idr_find(&connections_idr, nodeid);
+	if (con || !alloc)
+		return con;
 
-		new_conns = kzalloc(sizeof(struct connection *) *
-				    new_size, allocation);
-		if (!new_conns)
-			goto finish;
+	r = idr_pre_get(&connections_idr, alloc);
+	if (!r)
+		return NULL;
+
+	con = kmem_cache_zalloc(con_cache, alloc);
+	if (!con)
+		return NULL;
 
-		memcpy(new_conns, connections,  sizeof(struct connection *) * conn_array_size);
-		conn_array_size = new_size;
-		kfree(connections);
-		connections = new_conns;
+	r = idr_get_new_above(&connections_idr, con, nodeid, &n);
+	if (r) {
+		kmem_cache_free(con_cache, con);
+		return NULL;
+	}
 
+	if (n != nodeid) {
+		idr_remove(&connections_idr, n);
+		kmem_cache_free(con_cache, con);
+		return NULL;
 	}
 
-	con = connections[nodeid];
-	if (con == NULL && allocation) {
-		con = kmem_cache_zalloc(con_cache, allocation);
-		if (!con)
-			goto finish;
+	con->nodeid = nodeid;
+	mutex_init(&con->sock_mutex);
+	INIT_LIST_HEAD(&con->writequeue);
+	spin_lock_init(&con->writequeue_lock);
+	INIT_WORK(&con->swork, process_send_sockets);
+	INIT_WORK(&con->rwork, process_recv_sockets);
 
-		con->nodeid = nodeid;
-		mutex_init(&con->sock_mutex);
-		INIT_LIST_HEAD(&con->writequeue);
-		spin_lock_init(&con->writequeue_lock);
-		INIT_WORK(&con->swork, process_send_sockets);
-		INIT_WORK(&con->rwork, process_recv_sockets);
+	/* Setup action pointers for child sockets */
+	if (con->nodeid) {
+		struct connection *zerocon = idr_find(&connections_idr, 0);
 
-		connections[nodeid] = con;
+		con->connect_action = zerocon->connect_action;
+		if (!con->rx_action)
+			con->rx_action = zerocon->rx_action;
 	}
 
-finish:
+	if (nodeid > max_nodeid)
+		max_nodeid = nodeid;
+
+	return con;
+}
+
+static struct connection *nodeid2con(int nodeid, gfp_t allocation)
+{
+	struct connection *con;
+
+	down(&connections_lock);
+	con = __nodeid2con(nodeid, allocation);
 	up(&connections_lock);
+
 	return con;
 }
 
+/* This is a bit drastic, but only called when things go wrong */
+static struct connection *assoc2con(int assoc_id)
+{
+	int i;
+	struct connection *con;
+
+	down(&connections_lock);
+	for (i=0; i<=max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (con && con->sctp_assoc == assoc_id) {
+			up(&connections_lock);
+			return con;
+		}
+	}
+	up(&connections_lock);
+	return NULL;
+}
+
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+	struct sockaddr_storage addr;
+	int error;
+
+	if (!dlm_local_count)
+		return -1;
+
+	error = dlm_nodeid_to_addr(nodeid, &addr);
+	if (error)
+		return error;
+
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+		struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
+		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+	} else {
+		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
+		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
+		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
+		       sizeof(in6->sin6_addr));
+	}
+
+	return 0;
+}
+
 /* Data available on socket or listen socket received a connect */
 static void lowcomms_data_ready(struct sock *sk, int count_unused)
 {
 	struct connection *con = sock2con(sk);
-
 	if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
 		queue_work(recv_workqueue, &con->rwork);
 }
@@ -222,20 +293,21 @@ static int add_sock(struct socket *sock, struct connection *con)
 	con->sock->sk->sk_data_ready = lowcomms_data_ready;
 	con->sock->sk->sk_write_space = lowcomms_write_space;
 	con->sock->sk->sk_state_change = lowcomms_state_change;
-
+	con->sock->sk->sk_user_data = con;
 	return 0;
 }
 
-/* Add the port number to an IP6 or 4 sockaddr and return the address
+/* Add the port number to an IPv6 or 4 sockaddr and return the address
    length */
 static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 			  int *addr_len)
 {
-	saddr->ss_family =  dlm_local_addr.ss_family;
+	saddr->ss_family =  dlm_local_addr[0]->ss_family;
 	if (saddr->ss_family == AF_INET) {
 		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
 		in4_addr->sin_port = cpu_to_be16(port);
 		*addr_len = sizeof(struct sockaddr_in);
+		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
 	} else {
 		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
 		in6_addr->sin6_port = cpu_to_be16(port);
@@ -264,6 +336,193 @@ static void close_connection(struct connection *con, bool and_other)
 	mutex_unlock(&con->sock_mutex);
 }
 
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void sctp_send_shutdown(sctp_assoc_t associd)
+{
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+	struct connection *con;
+
+	con = nodeid2con(0,0);
+	BUG_ON(con == NULL);
+
+	outmessage.msg_name = NULL;
+	outmessage.msg_namelen = 0;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo = CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+	sinfo->sinfo_flags |= MSG_EOF;
+	sinfo->sinfo_assoc_id = associd;
+
+	ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
+
+	if (ret != 0)
+		log_print("send EOF to node failed: %d", ret);
+}
+
+/* INIT failed but we don't know which node...
+   restart INIT on all pending nodes */
+static void sctp_init_failed(void)
+{
+	int i;
+	struct connection *con;
+
+	down(&connections_lock);
+	for (i=1; i<=max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (!con)
+			continue;
+		con->sctp_assoc = 0;
+		if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+			if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
+				queue_work(send_workqueue, &con->swork);
+			}
+		}
+	}
+	up(&connections_lock);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct connection *con,
+				      struct msghdr *msg, char *buf)
+{
+	union sctp_notification *sn = (union sctp_notification *)buf;
+
+	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+		switch (sn->sn_assoc_change.sac_state) {
+
+		case SCTP_COMM_UP:
+		case SCTP_RESTART:
+		{
+			/* Check that the new node is in the lockspace */
+			struct sctp_prim prim;
+			int nodeid;
+			int prim_len, ret;
+			int addr_len;
+			struct connection *new_con;
+			struct file *file;
+			sctp_peeloff_arg_t parg;
+			int parglen = sizeof(parg);
+
+			/*
+			 * We get this before any data for an association.
+			 * We verify that the node is in the cluster and
+			 * then peel off a socket for it.
+			 */
+			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+				log_print("COMM_UP for invalid assoc ID %d",
+					 (int)sn->sn_assoc_change.sac_assoc_id);
+				sctp_init_failed();
+				return;
+			}
+			memset(&prim, 0, sizeof(struct sctp_prim));
+			prim_len = sizeof(struct sctp_prim);
+			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+			ret = kernel_getsockopt(con->sock,
+						IPPROTO_SCTP,
+						SCTP_PRIMARY_ADDR,
+						(char*)&prim,
+						&prim_len);
+			if (ret < 0) {
+				log_print("getsockopt/sctp_primary_addr on "
+					  "new assoc %d failed : %d",
+					  (int)sn->sn_assoc_change.sac_assoc_id,
+					  ret);
+
+				/* Retry INIT later */
+				new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
+				if (new_con)
+					clear_bit(CF_CONNECT_PENDING, &con->flags);
+				return;
+			}
+			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+				int i;
+				unsigned char *b=(unsigned char *)&prim.ssp_addr;
+				log_print("reject connect from unknown addr");
+				for (i=0; i<sizeof(struct sockaddr_storage);i++)
+					printk("%02x ", b[i]);
+				printk("\n");
+				sctp_send_shutdown(prim.ssp_assoc_id);
+				return;
+			}
+
+			new_con = nodeid2con(nodeid, GFP_KERNEL);
+			if (!new_con)
+				return;
+
+			/* Peel off a new sock */
+			parg.associd = sn->sn_assoc_change.sac_assoc_id;
+			ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
+						SCTP_SOCKOPT_PEELOFF,
+						(void *)&parg, &parglen);
+			if (ret) {
+				log_print("Can't peel off a socket for "
+					  "connection %d to node %d: err=%d\n",
+					  parg.associd, nodeid, ret);
+			}
+			file = fget(parg.sd);
+			new_con->sock = SOCKET_I(file->f_dentry->d_inode);
+			add_sock(new_con->sock, new_con);
+			fput(file);
+			put_unused_fd(parg.sd);
+
+			log_print("got new/restarted association %d nodeid %d",
+				 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+			/* Send any pending writes */
+			clear_bit(CF_CONNECT_PENDING, &new_con->flags);
+			clear_bit(CF_INIT_PENDING, &con->flags);
+			if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
+				queue_work(send_workqueue, &new_con->swork);
+			}
+			if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
+				queue_work(recv_workqueue, &new_con->rwork);
+		}
+		break;
+
+		case SCTP_COMM_LOST:
+		case SCTP_SHUTDOWN_COMP:
+		{
+			con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
+			if (con) {
+				con->sctp_assoc = 0;
+			}
+		}
+		break;
+
+		/* We don't know which INIT failed, so clear the PENDING flags
+		 * on them all.  if assoc_id is zero then it will then try
+		 * again */
+
+		case SCTP_CANT_STR_ASSOC:
+		{
+			log_print("Can't start SCTP association - retrying");
+			sctp_init_failed();
+		}
+		break;
+
+		default:
+			log_print("unexpected SCTP assoc change id=%d state=%d",
+				  (int)sn->sn_assoc_change.sac_assoc_id,
+				  sn->sn_assoc_change.sac_state);
+		}
+	}
+}
+
 /* Data received from remote end */
 static int receive_from_sock(struct connection *con)
 {
@@ -274,6 +533,7 @@ static int receive_from_sock(struct connection *con)
 	int r;
 	int call_again_soon = 0;
 	int nvec;
+	char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
 
 	mutex_lock(&con->sock_mutex);
 
@@ -293,12 +553,18 @@ static int receive_from_sock(struct connection *con)
 		cbuf_init(&con->cb, PAGE_CACHE_SIZE);
 	}
 
+	/* Only SCTP needs these really */
+	memset(&incmsg, 0, sizeof(incmsg));
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+
 	/*
 	 * iov[0] is the bit of the circular buffer between the current end
 	 * point (cb.base + cb.len) and the end of the buffer.
 	 */
 	iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
 	iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
+	iov[1].iov_len = 0;
 	nvec = 1;
 
 	/*
@@ -315,11 +581,20 @@ static int receive_from_sock(struct connection *con)
 
 	r = ret = kernel_recvmsg(con->sock, &msg, iov, nvec, len,
 			       MSG_DONTWAIT | MSG_NOSIGNAL);
-
 	if (ret <= 0)
 		goto out_close;
-	if (ret == -EAGAIN)
-		goto out_resched;
+
+	/* Process SCTP notifications */
+	if (msg.msg_flags & MSG_NOTIFICATION) {
+		msg.msg_control = incmsg;
+		msg.msg_controllen = sizeof(incmsg);
+
+		process_sctp_notification(con, &msg,
+				page_address(con->rx_page) + con->cb.base);
+		mutex_unlock(&con->sock_mutex);
+		return 0;
+	}
+	BUG_ON(con->nodeid == 0);
 
 	if (ret == len)
 		call_again_soon = 1;
@@ -329,10 +604,10 @@ static int receive_from_sock(struct connection *con)
 					  con->cb.base, con->cb.len,
 					  PAGE_CACHE_SIZE);
 	if (ret == -EBADMSG) {
-		printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
-		       "iov_len=%u, iov_base[0]=%p, read=%d\n",
-		       page_address(con->rx_page), con->cb.base, con->cb.len,
-		       len, iov[0].iov_base, r);
+		log_print("lowcomms: addr=%p, base=%u, len=%u, "
+			  "iov_len=%u, iov_base[0]=%p, read=%d",
+			  page_address(con->rx_page), con->cb.base, con->cb.len,
+			  len, iov[0].iov_base, r);
 	}
 	if (ret < 0)
 		goto out_close;
@@ -368,7 +643,7 @@ out_close:
 }
 
 /* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct connection *con)
+static int tcp_accept_from_sock(struct connection *con)
 {
 	int result;
 	struct sockaddr_storage peeraddr;
@@ -379,7 +654,7 @@ static int accept_from_sock(struct connection *con)
 	struct connection *addcon;
 
 	memset(&peeraddr, 0, sizeof(peeraddr));
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
 				  IPPROTO_TCP, &newsock);
 	if (result < 0)
 		return -ENOMEM;
@@ -408,7 +683,7 @@ static int accept_from_sock(struct connection *con)
 	/* Get the new node's NODEID */
 	make_sockaddr(&peeraddr, 0, &len);
 	if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
-		printk("dlm: connect from non cluster node\n");
+		log_print("connect from non cluster node");
 		sock_release(newsock);
 		mutex_unlock(&con->sock_mutex);
 		return -1;
@@ -419,7 +694,6 @@ static int accept_from_sock(struct connection *con)
 	/*  Check to see if we already have a connection to this node. This
 	 *  could happen if the two nodes initiate a connection at roughly
 	 *  the same time and the connections cross on the wire.
-	 * TEMPORARY FIX:
 	 *  In this case we store the incoming one in "othercon"
 	 */
 	newcon = nodeid2con(nodeid, GFP_KERNEL);
@@ -434,7 +708,7 @@ static int accept_from_sock(struct connection *con)
 		if (!othercon) {
 			othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
 			if (!othercon) {
-				printk("dlm: failed to allocate incoming socket\n");
+				log_print("failed to allocate incoming socket");
 				mutex_unlock(&newcon->sock_mutex);
 				result = -ENOMEM;
 				goto accept_err;
@@ -477,12 +751,107 @@ accept_err:
 	sock_release(newsock);
 
 	if (result != -EAGAIN)
-		printk("dlm: error accepting connection from node: %d\n", result);
+		log_print("error accepting connection from node: %d", result);
 	return result;
 }
 
+static void free_entry(struct writequeue_entry *e)
+{
+	__free_page(e->page);
+	kfree(e);
+}
+
+/* Initiate an SCTP association.
+   This is a special case of send_to_sock() in that we don't yet have a
+   peeled-off socket for this association, so we use the listening socket
+   and add the primary IP address of the remote node.
+ */
+static void sctp_init_assoc(struct connection *con)
+{
+	struct sockaddr_storage rem_addr;
+	char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	struct connection *base_con;
+	struct writequeue_entry *e;
+	int len, offset;
+	int ret;
+	int addrlen;
+	struct kvec iov[1];
+
+	if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
+		return;
+
+	if (con->retries++ > MAX_CONNECT_RETRIES)
+		return;
+
+	log_print("Initiating association with node %d", con->nodeid);
+
+	if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
+		log_print("no address for nodeid %d", con->nodeid);
+		return;
+	}
+	base_con = nodeid2con(0, 0);
+	BUG_ON(base_con == NULL);
+
+	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
+
+	outmessage.msg_name = &rem_addr;
+	outmessage.msg_namelen = addrlen;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	spin_lock(&con->writequeue_lock);
+	e = list_entry(con->writequeue.next, struct writequeue_entry,
+		       list);
+
+	BUG_ON((struct list_head *) e == &con->writequeue);
+
+	len = e->len;
+	offset = e->offset;
+	spin_unlock(&con->writequeue_lock);
+	kmap(e->page);
+
+	/* Send the first block off the write queue */
+	iov[0].iov_base = page_address(e->page)+offset;
+	iov[0].iov_len = len;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+	outmessage.msg_controllen = cmsg->cmsg_len;
+
+	ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
+	if (ret < 0) {
+		log_print("Send first packet to node %d failed: %d",
+			  con->nodeid, ret);
+
+		/* Try again later */
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		clear_bit(CF_INIT_PENDING, &con->flags);
+	}
+	else {
+		spin_lock(&con->writequeue_lock);
+		e->offset += ret;
+		e->len -= ret;
+
+		if (e->len == 0 && e->users == 0) {
+			list_del(&e->list);
+			kunmap(e->page);
+			free_entry(e);
+		}
+		spin_unlock(&con->writequeue_lock);
+	}
+}
+
 /* Connect a new socket to its peer */
-static void connect_to_sock(struct connection *con)
+static void tcp_connect_to_sock(struct connection *con)
 {
 	int result = -EHOSTUNREACH;
 	struct sockaddr_storage saddr;
@@ -505,7 +874,7 @@ static void connect_to_sock(struct connection *con)
 	}
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
 				  IPPROTO_TCP, &sock);
 	if (result < 0)
 		goto out_err;
@@ -516,11 +885,11 @@ static void connect_to_sock(struct connection *con)
 
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
+	con->connect_action = tcp_connect_to_sock;
+	add_sock(sock, con);
 
 	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
-	add_sock(sock, con);
-
 	log_print("connecting to %d", con->nodeid);
 	result =
 		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
@@ -550,64 +919,57 @@ out:
 	return;
 }
 
-static struct socket *create_listen_sock(struct connection *con,
-					 struct sockaddr_storage *saddr)
+static struct socket *tcp_create_listen_sock(struct connection *con,
+					     struct sockaddr_storage *saddr)
 {
 	struct socket *sock = NULL;
-	mm_segment_t fs;
 	int result = 0;
 	int one = 1;
 	int addr_len;
 
-	if (dlm_local_addr.ss_family == AF_INET)
+	if (dlm_local_addr[0]->ss_family == AF_INET)
 		addr_len = sizeof(struct sockaddr_in);
 	else
 		addr_len = sizeof(struct sockaddr_in6);
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
+				  IPPROTO_TCP, &sock);
 	if (result < 0) {
-		printk("dlm: Can't create listening comms socket\n");
+		log_print("Can't create listening comms socket");
 		goto create_out;
 	}
 
-	fs = get_fs();
-	set_fs(get_ds());
-	result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-				 (char *)&one, sizeof(one));
-	set_fs(fs);
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+				   (char *)&one, sizeof(one));
+
 	if (result < 0) {
-		printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",
-		       result);
+		log_print("Failed to set SO_REUSEADDR on socket: %d", result);
 	}
 	sock->sk->sk_user_data = con;
-	con->rx_action = accept_from_sock;
+	con->rx_action = tcp_accept_from_sock;
+	con->connect_action = tcp_connect_to_sock;
 	con->sock = sock;
 
 	/* Bind to our port */
 	make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
 	result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
 	if (result < 0) {
-		printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
+		log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
 		sock_release(sock);
 		sock = NULL;
 		con->sock = NULL;
 		goto create_out;
 	}
-
-	fs = get_fs();
-	set_fs(get_ds());
-
-	result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 				 (char *)&one, sizeof(one));
-	set_fs(fs);
 	if (result < 0) {
-		printk("dlm: Set keepalive failed: %d\n", result);
+		log_print("Set keepalive failed: %d", result);
 	}
 
 	result = sock->ops->listen(sock, 5);
 	if (result < 0) {
-		printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port);
+		log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
 		sock_release(sock);
 		sock = NULL;
 		goto create_out;
@@ -617,18 +979,146 @@ create_out:
 	return sock;
 }
 
+/* Get local addresses */
+static void init_local(void)
+{
+	struct sockaddr_storage sas, *addr;
+	int i;
+
+	dlm_local_count = 0;
+	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+		if (dlm_our_addr(&sas, i))
+			break;
+
+		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+		if (!addr)
+			break;
+		memcpy(addr, &sas, sizeof(*addr));
+		dlm_local_addr[dlm_local_count++] = addr;
+	}
+}
+
+/* Bind to an IP address. SCTP allows multiple address so it can do
+   multi-homing */
+static int add_sctp_bind_addr(struct connection *sctp_con,
+			      struct sockaddr_storage *addr,
+			      int addr_len, int num)
+{
+	int result = 0;
+
+	if (num == 1)
+		result = kernel_bind(sctp_con->sock,
+				     (struct sockaddr *) addr,
+				     addr_len);
+	else
+		result = kernel_setsockopt(sctp_con->sock, SOL_SCTP,
+					   SCTP_SOCKOPT_BINDX_ADD,
+					   (char *)addr, addr_len);
+
+	if (result < 0)
+		log_print("Can't bind to port %d addr number %d",
+			  dlm_config.ci_tcp_port, num);
+
+	return result;
+}
 
-/* Listen on all interfaces */
-static int listen_for_all(void)
+/* Initialise SCTP socket and bind to all interfaces */
+static int sctp_listen_for_all(void)
+{
+	struct socket *sock = NULL;
+	struct sockaddr_storage localaddr;
+	struct sctp_event_subscribe subscribe;
+	int result = -EINVAL, num = 1, i, addr_len;
+	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	int bufsize = NEEDED_RMEM;
+
+	if (!con)
+		return -ENOMEM;
+
+	log_print("Using SCTP for communications");
+
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+				  IPPROTO_SCTP, &sock);
+	if (result < 0) {
+		log_print("Can't create comms socket, check SCTP is loaded");
+		goto out;
+	}
+
+	/* Listen for events */
+	memset(&subscribe, 0, sizeof(subscribe));
+	subscribe.sctp_data_io_event = 1;
+	subscribe.sctp_association_event = 1;
+	subscribe.sctp_send_failure_event = 1;
+	subscribe.sctp_shutdown_event = 1;
+	subscribe.sctp_partial_delivery_event = 1;
+
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+				 (char *)&bufsize, sizeof(bufsize));
+	if (result)
+		log_print("Error increasing buffer space on socket %d", result);
+
+	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+				   (char *)&subscribe, sizeof(subscribe));
+	if (result < 0) {
+		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+			  result);
+		goto create_delsock;
+	}
+
+	/* Init con struct */
+	sock->sk->sk_user_data = con;
+	con->sock = sock;
+	con->sock->sk->sk_data_ready = lowcomms_data_ready;
+	con->rx_action = receive_from_sock;
+	con->connect_action = sctp_init_assoc;
+
+	/* Bind to all interfaces. */
+	for (i = 0; i < dlm_local_count; i++) {
+		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
+
+		result = add_sctp_bind_addr(con, &localaddr, addr_len, num);
+		if (result)
+			goto create_delsock;
+		++num;
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		log_print("Can't set socket listening");
+		goto create_delsock;
+	}
+
+	return 0;
+
+create_delsock:
+	sock_release(sock);
+	con->sock = NULL;
+out:
+	return result;
+}
+
+static int tcp_listen_for_all(void)
 {
 	struct socket *sock = NULL;
 	struct connection *con = nodeid2con(0, GFP_KERNEL);
 	int result = -EINVAL;
 
+	if (!con)
+		return -ENOMEM;
+
 	/* We don't support multi-homed hosts */
+	if (dlm_local_addr[1] != NULL) {
+		log_print("TCP protocol can't handle multi-homed hosts, "
+			  "try SCTP");
+		return -EINVAL;
+	}
+
+	log_print("Using TCP for communications");
+
 	set_bit(CF_IS_OTHERCON, &con->flags);
 
-	sock = create_listen_sock(con, &dlm_local_addr);
+	sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
 	if (sock) {
 		add_sock(sock, con);
 		result = 0;
@@ -666,8 +1156,7 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
 	return entry;
 }
 
-void *dlm_lowcomms_get_buffer(int nodeid, int len,
-			      gfp_t allocation, char **ppc)
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 {
 	struct connection *con;
 	struct writequeue_entry *e;
@@ -735,12 +1224,6 @@ out:
 	return;
 }
 
-static void free_entry(struct writequeue_entry *e)
-{
-	__free_page(e->page);
-	kfree(e);
-}
-
 /* Send a message */
 static void send_to_sock(struct connection *con)
 {
@@ -777,8 +1260,7 @@ static void send_to_sock(struct connection *con)
 				goto out;
 			if (ret <= 0)
 				goto send_error;
-		}
-		else {
+		} else {
 			/* Don't starve people filling buffers */
 			cond_resched();
 		}
@@ -807,7 +1289,8 @@ send_error:
 
 out_connect:
 	mutex_unlock(&con->sock_mutex);
-	connect_to_sock(con);
+	if (!test_bit(CF_INIT_PENDING, &con->flags))
+		lowcomms_connect_sock(con);
 	return;
 }
 
@@ -832,9 +1315,6 @@ int dlm_lowcomms_close(int nodeid)
 {
 	struct connection *con;
 
-	if (!connections)
-		goto out;
-
 	log_print("closing connection to node %d", nodeid);
 	con = nodeid2con(nodeid, 0);
 	if (con) {
@@ -842,12 +1322,9 @@ int dlm_lowcomms_close(int nodeid)
 		close_connection(con, true);
 	}
 	return 0;
-
-out:
-	return -1;
 }
 
-/* Look for activity on active sockets */
+/* Receive workqueue function */
 static void process_recv_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, rwork);
@@ -859,15 +1336,14 @@ static void process_recv_sockets(struct work_struct *work)
 	} while (!err);
 }
 
-
+/* Send workqueue function */
 static void process_send_sockets(struct work_struct *work)
 {
 	struct connection *con = container_of(work, struct connection, swork);
 
 	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
-		connect_to_sock(con);
+		con->connect_action(con);
 	}
-
 	clear_bit(CF_WRITE_PENDING, &con->flags);
 	send_to_sock(con);
 }
@@ -878,8 +1354,8 @@ static void clean_writequeues(void)
 {
 	int nodeid;
 
-	for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
-		struct connection *con = nodeid2con(nodeid, 0);
+	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
+		struct connection *con = __nodeid2con(nodeid, 0);
 
 		if (con)
 			clean_one_writequeue(con);
@@ -916,64 +1392,67 @@ static int work_start(void)
 void dlm_lowcomms_stop(void)
 {
 	int i;
+	struct connection *con;
 
 	/* Set all the flags to prevent any
 	   socket activity.
 	*/
-	for (i = 0; i < conn_array_size; i++) {
-		if (connections[i])
-			connections[i]->flags |= 0xFF;
+	down(&connections_lock);
+	for (i = 0; i <= max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (con)
+			con->flags |= 0xFF;
 	}
+	up(&connections_lock);
 
 	work_stop();
+
+	down(&connections_lock);
 	clean_writequeues();
 
-	for (i = 0; i < conn_array_size; i++) {
-		if (connections[i]) {
-			close_connection(connections[i], true);
-			if (connections[i]->othercon)
-				kmem_cache_free(con_cache, connections[i]->othercon);
-			kmem_cache_free(con_cache, connections[i]);
+	for (i = 0; i <= max_nodeid; i++) {
+		con = __nodeid2con(i, 0);
+		if (con) {
+			close_connection(con, true);
+			if (con->othercon)
+				kmem_cache_free(con_cache, con->othercon);
+			kmem_cache_free(con_cache, con);
 		}
 	}
-
-	kfree(connections);
-	connections = NULL;
-
+	max_nodeid = 0;
+	up(&connections_lock);
 	kmem_cache_destroy(con_cache);
+	idr_init(&connections_idr);
 }
 
-/* This is quite likely to sleep... */
 int dlm_lowcomms_start(void)
 {
-	int error = 0;
-
-	error = -ENOMEM;
-	connections = kzalloc(sizeof(struct connection *) *
-			      NODE_INCREMENT, GFP_KERNEL);
-	if (!connections)
-		goto out;
-
-	conn_array_size = NODE_INCREMENT;
+	int error = -EINVAL;
+	struct connection *con;
 
-	if (dlm_our_addr(&dlm_local_addr, 0)) {
+	init_local();
+	if (!dlm_local_count) {
+		error = -ENOTCONN;
 		log_print("no local IP address has been set");
-		goto fail_free_conn;
-	}
-	if (!dlm_our_addr(&dlm_local_addr, 1)) {
-		log_print("This dlm comms module does not support multi-homed clustering");
-		goto fail_free_conn;
+		goto out;
 	}
 
+	error = -ENOMEM;
 	con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
 				      __alignof__(struct connection), 0,
 				      NULL, NULL);
 	if (!con_cache)
-		goto fail_free_conn;
+		goto out;
 
+	/* Set some sysctl minima */
+	if (sysctl_rmem_max < NEEDED_RMEM)
+		sysctl_rmem_max = NEEDED_RMEM;
 
 	/* Start listening */
-	error = listen_for_all();
+	if (dlm_config.ci_protocol == 0)
+		error = tcp_listen_for_all();
+	else
+		error = sctp_listen_for_all();
 	if (error)
 		goto fail_unlisten;
 
@@ -984,24 +1463,13 @@ int dlm_lowcomms_start(void)
 	return 0;
 
 fail_unlisten:
-	close_connection(connections[0], false);
-	kmem_cache_free(con_cache, connections[0]);
+	con = nodeid2con(0,0);
+	if (con) {
+		close_connection(con, false);
+		kmem_cache_free(con_cache, con);
+	}
 	kmem_cache_destroy(con_cache);
 
-fail_free_conn:
-	kfree(connections);
-
 out:
 	return error;
 }
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 3870150b83a4..b0201ec325a7 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@ struct dlm_write_request32 {
 	union  {
 		struct dlm_lock_params32 lock;
 		struct dlm_lspace_params lspace;
+		struct dlm_purge_params purge;
 	} i;
 };
 
@@ -92,6 +93,9 @@ static void compat_input(struct dlm_write_request *kb,
 		kb->i.lspace.flags = kb32->i.lspace.flags;
 		kb->i.lspace.minor = kb32->i.lspace.minor;
 		strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+	} else if (kb->cmd == DLM_USER_PURGE) {
+		kb->i.purge.nodeid = kb32->i.purge.nodeid;
+		kb->i.purge.pid = kb32->i.purge.pid;
 	} else {
 		kb->i.lock.mode = kb32->i.lock.mode;
 		kb->i.lock.namelen = kb32->i.lock.namelen;
@@ -111,8 +115,6 @@ static void compat_input(struct dlm_write_request *kb,
 static void compat_output(struct dlm_lock_result *res,
 			  struct dlm_lock_result32 *res32)
 {
-	res32->length = res->length - (sizeof(struct dlm_lock_result) -
-				       sizeof(struct dlm_lock_result32));
 	res32->user_astaddr = (__u32)(long)res->user_astaddr;
 	res32->user_astparam = (__u32)(long)res->user_astparam;
 	res32->user_lksb = (__u32)(long)res->user_lksb;
@@ -128,35 +130,30 @@ static void compat_output(struct dlm_lock_result *res,
 }
 #endif
 
+/* we could possibly check if the cancel of an orphan has resulted in the lkb
+   being removed and then remove that lkb from the orphans list and free it */
 
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
 	struct dlm_user_proc *proc;
-	int remove_ownqueue = 0;
+	int eol = 0, ast_type;
 
-	/* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
-	   lkb before dealing with it.  We need to check this
-	   flag before taking ls_clear_proc_locks mutex because if
-	   it's set, dlm_clear_proc_locks() holds the mutex. */
-
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
-		/* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		return;
-	}
 
 	ls = lkb->lkb_resource->res_ls;
 	mutex_lock(&ls->ls_clear_proc_locks);
 
 	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
 	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
-	   lkb->ua so we can't try to use it. */
+	   lkb->ua so we can't try to use it.  This second check is necessary
+	   for cases where a completion ast is received for an operation that
+	   began before clear_proc_locks did its cancel/unlock. */
 
-	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
-		/* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		goto out;
-	}
 
 	DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
 	ua = (struct dlm_user_args *)lkb->lkb_astparam;
@@ -166,28 +163,42 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 		goto out;
 
 	spin_lock(&proc->asts_spin);
-	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+
+	ast_type = lkb->lkb_ast_type;
+	lkb->lkb_ast_type |= type;
+
+	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
-		lkb->lkb_ast_type |= type;
 		wake_up_interruptible(&proc->wait);
 	}
-
-	/* noqueue requests that fail may need to be removed from the
-	   proc's locks list, there should be a better way of detecting
-	   this situation than checking all these things... */
-
-	if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
-	    ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
-		remove_ownqueue = 1;
-
-	/* unlocks or cancels of waiting requests need to be removed from the
-	   proc's unlocking list, again there must be a better way...  */
-
-	if (ua->lksb.sb_status == -DLM_EUNLOCK ||
+	if (type == AST_COMP && (ast_type & AST_COMP))
+		log_debug(ls, "ast overlap %x status %x %x",
+			  lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
+
+	/* Figure out if this lock is at the end of its life and no longer
+	   available for the application to use.  The lkb still exists until
+	   the final ast is read.  A lock becomes EOL in three situations:
+	     1. a noqueue request fails with EAGAIN
+	     2. an unlock completes with EUNLOCK
+	     3. a cancel of a waiting request completes with ECANCEL
+	   An EOL lock needs to be removed from the process's list of locks.
+	   And we can't allow any new operation on an EOL lock.  This is
+	   not related to the lifetime of the lkb struct which is managed
+	   entirely by refcount. */
+
+	if (type == AST_COMP &&
+	    lkb->lkb_grmode == DLM_LOCK_IV &&
+	    ua->lksb.sb_status == -EAGAIN)
+		eol = 1;
+	else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
 	    (ua->lksb.sb_status == -DLM_ECANCEL &&
 	     lkb->lkb_grmode == DLM_LOCK_IV))
-		remove_ownqueue = 1;
+		eol = 1;
+	if (eol) {
+		lkb->lkb_ast_type &= ~AST_BAST;
+		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
+	}
 
 	/* We want to copy the lvb to userspace when the completion
 	   ast is read if the status is 0, the lock has an lvb and
@@ -204,11 +215,13 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 
 	spin_unlock(&proc->asts_spin);
 
-	if (remove_ownqueue) {
+	if (eol) {
 		spin_lock(&ua->proc->locks_spin);
-		list_del_init(&lkb->lkb_ownqueue);
+		if (!list_empty(&lkb->lkb_ownqueue)) {
+			list_del_init(&lkb->lkb_ownqueue);
+			dlm_put_lkb(lkb);
+		}
 		spin_unlock(&ua->proc->locks_spin);
-		dlm_put_lkb(lkb);
 	}
  out:
 	mutex_unlock(&ls->ls_clear_proc_locks);
@@ -286,47 +299,71 @@ static int device_user_unlock(struct dlm_user_proc *proc,
 	return error;
 }
 
-static int device_create_lockspace(struct dlm_lspace_params *params)
+static int create_misc_device(struct dlm_ls *ls, char *name)
 {
-	dlm_lockspace_t *lockspace;
-	struct dlm_ls *ls;
 	int error, len;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	error = dlm_new_lockspace(params->name, strlen(params->name),
-				  &lockspace, 0, DLM_USER_LVB_LEN);
-	if (error)
-		return error;
-
-	ls = dlm_find_lockspace_local(lockspace);
-	if (!ls)
-		return -ENOENT;
-
 	error = -ENOMEM;
-	len = strlen(params->name) + strlen(name_prefix) + 2;
+	len = strlen(name) + strlen(name_prefix) + 2;
 	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
 	if (!ls->ls_device.name)
 		goto fail;
+
 	snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
-		 params->name);
+		 name);
 	ls->ls_device.fops = &device_fops;
 	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
 
 	error = misc_register(&ls->ls_device);
 	if (error) {
 		kfree(ls->ls_device.name);
-		goto fail;
 	}
+fail:
+	return error;
+}
+
+static int device_user_purge(struct dlm_user_proc *proc,
+			     struct dlm_purge_params *params)
+{
+	struct dlm_ls *ls;
+	int error;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = dlm_user_purge(ls, proc, params->nodeid, params->pid);
 
-	error = ls->ls_device.minor;
 	dlm_put_lockspace(ls);
 	return error;
+}
+
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error;
 
- fail:
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	error = dlm_new_lockspace(params->name, strlen(params->name),
+				  &lockspace, 0, DLM_USER_LVB_LEN);
+	if (error)
+		return error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = create_misc_device(ls, params->name);
 	dlm_put_lockspace(ls);
-	dlm_release_lockspace(lockspace, 0);
+
+	if (error)
+		dlm_release_lockspace(lockspace, 0);
+	else
+		error = ls->ls_device.minor;
+
 	return error;
 }
 
@@ -343,6 +380,10 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 		return -ENOENT;
 
+	/* Deregister the misc device first, so we don't have
+	 * a device that's not attached to a lockspace. If
+	 * dlm_release_lockspace fails then we can recreate it
+	 */
 	error = misc_deregister(&ls->ls_device);
 	if (error) {
 		dlm_put_lockspace(ls);
@@ -361,6 +402,8 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 
 	dlm_put_lockspace(ls);
 	error = dlm_release_lockspace(lockspace, force);
+	if (error)
+		create_misc_device(ls, ls->ls_name);
  out:
 	return error;
 }
@@ -497,6 +540,14 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 		error = device_remove_lockspace(&kbuf->i.lspace);
 		break;
 
+	case DLM_USER_PURGE:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_purge(proc, &kbuf->i.purge);
+		break;
+
 	default:
 		log_print("Unknown command passed to DLM device : %d\n",
 			  kbuf->cmd);
diff --git a/fs/dquot.c b/fs/dquot.c
index b16f991662c1..8819d281500c 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -69,7 +69,6 @@
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
@@ -475,7 +474,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 		spin_lock(&dq_list_lock);
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
 		while (!list_empty(dirty)) {
-			dquot = list_entry(dirty->next, struct dquot, dq_dirty);
+			dquot = list_first_entry(dirty, struct dquot, dq_dirty);
 			/* Dirty and inactive can be only bad dquot... */
 			if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
 				clear_dquot_dirty(dquot);
@@ -721,7 +720,8 @@ static inline int dqput_blocks(struct dquot *dquot)
 
 /* Remove references to dquots from inode - add dquot to list for freeing if needed */
 /* We can't race with anybody because we hold dqptr_sem for writing... */
-int remove_inode_dquot_ref(struct inode *inode, int type, struct list_head *tofree_head)
+static int remove_inode_dquot_ref(struct inode *inode, int type,
+				  struct list_head *tofree_head)
 {
 	struct dquot *dquot = inode->i_dquot[type];
 
@@ -1421,7 +1421,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			if (!sb_has_quota_enabled(sb, cnt)) {
-				mutex_lock(&toputinode[cnt]->i_mutex);
+				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
 				truncate_inode_pages(&toputinode[cnt]->i_data, 0);
@@ -1432,7 +1432,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 			mutex_unlock(&dqopt->dqonoff_mutex);
 		}
 	if (sb->s_bdev)
-		invalidate_bdev(sb->s_bdev, 0);
+		invalidate_bdev(sb->s_bdev);
 	return 0;
 }
 
@@ -1468,7 +1468,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	 * we see all the changes from userspace... */
 	write_inode_now(inode, 1);
 	/* And now flush the block cache so that kernel sees the changes */
-	invalidate_bdev(sb->s_bdev, 0);
+	invalidate_bdev(sb->s_bdev);
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
 	if (sb_has_quota_enabled(sb, type)) {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 7a7d25d541e7..59288d817078 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -28,69 +28,11 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
-#include <linux/smp_lock.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
 #include "ecryptfs_kernel.h"
 
 /**
- * ecryptfs_llseek
- * @file: File we are seeking in
- * @offset: The offset to seek to
- * @origin: 2 - offset from i_size; 1 - offset from f_pos
- *
- * Returns the position we have seeked to, or negative on error
- */
-static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t rv;
-	loff_t new_end_pos;
-	int rc;
-	int expanding_file = 0;
-	struct inode *inode = file->f_mapping->host;
-
-	/* If our offset is past the end of our file, we're going to
-	 * need to grow it so we have a valid length of 0's */
-	new_end_pos = offset;
-	switch (origin) {
-	case 2:
-		new_end_pos += i_size_read(inode);
-		expanding_file = 1;
-		break;
-	case 1:
-		new_end_pos += file->f_pos;
-		if (new_end_pos > i_size_read(inode)) {
-			ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
-					"> i_size_read(inode)(=[0x%.16x])\n",
-					new_end_pos, i_size_read(inode));
-			expanding_file = 1;
-		}
-		break;
-	default:
-		if (new_end_pos > i_size_read(inode)) {
-			ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
-					"> i_size_read(inode)(=[0x%.16x])\n",
-					new_end_pos, i_size_read(inode));
-			expanding_file = 1;
-		}
-	}
-	ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos);
-	if (expanding_file) {
-		rc = ecryptfs_truncate(file->f_path.dentry, new_end_pos);
-		if (rc) {
-			rv = rc;
-			ecryptfs_printk(KERN_ERR, "Error on attempt to "
-					"truncate to (higher) offset [0x%.16x];"
-					" rc = [%d]\n", new_end_pos, rc);
-			goto out;
-		}
-	}
-	rv = generic_file_llseek(file, offset, origin);
-out:
-	return rv;
-}
-
-/**
  * ecryptfs_read_update_atime
  *
  * generic_file_read updates the atime of upper layer inode.  But, it
@@ -426,7 +368,7 @@ const struct file_operations ecryptfs_dir_fops = {
 };
 
 const struct file_operations ecryptfs_main_fops = {
-	.llseek = ecryptfs_llseek,
+	.llseek = generic_file_llseek,
 	.read = do_sync_read,
 	.aio_read = ecryptfs_read_update_atime,
 	.write = do_sync_write,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fc4a3a224641..606128f5c927 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -583,9 +583,7 @@ inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static struct ecryptfs_cache_info {
@@ -793,7 +791,7 @@ static int do_sysfs_registration(void)
 		       "Unable to register ecryptfs sysfs subsystem\n");
 		goto out;
 	}
-	rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+	rc = sysfs_create_file(&ecryptfs_subsys.kobj,
 			       &sysfs_attr_version.attr);
 	if (rc) {
 		printk(KERN_ERR
@@ -801,12 +799,12 @@ static int do_sysfs_registration(void)
 		subsystem_unregister(&ecryptfs_subsys);
 		goto out;
 	}
-	rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+	rc = sysfs_create_file(&ecryptfs_subsys.kobj,
 			       &sysfs_attr_version_str.attr);
 	if (rc) {
 		printk(KERN_ERR
 		       "Unable to create ecryptfs version_str attribute\n");
-		sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+		sysfs_remove_file(&ecryptfs_subsys.kobj,
 				  &sysfs_attr_version.attr);
 		subsystem_unregister(&ecryptfs_subsys);
 		goto out;
@@ -841,7 +839,7 @@ static int __init ecryptfs_init(void)
 		ecryptfs_free_kmem_caches();
 		goto out;
 	}
-	kset_set_kset_s(&ecryptfs_subsys, fs_subsys);
+	kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
 	sysfs_attr_version.attr.owner = THIS_MODULE;
 	sysfs_attr_version_str.attr.owner = THIS_MODULE;
 	rc = do_sysfs_registration();
@@ -862,9 +860,9 @@ out:
 
 static void __exit ecryptfs_exit(void)
 {
-	sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+	sysfs_remove_file(&ecryptfs_subsys.kobj,
 			  &sysfs_attr_version.attr);
-	sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+	sysfs_remove_file(&ecryptfs_subsys.kobj,
 			  &sysfs_attr_version_str.attr);
 	subsystem_unregister(&ecryptfs_subsys);
 	ecryptfs_release_messaging(ecryptfs_transport);
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 3baf253be95a..a9d87c47f72d 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -19,7 +19,7 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  * 02111-1307, USA.
  */
-
+#include <linux/sched.h>
 #include "ecryptfs_kernel.h"
 
 static LIST_HEAD(ecryptfs_msg_ctx_free_list);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b731b09499cb..55cec98a84e7 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -46,7 +46,6 @@ struct kmem_cache *ecryptfs_lower_page_cache;
  */
 static struct page *ecryptfs_get1page(struct file *file, int index)
 {
-	struct page *page;
 	struct dentry *dentry;
 	struct inode *inode;
 	struct address_space *mapping;
@@ -54,14 +53,7 @@ static struct page *ecryptfs_get1page(struct file *file, int index)
 	dentry = file->f_path.dentry;
 	inode = dentry->d_inode;
 	mapping = inode->i_mapping;
-	page = read_cache_page(mapping, index,
-			       (filler_t *)mapping->a_ops->readpage,
-			       (void *)file);
-	if (IS_ERR(page))
-		goto out;
-	wait_on_page_locked(page);
-out:
-	return page;
+	return read_mapping_page(mapping, index, (void *)file);
 }
 
 static
@@ -233,7 +225,6 @@ int ecryptfs_do_readpage(struct file *file, struct page *page,
 		ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
 		goto out;
 	}
-	wait_on_page_locked(lower_page);
 	page_data = kmap_atomic(page, KM_USER0);
 	lower_page_data = kmap_atomic(lower_page, KM_USER1);
 	memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
@@ -373,25 +364,43 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
 {
 	struct inode *inode = page->mapping->host;
 	int end_byte_in_page;
-	char *page_virt;
 
 	if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index)
 		goto out;
 	end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
 	if (to > end_byte_in_page)
 		end_byte_in_page = to;
-	page_virt = kmap_atomic(page, KM_USER0);
-	memset((page_virt + end_byte_in_page), 0,
-	       (PAGE_CACHE_SIZE - end_byte_in_page));
-	kunmap_atomic(page_virt, KM_USER0);
-	flush_dcache_page(page);
+	zero_user_page(page, end_byte_in_page,
+		PAGE_CACHE_SIZE - end_byte_in_page, KM_USER0);
 out:
 	return 0;
 }
 
+/**
+ * eCryptfs does not currently support holes. When writing after a
+ * seek past the end of the file, eCryptfs fills in 0's through to the
+ * current location. The code to fill in the 0's to all the
+ * intermediate pages calls ecryptfs_prepare_write_no_truncate().
+ */
+static int
+ecryptfs_prepare_write_no_truncate(struct file *file, struct page *page,
+				   unsigned from, unsigned to)
+{
+	int rc = 0;
+
+	if (from == 0 && to == PAGE_CACHE_SIZE)
+		goto out;	/* If we are writing a full page, it will be
+				   up to date. */
+	if (!PageUptodate(page))
+		rc = ecryptfs_do_readpage(file, page, page->index);
+out:
+	return rc;
+}
+
 static int ecryptfs_prepare_write(struct file *file, struct page *page,
 				  unsigned from, unsigned to)
 {
+	loff_t pos;
 	int rc = 0;
 
 	if (from == 0 && to == PAGE_CACHE_SIZE)
@@ -399,6 +408,16 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 				   up to date. */
 	if (!PageUptodate(page))
 		rc = ecryptfs_do_readpage(file, page, page->index);
+	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (pos > i_size_read(page->mapping->host)) {
+		rc = ecryptfs_truncate(file->f_path.dentry, pos);
+		if (rc) {
+			printk(KERN_ERR "Error on attempt to "
+			       "truncate to (higher) offset [%lld];"
+			       " rc = [%d]\n", pos, rc);
+			goto out;
+		}
+	}
 out:
 	return rc;
 }
@@ -749,7 +768,6 @@ int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
 {
 	int rc = 0;
 	struct page *tmp_page;
-	char *tmp_page_virt;
 
 	tmp_page = ecryptfs_get1page(file, index);
 	if (IS_ERR(tmp_page)) {
@@ -758,18 +776,15 @@ int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
 		rc = PTR_ERR(tmp_page);
 		goto out;
 	}
-	rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros);
-	if (rc) {
+	if ((rc = ecryptfs_prepare_write_no_truncate(file, tmp_page, start,
+						     (start + num_zeros)))) {
 		ecryptfs_printk(KERN_ERR, "Error preparing to write zero's "
-				"to remainder of page at index [0x%.16x]\n",
+				"to page at index [0x%.16x]\n",
 				index);
 		page_cache_release(tmp_page);
 		goto out;
 	}
-	tmp_page_virt = kmap_atomic(tmp_page, KM_USER0);
-	memset(((char *)tmp_page_virt + start), 0, num_zeros);
-	kunmap_atomic(tmp_page_virt, KM_USER0);
-	flush_dcache_page(tmp_page);
+	zero_user_page(tmp_page, start, num_zeros, KM_USER0);
 	rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros);
 	if (rc < 0) {
 		ecryptfs_printk(KERN_ERR, "Error attempting to write zero's "
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c2235e46edcd..e0a6839e68ae 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -72,9 +72,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/eventfd.c b/fs/eventfd.c
new file mode 100644
index 000000000000..2ce19c000d2a
--- /dev/null
+++ b/fs/eventfd.c
@@ -0,0 +1,226 @@
+/*
+ *  fs/eventfd.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/anon_inodes.h>
+#include <linux/eventfd.h>
+
+struct eventfd_ctx {
+	wait_queue_head_t wqh;
+	/*
+	 * Every time that a write(2) is performed on an eventfd, the
+	 * value of the __u64 being written is added to "count" and a
+	 * wakeup is performed on "wqh". A read(2) will return the "count"
+	 * value to userspace, and will reset "count" to zero. The kernel
+	 * size eventfd_signal() also, adds to the "count" counter and
+	 * issue a wakeup.
+	 */
+	__u64 count;
+};
+
+/*
+ * Adds "n" to the eventfd counter "count". Returns "n" in case of
+ * success, or a value lower then "n" in case of coutner overflow.
+ * This function is supposed to be called by the kernel in paths
+ * that do not allow sleeping. In this function we allow the counter
+ * to reach the ULLONG_MAX value, and we signal this as overflow
+ * condition by returining a POLLERR to poll(2).
+ */
+int eventfd_signal(struct file *file, int n)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	unsigned long flags;
+
+	if (n < 0)
+		return -EINVAL;
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ULLONG_MAX - ctx->count < n)
+		n = (int) (ULLONG_MAX - ctx->count);
+	ctx->count += n;
+	if (waitqueue_active(&ctx->wqh))
+		wake_up_locked(&ctx->wqh);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return n;
+}
+
+static int eventfd_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static unsigned int eventfd_poll(struct file *file, poll_table *wait)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+	unsigned long flags;
+
+	poll_wait(file, &ctx->wqh, wait);
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ctx->count > 0)
+		events |= POLLIN;
+	if (ctx->count == ULLONG_MAX)
+		events |= POLLERR;
+	if (ULLONG_MAX - 1 > ctx->count)
+		events |= POLLOUT;
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return events;
+}
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 ucnt;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (count < sizeof(ucnt))
+		return -EINVAL;
+	spin_lock_irq(&ctx->wqh.lock);
+	res = -EAGAIN;
+	ucnt = ctx->count;
+	if (ucnt > 0)
+		res = sizeof(ucnt);
+	else if (!(file->f_flags & O_NONBLOCK)) {
+		__add_wait_queue(&ctx->wqh, &wait);
+		for (res = 0;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ctx->count > 0) {
+				ucnt = ctx->count;
+				res = sizeof(ucnt);
+				break;
+			}
+			if (signal_pending(current)) {
+				res = -ERESTARTSYS;
+				break;
+			}
+			spin_unlock_irq(&ctx->wqh.lock);
+			schedule();
+			spin_lock_irq(&ctx->wqh.lock);
+		}
+		__remove_wait_queue(&ctx->wqh, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+	if (res > 0) {
+		ctx->count = 0;
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked(&ctx->wqh);
+	}
+	spin_unlock_irq(&ctx->wqh.lock);
+	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
+		return -EFAULT;
+
+	return res;
+}
+
+static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 ucnt;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (count < sizeof(ucnt))
+		return -EINVAL;
+	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
+		return -EFAULT;
+	if (ucnt == ULLONG_MAX)
+		return -EINVAL;
+	spin_lock_irq(&ctx->wqh.lock);
+	res = -EAGAIN;
+	if (ULLONG_MAX - ctx->count > ucnt)
+		res = sizeof(ucnt);
+	else if (!(file->f_flags & O_NONBLOCK)) {
+		__add_wait_queue(&ctx->wqh, &wait);
+		for (res = 0;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ULLONG_MAX - ctx->count > ucnt) {
+				res = sizeof(ucnt);
+				break;
+			}
+			if (signal_pending(current)) {
+				res = -ERESTARTSYS;
+				break;
+			}
+			spin_unlock_irq(&ctx->wqh.lock);
+			schedule();
+			spin_lock_irq(&ctx->wqh.lock);
+		}
+		__remove_wait_queue(&ctx->wqh, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+	if (res > 0) {
+		ctx->count += ucnt;
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked(&ctx->wqh);
+	}
+	spin_unlock_irq(&ctx->wqh.lock);
+
+	return res;
+}
+
+static const struct file_operations eventfd_fops = {
+	.release	= eventfd_release,
+	.poll		= eventfd_poll,
+	.read		= eventfd_read,
+	.write		= eventfd_write,
+};
+
+struct file *eventfd_fget(int fd)
+{
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return ERR_PTR(-EBADF);
+	if (file->f_op != &eventfd_fops) {
+		fput(file);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return file;
+}
+
+asmlinkage long sys_eventfd(unsigned int count)
+{
+	int error, fd;
+	struct eventfd_ctx *ctx;
+	struct file *file;
+	struct inode *inode;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	init_waitqueue_head(&ctx->wqh);
+	ctx->count = count;
+
+	/*
+	 * When we call this, the initialization must be complete, since
+	 * anon_inode_getfd() will install the fd.
+	 */
+	error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]",
+				 &eventfd_fops, ctx);
+	if (!error)
+		return fd;
+
+	kfree(ctx);
+	return error;
+}
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3ae644e7e860..0b73cd45a06d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
- *  fs/eventpoll.c ( Efficent event polling implementation )
- *  Copyright (C) 2001,...,2006	 Davide Libenzi
+ *  fs/eventpoll.c (Efficent event polling implementation)
+ *  Copyright (C) 2001,...,2007	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -11,7 +11,6 @@
  *
  */
 
-#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -22,34 +21,31 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
-#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/spinlock.h>
 #include <linux/syscalls.h>
-#include <linux/rwsem.h>
 #include <linux/rbtree.h>
 #include <linux/wait.h>
 #include <linux/eventpoll.h>
 #include <linux/mount.h>
 #include <linux/bitops.h>
 #include <linux/mutex.h>
+#include <linux/anon_inodes.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/mman.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
-
 
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
  *
  * 1) epmutex (mutex)
- * 2) ep->sem (rw_semaphore)
- * 3) ep->lock (rw_lock)
+ * 2) ep->mtx (mutex)
+ * 3) ep->lock (spinlock)
  *
  * The acquire order is the one listed above, from 1 to 3.
  * We need a spinlock (ep->lock) because we manipulate objects
@@ -59,25 +55,22 @@
  * a spinlock. During the event transfer loop (from kernel to
  * user space) we could end up sleeping due a copy_to_user(), so
  * we need a lock that will allow us to sleep. This lock is a
- * read-write semaphore (ep->sem). It is acquired on read during
- * the event transfer loop and in write during epoll_ctl(EPOLL_CTL_DEL)
- * and during eventpoll_release_file(). Then we also need a global
- * semaphore to serialize eventpoll_release_file() and ep_free().
- * This semaphore is acquired by ep_free() during the epoll file
+ * mutex (ep->mtx). It is acquired during the event transfer loop,
+ * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
+ * Then we also need a global mutex to serialize eventpoll_release_file()
+ * and ep_free().
+ * This mutex is acquired by ep_free() during the epoll file
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
  * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
- * It is possible to drop the "ep->sem" and to use the global
- * semaphore "epmutex" (together with "ep->lock") to have it working,
- * but having "ep->sem" will make the interface more scalable.
+ * It is possible to drop the "ep->mtx" and to use the global
+ * mutex "epmutex" (together with "ep->lock") to have it working,
+ * but having "ep->mtx" will make the interface more scalable.
  * Events that require holding "epmutex" are very rare, while for
- * normal operations the epoll private "ep->sem" will guarantee
- * a greater scalability.
+ * normal operations the epoll private "ep->mtx" will guarantee
+ * a better scalability.
  */
 
-
-#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */
-
 #define DEBUG_EPOLL 0
 
 #if DEBUG_EPOLL > 0
@@ -107,6 +100,7 @@
 
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
+#define EP_UNACTIVE_PTR ((void *) -1L)
 
 struct epoll_filefd {
 	struct file *file;
@@ -117,7 +111,7 @@ struct epoll_filefd {
  * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
  * It is used to keep track on all tasks that are currently inside the wake_up() code
  * to 1) short-circuit the one coming from the same task and same wait queue head
- * ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting
+ * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
  * 3) let go the ones coming from other tasks.
  */
 struct wake_task_node {
@@ -136,21 +130,57 @@ struct poll_safewake {
 };
 
 /*
+ * Each file descriptor added to the eventpoll interface will
+ * have an entry of this type linked to the "rbr" RB tree.
+ */
+struct epitem {
+	/* RB tree node used to link this structure to the eventpoll RB tree */
+	struct rb_node rbn;
+
+	/* List header used to link this structure to the eventpoll ready list */
+	struct list_head rdllink;
+
+	/*
+	 * Works together "struct eventpoll"->ovflist in keeping the
+	 * single linked chain of items.
+	 */
+	struct epitem *next;
+
+	/* The file descriptor information this item refers to */
+	struct epoll_filefd ffd;
+
+	/* Number of active wait queue attached to poll operations */
+	int nwait;
+
+	/* List containing poll wait queues */
+	struct list_head pwqlist;
+
+	/* The "container" of this item */
+	struct eventpoll *ep;
+
+	/* List header used to link this item to the "struct file" items list */
+	struct list_head fllink;
+
+	/* The structure that describe the interested events and the source fd */
+	struct epoll_event event;
+};
+
+/*
  * This structure is stored inside the "private_data" member of the file
  * structure and rapresent the main data sructure for the eventpoll
  * interface.
  */
 struct eventpoll {
 	/* Protect the this structure access */
-	rwlock_t lock;
+	spinlock_t lock;
 
 	/*
-	 * This semaphore is used to ensure that files are not removed
-	 * while epoll is using them. This is read-held during the event
-	 * collection loop and it is write-held during the file cleanup
-	 * path, the epoll file exit code and the ctl operations.
+	 * This mutex is used to ensure that files are not removed
+	 * while epoll is using them. This is held during the event
+	 * collection loop, the file cleanup path, the epoll file exit
+	 * code and the ctl operations.
 	 */
-	struct rw_semaphore sem;
+	struct mutex mtx;
 
 	/* Wait queue used by sys_epoll_wait() */
 	wait_queue_head_t wq;
@@ -161,8 +191,15 @@ struct eventpoll {
 	/* List of ready file descriptors */
 	struct list_head rdllist;
 
-	/* RB-Tree root used to store monitored fd structs */
+	/* RB tree root used to store monitored fd structs */
 	struct rb_root rbr;
+
+	/*
+	 * This is a single linked list that chains all the "struct epitem" that
+	 * happened while transfering ready events to userspace w/out
+	 * holding ->lock.
+	 */
+	struct epitem *ovflist;
 };
 
 /* Wait structure used by the poll hooks */
@@ -183,99 +220,14 @@ struct eppoll_entry {
 	wait_queue_head_t *whead;
 };
 
-/*
- * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the hash.
- */
-struct epitem {
-	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
-	struct rb_node rbn;
-
-	/* List header used to link this structure to the eventpoll ready list */
-	struct list_head rdllink;
-
-	/* The file descriptor information this item refers to */
-	struct epoll_filefd ffd;
-
-	/* Number of active wait queue attached to poll operations */
-	int nwait;
-
-	/* List containing poll wait queues */
-	struct list_head pwqlist;
-
-	/* The "container" of this item */
-	struct eventpoll *ep;
-
-	/* The structure that describe the interested events and the source fd */
-	struct epoll_event event;
-
-	/*
-	 * Used to keep track of the usage count of the structure. This avoids
-	 * that the structure will desappear from underneath our processing.
-	 */
-	atomic_t usecnt;
-
-	/* List header used to link this item to the "struct file" items list */
-	struct list_head fllink;
-
-	/* List header used to link the item to the transfer list */
-	struct list_head txlink;
-
-	/*
-	 * This is used during the collection/transfer of events to userspace
-	 * to pin items empty events set.
-	 */
-	unsigned int revents;
-};
-
 /* Wrapper struct used by poll queueing */
 struct ep_pqueue {
 	poll_table pt;
 	struct epitem *epi;
 };
 
-
-
-static void ep_poll_safewake_init(struct poll_safewake *psw);
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
-		    struct eventpoll *ep);
-static int ep_alloc(struct eventpoll **pep);
-static void ep_free(struct eventpoll *ep);
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
-static void ep_use_epitem(struct epitem *epi);
-static void ep_release_epitem(struct epitem *epi);
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-				 poll_table *pt);
-static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-		     struct file *tfile, int fd);
-static int ep_modify(struct eventpoll *ep, struct epitem *epi,
-		     struct epoll_event *event);
-static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
-static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
-static int ep_remove(struct eventpoll *ep, struct epitem *epi);
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
-static int ep_eventpoll_close(struct inode *inode, struct file *file);
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
-static int ep_collect_ready_items(struct eventpoll *ep,
-				  struct list_head *txlist, int maxevents);
-static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events);
-static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
-static int ep_events_transfer(struct eventpoll *ep,
-			      struct epoll_event __user *events,
-			      int maxevents);
-static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-		   int maxevents, long timeout);
-static int eventpollfs_delete_dentry(struct dentry *dentry);
-static struct inode *ep_eventpoll_inode(void);
-static int eventpollfs_get_sb(struct file_system_type *fs_type,
-			      int flags, const char *dev_name,
-			      void *data, struct vfsmount *mnt);
-
 /*
- * This semaphore is used to serialize ep_free() and eventpoll_release_file().
+ * This mutex is used to serialize ep_free() and eventpoll_release_file().
  */
 static struct mutex epmutex;
 
@@ -288,39 +240,8 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
-/* Virtual fs used to allocate inodes for eventpoll files */
-static struct vfsmount *eventpoll_mnt __read_mostly;
-
-/* File callbacks that implement the eventpoll file behaviour */
-static const struct file_operations eventpoll_fops = {
-	.release	= ep_eventpoll_close,
-	.poll		= ep_eventpoll_poll
-};
-
-/*
- * This is used to register the virtual file system from where
- * eventpoll inodes are allocated.
- */
-static struct file_system_type eventpoll_fs_type = {
-	.name		= "eventpollfs",
-	.get_sb		= eventpollfs_get_sb,
-	.kill_sb	= kill_anon_super,
-};
-
-/* Very basic directory entry operations for the eventpoll virtual file system */
-static struct dentry_operations eventpollfs_dentry_operations = {
-	.d_delete	= eventpollfs_delete_dentry,
-};
-
 
-
-/* Fast test to see if the file is an evenpoll file */
-static inline int is_file_epoll(struct file *f)
-{
-	return f->f_op == &eventpoll_fops;
-}
-
-/* Setup the structure that is used as key for the rb-tree */
+/* Setup the structure that is used as key for the RB tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
 			      struct file *file, int fd)
 {
@@ -328,7 +249,7 @@ static inline void ep_set_ffd(struct epoll_filefd *ffd,
 	ffd->fd = fd;
 }
 
-/* Compare rb-tree keys */
+/* Compare RB tree keys */
 static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 			     struct epoll_filefd *p2)
 {
@@ -336,36 +257,25 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
 }
 
-/* Special initialization for the rb-tree node to detect linkage */
+/* Special initialization for the RB tree node to detect linkage */
 static inline void ep_rb_initnode(struct rb_node *n)
 {
 	rb_set_parent(n, n);
 }
 
-/* Removes a node from the rb-tree and marks it for a fast is-linked check */
+/* Removes a node from the RB tree and marks it for a fast is-linked check */
 static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
 {
 	rb_erase(n, r);
 	rb_set_parent(n, n);
 }
 
-/* Fast check to verify that the item is linked to the main rb-tree */
+/* Fast check to verify that the item is linked to the main RB tree */
 static inline int ep_rb_linked(struct rb_node *n)
 {
 	return rb_parent(n) != n;
 }
 
-/*
- * Remove the item from the list and perform its initialization.
- * This is useful for us because we can test if the item is linked
- * using "ep_is_linked(p)".
- */
-static inline void ep_list_del(struct list_head *p)
-{
-	list_del(p);
-	INIT_LIST_HEAD(p);
-}
-
 /* Tells us if the item is currently linked */
 static inline int ep_is_linked(struct list_head *p)
 {
@@ -385,7 +295,7 @@ static inline struct epitem * ep_item_from_epqueue(poll_table *p)
 }
 
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_hash_event(int op)
+static inline int ep_op_has_event(int op)
 {
 	return op != EPOLL_CTL_DEL;
 }
@@ -398,7 +308,6 @@ static void ep_poll_safewake_init(struct poll_safewake *psw)
 	spin_lock_init(&psw->lock);
 }
 
-
 /*
  * Perform a safe wake up of the poll wait list. The problem is that
  * with the new callback'd wake up system, it is possible that the
@@ -453,378 +362,195 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 	spin_unlock_irqrestore(&psw->lock, flags);
 }
 
-
 /*
- * This is called from eventpoll_release() to unlink files from the eventpoll
- * interface. We need to have this facility to cleanup correctly files that are
- * closed without being removed from the eventpoll interface.
+ * This function unregister poll callbacks from the associated file descriptor.
+ * Since this must be called without holding "ep->lock" the atomic exchange trick
+ * will protect us from multiple unregister.
  */
-void eventpoll_release_file(struct file *file)
+static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
-	struct list_head *lsthead = &file->f_ep_links;
-	struct eventpoll *ep;
-	struct epitem *epi;
+	int nwait;
+	struct list_head *lsthead = &epi->pwqlist;
+	struct eppoll_entry *pwq;
 
-	/*
-	 * We don't want to get "file->f_ep_lock" because it is not
-	 * necessary. It is not necessary because we're in the "struct file"
-	 * cleanup path, and this means that noone is using this file anymore.
-	 * The only hit might come from ep_free() but by holding the semaphore
-	 * will correctly serialize the operation. We do need to acquire
-	 * "ep->sem" after "epmutex" because ep_remove() requires it when called
-	 * from anywhere but ep_free().
-	 */
-	mutex_lock(&epmutex);
+	/* This is called without locks, so we need the atomic exchange */
+	nwait = xchg(&epi->nwait, 0);
 
-	while (!list_empty(lsthead)) {
-		epi = list_entry(lsthead->next, struct epitem, fllink);
+	if (nwait) {
+		while (!list_empty(lsthead)) {
+			pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
 
-		ep = epi->ep;
-		ep_list_del(&epi->fllink);
-		down_write(&ep->sem);
-		ep_remove(ep, epi);
-		up_write(&ep->sem);
+			list_del_init(&pwq->llink);
+			remove_wait_queue(pwq->whead, &pwq->wait);
+			kmem_cache_free(pwq_cache, pwq);
+		}
 	}
-
-	mutex_unlock(&epmutex);
 }
 
-
 /*
- * It opens an eventpoll file descriptor by suggesting a storage of "size"
- * file descriptors. The size parameter is just an hint about how to size
- * data structures. It won't prevent the user to store more than "size"
- * file descriptors inside the epoll interface. It is the kernel part of
- * the userspace epoll_create(2).
+ * Removes a "struct epitem" from the eventpoll RB tree and deallocates
+ * all the associated resources. Must be called with "mtx" held.
  */
-asmlinkage long sys_epoll_create(int size)
+static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
-	int error, fd = -1;
-	struct eventpoll *ep;
-	struct inode *inode;
-	struct file *file;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-		     current, size));
+	unsigned long flags;
+	struct file *file = epi->ffd.file;
 
 	/*
-	 * Sanity check on the size parameter, and create the internal data
-	 * structure ( "struct eventpoll" ).
+	 * Removes poll wait queue hooks. We _have_ to do this without holding
+	 * the "ep->lock" otherwise a deadlock might occur. This because of the
+	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
+	 * queue head lock when unregistering the wait queue. The wakeup callback
+	 * will run by holding the wait queue head lock and will call our callback
+	 * that will try to get "ep->lock".
 	 */
-	error = -EINVAL;
-	if (size <= 0 || (error = ep_alloc(&ep)) != 0)
-		goto eexit_1;
+	ep_unregister_pollwait(ep, epi);
 
-	/*
-	 * Creates all the items needed to setup an eventpoll file. That is,
-	 * a file structure, and inode and a free file descriptor.
-	 */
-	error = ep_getfd(&fd, &inode, &file, ep);
-	if (error)
-		goto eexit_2;
+	/* Remove the current item from the list of epoll hooks */
+	spin_lock(&file->f_ep_lock);
+	if (ep_is_linked(&epi->fllink))
+		list_del_init(&epi->fllink);
+	spin_unlock(&file->f_ep_lock);
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, size, fd));
+	if (ep_rb_linked(&epi->rbn))
+		ep_rb_erase(&epi->rbn, &ep->rbr);
 
-	return fd;
+	spin_lock_irqsave(&ep->lock, flags);
+	if (ep_is_linked(&epi->rdllink))
+		list_del_init(&epi->rdllink);
+	spin_unlock_irqrestore(&ep->lock, flags);
 
-eexit_2:
-	ep_free(ep);
-	kfree(ep);
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, size, error));
-	return error;
-}
+	/* At this point it is safe to free the eventpoll item */
+	kmem_cache_free(epi_cache, epi);
 
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
+		     current, ep, file));
 
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set.  It represents
- * the kernel part of the user space epoll_ctl(2).
- */
-asmlinkage long
-sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
+	return 0;
+}
+
+static void ep_free(struct eventpoll *ep)
 {
-	int error;
-	struct file *file, *tfile;
-	struct eventpoll *ep;
+	struct rb_node *rbp;
 	struct epitem *epi;
-	struct epoll_event epds;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
-		     current, epfd, op, fd, event));
-
-	error = -EFAULT;
-	if (ep_op_hash_event(op) &&
-	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
-		goto eexit_1;
-
-	/* Get the "struct file *" for the eventpoll file */
-	error = -EBADF;
-	file = fget(epfd);
-	if (!file)
-		goto eexit_1;
-
-	/* Get the "struct file *" for the target file */
-	tfile = fget(fd);
-	if (!tfile)
-		goto eexit_2;
-
-	/* The target file descriptor must support poll */
-	error = -EPERM;
-	if (!tfile->f_op || !tfile->f_op->poll)
-		goto eexit_3;
+	/* We need to release all tasks waiting for these file */
+	if (waitqueue_active(&ep->poll_wait))
+		ep_poll_safewake(&psw, &ep->poll_wait);
 
 	/*
-	 * We have to check that the file structure underneath the file descriptor
-	 * the user passed to us _is_ an eventpoll file. And also we do not permit
-	 * adding an epoll file descriptor inside itself.
+	 * We need to lock this because we could be hit by
+	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
+	 * We do not need to hold "ep->mtx" here because the epoll file
+	 * is on the way to be removed and no one has references to it
+	 * anymore. The only hit might come from eventpoll_release_file() but
+	 * holding "epmutex" is sufficent here.
 	 */
-	error = -EINVAL;
-	if (file == tfile || !is_file_epoll(file))
-		goto eexit_3;
+	mutex_lock(&epmutex);
 
 	/*
-	 * At this point it is safe to assume that the "private_data" contains
-	 * our own data structure.
+	 * Walks through the whole tree by unregistering poll callbacks.
 	 */
-	ep = file->private_data;
-
-	down_write(&ep->sem);
-
-	/* Try to lookup the file inside our hash table */
-	epi = ep_find(ep, tfile, fd);
-
-	error = -EINVAL;
-	switch (op) {
-	case EPOLL_CTL_ADD:
-		if (!epi) {
-			epds.events |= POLLERR | POLLHUP;
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
 
-			error = ep_insert(ep, &epds, tfile, fd);
-		} else
-			error = -EEXIST;
-		break;
-	case EPOLL_CTL_DEL:
-		if (epi)
-			error = ep_remove(ep, epi);
-		else
-			error = -ENOENT;
-		break;
-	case EPOLL_CTL_MOD:
-		if (epi) {
-			epds.events |= POLLERR | POLLHUP;
-			error = ep_modify(ep, epi, &epds);
-		} else
-			error = -ENOENT;
-		break;
+		ep_unregister_pollwait(ep, epi);
 	}
 
 	/*
-	 * The function ep_find() increments the usage count of the structure
-	 * so, if this is not NULL, we need to release it.
+	 * Walks through the whole tree by freeing each "struct epitem". At this
+	 * point we are sure no poll callbacks will be lingering around, and also by
+	 * holding "epmutex" we can be sure that no file cleanup code will hit
+	 * us during this operation. So we can avoid the lock on "ep->lock".
 	 */
-	if (epi)
-		ep_release_epitem(epi);
-
-	up_write(&ep->sem);
-
-eexit_3:
-	fput(tfile);
-eexit_2:
-	fput(file);
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
-		     current, epfd, op, fd, event, error));
+	while ((rbp = rb_first(&ep->rbr)) != 0) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		ep_remove(ep, epi);
+	}
 
-	return error;
+	mutex_unlock(&epmutex);
+	mutex_destroy(&ep->mtx);
+	kfree(ep);
 }
 
-
-/*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_wait(2).
- */
-asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
-			       int maxevents, int timeout)
+static int ep_eventpoll_release(struct inode *inode, struct file *file)
 {
-	int error;
-	struct file *file;
-	struct eventpoll *ep;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
-		     current, epfd, events, maxevents, timeout));
-
-	/* The maximum number of event must be greater than zero */
-	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
-		return -EINVAL;
-
-	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
-		error = -EFAULT;
-		goto eexit_1;
-	}
+	struct eventpoll *ep = file->private_data;
 
-	/* Get the "struct file *" for the eventpoll file */
-	error = -EBADF;
-	file = fget(epfd);
-	if (!file)
-		goto eexit_1;
+	if (ep)
+		ep_free(ep);
 
-	/*
-	 * We have to check that the file structure underneath the fd
-	 * the user passed to us _is_ an eventpoll file.
-	 */
-	error = -EINVAL;
-	if (!is_file_epoll(file))
-		goto eexit_2;
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
+	return 0;
+}
 
-	/*
-	 * At this point it is safe to assume that the "private_data" contains
-	 * our own data structure.
-	 */
-	ep = file->private_data;
+static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
+{
+	unsigned int pollflags = 0;
+	unsigned long flags;
+	struct eventpoll *ep = file->private_data;
 
-	/* Time to fish for events ... */
-	error = ep_poll(ep, events, maxevents, timeout);
+	/* Insert inside our poll wait queue */
+	poll_wait(file, &ep->poll_wait, wait);
 
-eexit_2:
-	fput(file);
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
-		     current, epfd, events, maxevents, timeout, error));
+	/* Check our condition */
+	spin_lock_irqsave(&ep->lock, flags);
+	if (!list_empty(&ep->rdllist))
+		pollflags = POLLIN | POLLRDNORM;
+	spin_unlock_irqrestore(&ep->lock, flags);
 
-	return error;
+	return pollflags;
 }
 
+/* File callbacks that implement the eventpoll file behaviour */
+static const struct file_operations eventpoll_fops = {
+	.release	= ep_eventpoll_release,
+	.poll		= ep_eventpoll_poll
+};
 
-#ifdef TIF_RESTORE_SIGMASK
+/* Fast test to see if the file is an evenpoll file */
+static inline int is_file_epoll(struct file *f)
+{
+	return f->f_op == &eventpoll_fops;
+}
 
 /*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_pwait(2).
+ * This is called from eventpoll_release() to unlink files from the eventpoll
+ * interface. We need to have this facility to cleanup correctly files that are
+ * closed without being removed from the eventpoll interface.
  */
-asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
-		int maxevents, int timeout, const sigset_t __user *sigmask,
-		size_t sigsetsize)
+void eventpoll_release_file(struct file *file)
 {
-	int error;
-	sigset_t ksigmask, sigsaved;
-
-	/*
-	 * If the caller wants a certain signal mask to be set during the wait,
-	 * we apply it here.
-	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
-
-	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+	struct list_head *lsthead = &file->f_ep_links;
+	struct eventpoll *ep;
+	struct epitem *epi;
 
 	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
+	 * We don't want to get "file->f_ep_lock" because it is not
+	 * necessary. It is not necessary because we're in the "struct file"
+	 * cleanup path, and this means that noone is using this file anymore.
+	 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
+	 * point, the file counter already went to zero and fget() would fail.
+	 * The only hit might come from ep_free() but by holding the mutex
+	 * will correctly serialize the operation. We do need to acquire
+	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
+	 * from anywhere but ep_free().
 	 */
-	if (sigmask) {
-		if (error == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-				sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
-		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
-
-	return error;
-}
-
-#endif /* #ifdef TIF_RESTORE_SIGMASK */
-
-
-/*
- * Creates the file descriptor to be used by the epoll interface.
- */
-static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
-		    struct eventpoll *ep)
-{
-	struct qstr this;
-	char name[32];
-	struct dentry *dentry;
-	struct inode *inode;
-	struct file *file;
-	int error, fd;
+	mutex_lock(&epmutex);
 
-	/* Get an ready to use file */
-	error = -ENFILE;
-	file = get_empty_filp();
-	if (!file)
-		goto eexit_1;
+	while (!list_empty(lsthead)) {
+		epi = list_first_entry(lsthead, struct epitem, fllink);
 
-	/* Allocates an inode from the eventpoll file system */
-	inode = ep_eventpoll_inode();
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		goto eexit_2;
+		ep = epi->ep;
+		list_del_init(&epi->fllink);
+		mutex_lock(&ep->mtx);
+		ep_remove(ep, epi);
+		mutex_unlock(&ep->mtx);
 	}
 
-	/* Allocates a free descriptor to plug the file onto */
-	error = get_unused_fd();
-	if (error < 0)
-		goto eexit_3;
-	fd = error;
-
-	/*
-	 * Link the inode to a directory entry by creating a unique name
-	 * using the inode number.
-	 */
-	error = -ENOMEM;
-	sprintf(name, "[%lu]", inode->i_ino);
-	this.name = name;
-	this.len = strlen(name);
-	this.hash = inode->i_ino;
-	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
-	if (!dentry)
-		goto eexit_4;
-	dentry->d_op = &eventpollfs_dentry_operations;
-	d_add(dentry, inode);
-	file->f_path.mnt = mntget(eventpoll_mnt);
-	file->f_path.dentry = dentry;
-	file->f_mapping = inode->i_mapping;
-
-	file->f_pos = 0;
-	file->f_flags = O_RDONLY;
-	file->f_op = &eventpoll_fops;
-	file->f_mode = FMODE_READ;
-	file->f_version = 0;
-	file->private_data = ep;
-
-	/* Install the new setup file into the allocated fd. */
-	fd_install(fd, file);
-
-	*efd = fd;
-	*einode = inode;
-	*efile = file;
-	return 0;
-
-eexit_4:
-	put_unused_fd(fd);
-eexit_3:
-	iput(inode);
-eexit_2:
-	put_filp(file);
-eexit_1:
-	return error;
+	mutex_unlock(&epmutex);
 }
 
-
 static int ep_alloc(struct eventpoll **pep)
 {
 	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
@@ -832,12 +558,13 @@ static int ep_alloc(struct eventpoll **pep)
 	if (!ep)
 		return -ENOMEM;
 
-	rwlock_init(&ep->lock);
-	init_rwsem(&ep->sem);
+	spin_lock_init(&ep->lock);
+	mutex_init(&ep->mtx);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT;
+	ep->ovflist = EP_UNACTIVE_PTR;
 
 	*pep = ep;
 
@@ -846,65 +573,19 @@ static int ep_alloc(struct eventpoll **pep)
 	return 0;
 }
 
-
-static void ep_free(struct eventpoll *ep)
-{
-	struct rb_node *rbp;
-	struct epitem *epi;
-
-	/* We need to release all tasks waiting for these file */
-	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(&psw, &ep->poll_wait);
-
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
-	 * We do not need to hold "ep->sem" here because the epoll file
-	 * is on the way to be removed and no one has references to it
-	 * anymore. The only hit might come from eventpoll_release_file() but
-	 * holding "epmutex" is sufficent here.
-	 */
-	mutex_lock(&epmutex);
-
-	/*
-	 * Walks through the whole tree by unregistering poll callbacks.
-	 */
-	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
-		epi = rb_entry(rbp, struct epitem, rbn);
-
-		ep_unregister_pollwait(ep, epi);
-	}
-
-	/*
-	 * Walks through the whole hash by freeing each "struct epitem". At this
-	 * point we are sure no poll callbacks will be lingering around, and also by
-	 * write-holding "sem" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->lock".
-	 */
-	while ((rbp = rb_first(&ep->rbr)) != 0) {
-		epi = rb_entry(rbp, struct epitem, rbn);
-		ep_remove(ep, epi);
-	}
-
-	mutex_unlock(&epmutex);
-}
-
-
 /*
- * Search the file inside the eventpoll hash. It add usage count to
- * the returned item, so the caller must call ep_release_epitem()
- * after finished using the "struct epitem".
+ * Search the file inside the eventpoll tree. The RB tree operations
+ * are protected by the "mtx" mutex, and ep_find() must be called with
+ * "mtx" held.
  */
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 {
 	int kcmp;
-	unsigned long flags;
 	struct rb_node *rbp;
 	struct epitem *epi, *epir = NULL;
 	struct epoll_filefd ffd;
 
 	ep_set_ffd(&ffd, file, fd);
-	read_lock_irqsave(&ep->lock, flags);
 	for (rbp = ep->rbr.rb_node; rbp; ) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
@@ -913,12 +594,10 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 		else if (kcmp < 0)
 			rbp = rbp->rb_left;
 		else {
-			ep_use_epitem(epi);
 			epir = epi;
 			break;
 		}
 	}
-	read_unlock_irqrestore(&ep->lock, flags);
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
 		     current, file, epir));
@@ -926,30 +605,72 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
-
 /*
- * Increment the usage count of the "struct epitem" making it sure
- * that the user will have a valid pointer to reference.
+ * This is the callback that is passed to the wait queue wakeup
+ * machanism. It is called by the stored file descriptors when they
+ * have events to report.
  */
-static void ep_use_epitem(struct epitem *epi)
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
+	int pwake = 0;
+	unsigned long flags;
+	struct epitem *epi = ep_item_from_wait(wait);
+	struct eventpoll *ep = epi->ep;
 
-	atomic_inc(&epi->usecnt);
-}
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
+		     current, epi->ffd.file, epi, ep));
 
+	spin_lock_irqsave(&ep->lock, flags);
 
-/*
- * Decrement ( release ) the usage count by signaling that the user
- * has finished using the structure. It might lead to freeing the
- * structure itself if the count goes to zero.
- */
-static void ep_release_epitem(struct epitem *epi)
-{
+	/*
+	 * If the event mask does not contain any poll(2) event, we consider the
+	 * descriptor to be disabled. This condition is likely the effect of the
+	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
+	 * until the next EPOLL_CTL_MOD will be issued.
+	 */
+	if (!(epi->event.events & ~EP_PRIVATE_BITS))
+		goto out_unlock;
 
-	if (atomic_dec_and_test(&epi->usecnt))
-		kmem_cache_free(epi_cache, epi);
-}
+	/*
+	 * If we are trasfering events to userspace, we can hold no locks
+	 * (because we're accessing user memory, and because of linux f_op->poll()
+	 * semantics). All the events that happens during that period of time are
+	 * chained in ep->ovflist and requeued later on.
+	 */
+	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+		if (epi->next == EP_UNACTIVE_PTR) {
+			epi->next = ep->ovflist;
+			ep->ovflist = epi;
+		}
+		goto out_unlock;
+	}
 
+	/* If this file is already in the ready list we exit soon */
+	if (ep_is_linked(&epi->rdllink))
+		goto is_linked;
+
+	list_add_tail(&epi->rdllink, &ep->rdllist);
+
+is_linked:
+	/*
+	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
+	 * wait list.
+	 */
+	if (waitqueue_active(&ep->wq))
+		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+				 TASK_INTERRUPTIBLE);
+	if (waitqueue_active(&ep->poll_wait))
+		pwake++;
+
+out_unlock:
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&psw, &ep->poll_wait);
+
+	return 1;
+}
 
 /*
  * This is the callback that is used to add our wait queue to the
@@ -974,7 +695,6 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 	}
 }
 
-
 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 {
 	int kcmp;
@@ -994,7 +714,9 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
-
+/*
+ * Must be called with "mtx" held.
+ */
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd)
 {
@@ -1005,19 +727,18 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	error = -ENOMEM;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
-		goto eexit_1;
+		goto error_return;
 
 	/* Item initialization follow here ... */
 	ep_rb_initnode(&epi->rbn);
 	INIT_LIST_HEAD(&epi->rdllink);
 	INIT_LIST_HEAD(&epi->fllink);
-	INIT_LIST_HEAD(&epi->txlink);
 	INIT_LIST_HEAD(&epi->pwqlist);
 	epi->ep = ep;
 	ep_set_ffd(&epi->ffd, tfile, fd);
 	epi->event = *event;
-	atomic_set(&epi->usecnt, 1);
 	epi->nwait = 0;
+	epi->next = EP_UNACTIVE_PTR;
 
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
@@ -1026,7 +747,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/*
 	 * Attach the item to the poll hooks and get current event bits.
 	 * We can safely use the file* here because its usage count has
-	 * been increased by the caller of this function.
+	 * been increased by the caller of this function. Note that after
+	 * this operation completes, the poll callback can start hitting
+	 * the new item.
 	 */
 	revents = tfile->f_op->poll(tfile, &epq.pt);
 
@@ -1036,19 +759,22 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * high memory pressure.
 	 */
 	if (epi->nwait < 0)
-		goto eexit_2;
+		goto error_unregister;
 
 	/* Add the current item to the list of active epoll hook for this file */
 	spin_lock(&tfile->f_ep_lock);
 	list_add_tail(&epi->fllink, &tfile->f_ep_links);
 	spin_unlock(&tfile->f_ep_lock);
 
-	/* We have to drop the new item inside our item list to keep track of it */
-	write_lock_irqsave(&ep->lock, flags);
-
-	/* Add the current item to the rb-tree */
+	/*
+	 * Add the current item to the RB tree. All RB tree operations are
+	 * protected by "mtx", and ep_insert() is called with "mtx" held.
+	 */
 	ep_rbtree_insert(ep, epi);
 
+	/* We have to drop the new item inside our item list to keep track of it */
+	spin_lock_irqsave(&ep->lock, flags);
+
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1060,7 +786,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 			pwake++;
 	}
 
-	write_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1071,27 +797,28 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	return 0;
 
-eexit_2:
+error_unregister:
 	ep_unregister_pollwait(ep, epi);
 
 	/*
 	 * We need to do this because an event could have been arrived on some
-	 * allocated wait queue.
+	 * allocated wait queue. Note that we don't care about the ep->ovflist
+	 * list, since that is used/cleaned only inside a section bound by "mtx".
+	 * And ep_insert() is called with "mtx" held.
 	 */
-	write_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->lock, flags);
 	if (ep_is_linked(&epi->rdllink))
-		ep_list_del(&epi->rdllink);
-	write_unlock_irqrestore(&ep->lock, flags);
+		list_del_init(&epi->rdllink);
+	spin_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
-eexit_1:
+error_return:
 	return error;
 }
 
-
 /*
  * Modify the interest event mask by dropping an event if the new mask
- * has a match in the current file status.
+ * has a match in the current file status. Must be called with "mtx" held.
  */
 static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
 {
@@ -1113,36 +840,28 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 */
 	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
-	write_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->lock, flags);
 
 	/* Copy the data member from inside the lock */
 	epi->event.data = event->data;
 
 	/*
-	 * If the item is not linked to the hash it means that it's on its
-	 * way toward the removal. Do nothing in this case.
+	 * If the item is "hot" and it is not registered inside the ready
+	 * list, push it inside.
 	 */
-	if (ep_rb_linked(&epi->rbn)) {
-		/*
-		 * If the item is "hot" and it is not registered inside the ready
-		 * list, push it inside. If the item is not "hot" and it is currently
-		 * registered inside the ready list, unlink it.
-		 */
-		if (revents & event->events) {
-			if (!ep_is_linked(&epi->rdllink)) {
-				list_add_tail(&epi->rdllink, &ep->rdllist);
-
-				/* Notify waiting tasks that events are available */
-				if (waitqueue_active(&ep->wq))
-					__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-							 TASK_INTERRUPTIBLE);
-				if (waitqueue_active(&ep->poll_wait))
-					pwake++;
-			}
+	if (revents & event->events) {
+		if (!ep_is_linked(&epi->rdllink)) {
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+
+			/* Notify waiting tasks that events are available */
+			if (waitqueue_active(&ep->wq))
+				__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+						 TASK_INTERRUPTIBLE);
+			if (waitqueue_active(&ep->poll_wait))
+				pwake++;
 		}
 	}
-
-	write_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1151,350 +870,113 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	return 0;
 }
 
-
-/*
- * This function unregister poll callbacks from the associated file descriptor.
- * Since this must be called without holding "ep->lock" the atomic exchange trick
- * will protect us from multiple unregister.
- */
-static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
-{
-	int nwait;
-	struct list_head *lsthead = &epi->pwqlist;
-	struct eppoll_entry *pwq;
-
-	/* This is called without locks, so we need the atomic exchange */
-	nwait = xchg(&epi->nwait, 0);
-
-	if (nwait) {
-		while (!list_empty(lsthead)) {
-			pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
-
-			ep_list_del(&pwq->llink);
-			remove_wait_queue(pwq->whead, &pwq->wait);
-			kmem_cache_free(pwq_cache, pwq);
-		}
-	}
-}
-
-
-/*
- * Unlink the "struct epitem" from all places it might have been hooked up.
- * This function must be called with write IRQ lock on "ep->lock".
- */
-static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
+static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
+			  int maxevents)
 {
-	int error;
-
-	/*
-	 * It can happen that this one is called for an item already unlinked.
-	 * The check protect us from doing a double unlink ( crash ).
-	 */
-	error = -ENOENT;
-	if (!ep_rb_linked(&epi->rbn))
-		goto eexit_1;
-
-	/*
-	 * Clear the event mask for the unlinked item. This will avoid item
-	 * notifications to be sent after the unlink operation from inside
-	 * the kernel->userspace event transfer loop.
-	 */
-	epi->event.events = 0;
-
-	/*
-	 * At this point is safe to do the job, unlink the item from our rb-tree.
-	 * This operation togheter with the above check closes the door to
-	 * double unlinks.
-	 */
-	ep_rb_erase(&epi->rbn, &ep->rbr);
-
-	/*
-	 * If the item we are going to remove is inside the ready file descriptors
-	 * we want to remove it from this list to avoid stale events.
-	 */
-	if (ep_is_linked(&epi->rdllink))
-		ep_list_del(&epi->rdllink);
-
-	error = 0;
-eexit_1:
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
-		     current, ep, epi->ffd.file, error));
-
-	return error;
-}
-
-
-/*
- * Removes a "struct epitem" from the eventpoll hash and deallocates
- * all the associated resources.
- */
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
-{
-	int error;
-	unsigned long flags;
-	struct file *file = epi->ffd.file;
-
-	/*
-	 * Removes poll wait queue hooks. We _have_ to do this without holding
-	 * the "ep->lock" otherwise a deadlock might occur. This because of the
-	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
-	 * queue head lock when unregistering the wait queue. The wakeup callback
-	 * will run by holding the wait queue head lock and will call our callback
-	 * that will try to get "ep->lock".
-	 */
-	ep_unregister_pollwait(ep, epi);
-
-	/* Remove the current item from the list of epoll hooks */
-	spin_lock(&file->f_ep_lock);
-	if (ep_is_linked(&epi->fllink))
-		ep_list_del(&epi->fllink);
-	spin_unlock(&file->f_ep_lock);
-
-	/* We need to acquire the write IRQ lock before calling ep_unlink() */
-	write_lock_irqsave(&ep->lock, flags);
-
-	/* Really unlink the item from the hash */
-	error = ep_unlink(ep, epi);
-
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	if (error)
-		goto eexit_1;
-
-	/* At this point it is safe to free the eventpoll item */
-	ep_release_epitem(epi);
-
-	error = 0;
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
-		     current, ep, file, error));
-
-	return error;
-}
-
-
-/*
- * This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
- * have events to report.
- */
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
-	int pwake = 0;
+	int eventcnt, error = -EFAULT, pwake = 0;
+	unsigned int revents;
 	unsigned long flags;
-	struct epitem *epi = ep_item_from_wait(wait);
-	struct eventpoll *ep = epi->ep;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-		     current, epi->ffd.file, epi, ep));
+	struct epitem *epi, *nepi;
+	struct list_head txlist;
 
-	write_lock_irqsave(&ep->lock, flags);
+	INIT_LIST_HEAD(&txlist);
 
 	/*
-	 * If the event mask does not contain any poll(2) event, we consider the
-	 * descriptor to be disabled. This condition is likely the effect of the
-	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
-	 * until the next EPOLL_CTL_MOD will be issued.
+	 * We need to lock this because we could be hit by
+	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
 	 */
-	if (!(epi->event.events & ~EP_PRIVATE_BITS))
-		goto is_disabled;
+	mutex_lock(&ep->mtx);
 
-	/* If this file is already in the ready list we exit soon */
-	if (ep_is_linked(&epi->rdllink))
-		goto is_linked;
-
-	list_add_tail(&epi->rdllink, &ep->rdllist);
-
-is_linked:
 	/*
-	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
-	 * wait list.
+	 * Steal the ready list, and re-init the original one to the
+	 * empty list. Also, set ep->ovflist to NULL so that events
+	 * happening while looping w/out locks, are not lost. We cannot
+	 * have the poll callback to queue directly on ep->rdllist,
+	 * because we are doing it in the loop below, in a lockless way.
 	 */
-	if (waitqueue_active(&ep->wq))
-		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-				 TASK_INTERRUPTIBLE);
-	if (waitqueue_active(&ep->poll_wait))
-		pwake++;
-
-is_disabled:
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	/* We have to call this outside the lock */
-	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
-
-	return 1;
-}
-
-
-static int ep_eventpoll_close(struct inode *inode, struct file *file)
-{
-	struct eventpoll *ep = file->private_data;
-
-	if (ep) {
-		ep_free(ep);
-		kfree(ep);
-	}
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
-	return 0;
-}
-
-
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
-{
-	unsigned int pollflags = 0;
-	unsigned long flags;
-	struct eventpoll *ep = file->private_data;
-
-	/* Insert inside our poll wait queue */
-	poll_wait(file, &ep->poll_wait, wait);
-
-	/* Check our condition */
-	read_lock_irqsave(&ep->lock, flags);
-	if (!list_empty(&ep->rdllist))
-		pollflags = POLLIN | POLLRDNORM;
-	read_unlock_irqrestore(&ep->lock, flags);
-
-	return pollflags;
-}
-
-
-/*
- * Since we have to release the lock during the __copy_to_user() operation and
- * during the f_op->poll() call, we try to collect the maximum number of items
- * by reducing the irqlock/irqunlock switching rate.
- */
-static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
-{
-	int nepi;
-	unsigned long flags;
-	struct list_head *lsthead = &ep->rdllist, *lnk;
-	struct epitem *epi;
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
-		epi = list_entry(lnk, struct epitem, rdllink);
-
-		lnk = lnk->next;
-
-		/* If this file is already in the ready list we exit soon */
-		if (!ep_is_linked(&epi->txlink)) {
-			/*
-			 * This is initialized in this way so that the default
-			 * behaviour of the reinjecting code will be to push back
-			 * the item inside the ready list.
-			 */
-			epi->revents = epi->event.events;
-
-			/* Link the ready item into the transfer list */
-			list_add(&epi->txlink, txlist);
-			nepi++;
-
-			/*
-			 * Unlink the item from the ready list.
-			 */
-			ep_list_del(&epi->rdllink);
-		}
-	}
-
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	return nepi;
-}
-
-
-/*
- * This function is called without holding the "ep->lock" since the call to
- * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
- * because of the way poll() is traditionally implemented in Linux.
- */
-static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events)
-{
-	int eventcnt = 0;
-	unsigned int revents;
-	struct list_head *lnk;
-	struct epitem *epi;
+	spin_lock_irqsave(&ep->lock, flags);
+	list_splice(&ep->rdllist, &txlist);
+	INIT_LIST_HEAD(&ep->rdllist);
+	ep->ovflist = NULL;
+	spin_unlock_irqrestore(&ep->lock, flags);
 
 	/*
 	 * We can loop without lock because this is a task private list.
-	 * The test done during the collection loop will guarantee us that
-	 * another task will not try to collect this file. Also, items
-	 * cannot vanish during the loop because we are holding "sem".
+	 * We just splice'd out the ep->rdllist in ep_collect_ready_items().
+	 * Items cannot vanish during the loop because we are holding "mtx".
 	 */
-	list_for_each(lnk, txlist) {
-		epi = list_entry(lnk, struct epitem, txlink);
+	for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
+		epi = list_first_entry(&txlist, struct epitem, rdllink);
+
+		list_del_init(&epi->rdllink);
 
 		/*
 		 * Get the ready file event set. We can safely use the file
-		 * because we are holding the "sem" in read and this will
-		 * guarantee that both the file and the item will not vanish.
+		 * because we are holding the "mtx" and this will guarantee
+		 * that both the file and the item will not vanish.
 		 */
 		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+		revents &= epi->event.events;
 
 		/*
-		 * Set the return event set for the current file descriptor.
-		 * Note that only the task task was successfully able to link
-		 * the item to its "txlist" will write this field.
+		 * Is the event mask intersect the caller-requested one,
+		 * deliver the event to userspace. Again, we are holding
+		 * "mtx", so no operations coming from userspace can change
+		 * the item.
 		 */
-		epi->revents = revents & epi->event.events;
-
-		if (epi->revents) {
-			if (__put_user(epi->revents,
+		if (revents) {
+			if (__put_user(revents,
 				       &events[eventcnt].events) ||
 			    __put_user(epi->event.data,
 				       &events[eventcnt].data))
-				return -EFAULT;
+				goto errxit;
 			if (epi->event.events & EPOLLONESHOT)
 				epi->event.events &= EP_PRIVATE_BITS;
 			eventcnt++;
 		}
+		/*
+		 * At this point, noone can insert into ep->rdllist besides
+		 * us. The epoll_ctl() callers are locked out by us holding
+		 * "mtx" and the poll callback will queue them in ep->ovflist.
+		 */
+		if (!(epi->event.events & EPOLLET) &&
+		    (revents & epi->event.events))
+			list_add_tail(&epi->rdllink, &ep->rdllist);
 	}
-	return eventcnt;
-}
-
-
-/*
- * Walk through the transfer list we collected with ep_collect_ready_items()
- * and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
- * not already linked, links it to the ready list. Same as above, we are holding
- * "sem" so items cannot vanish underneath our nose.
- */
-static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
-{
-	int ricnt = 0, pwake = 0;
-	unsigned long flags;
-	struct epitem *epi;
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	while (!list_empty(txlist)) {
-		epi = list_entry(txlist->next, struct epitem, txlink);
+	error = 0;
 
-		/* Unlink the current item from the transfer list */
-		ep_list_del(&epi->txlink);
+errxit:
 
-		/*
-		 * If the item is no more linked to the interest set, we don't
-		 * have to push it inside the ready list because the following
-		 * ep_release_epitem() is going to drop it. Also, if the current
-		 * item is set to have an Edge Triggered behaviour, we don't have
-		 * to push it back either.
-		 */
-		if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) &&
-		    (epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) {
+	spin_lock_irqsave(&ep->lock, flags);
+	/*
+	 * During the time we spent in the loop above, some other events
+	 * might have been queued by the poll callback. We re-insert them
+	 * here (in case they are not already queued, or they're one-shot).
+	 */
+	for (nepi = ep->ovflist; (epi = nepi) != NULL;
+	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+		if (!ep_is_linked(&epi->rdllink) &&
+		    (epi->event.events & ~EP_PRIVATE_BITS))
 			list_add_tail(&epi->rdllink, &ep->rdllist);
-			ricnt++;
-		}
 	}
+	/*
+	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+	 * releasing the lock, events will be queued in the normal way inside
+	 * ep->rdllist.
+	 */
+	ep->ovflist = EP_UNACTIVE_PTR;
 
-	if (ricnt) {
+	/*
+	 * In case of error in the event-send loop, or in case the number of
+	 * ready events exceeds the userspace limit, we need to splice the
+	 * "txlist" back inside ep->rdllist.
+	 */
+	list_splice(&txlist, &ep->rdllist);
+
+	if (!list_empty(&ep->rdllist)) {
 		/*
-		 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
-		 * wait list.
+		 * Wake up (if active) both the eventpoll wait list and the ->poll()
+		 * wait list (delayed after we release the lock).
 		 */
 		if (waitqueue_active(&ep->wq))
 			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
@@ -1502,47 +984,17 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
+	spin_unlock_irqrestore(&ep->lock, flags);
 
-	write_unlock_irqrestore(&ep->lock, flags);
+	mutex_unlock(&ep->mtx);
 
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(&psw, &ep->poll_wait);
-}
 
-
-/*
- * Perform the transfer of events to user space.
- */
-static int ep_events_transfer(struct eventpoll *ep,
-			      struct epoll_event __user *events, int maxevents)
-{
-	int eventcnt = 0;
-	struct list_head txlist;
-
-	INIT_LIST_HEAD(&txlist);
-
-	/*
-	 * We need to lock this because we could be hit by
-	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
-	 */
-	down_read(&ep->sem);
-
-	/* Collect/extract ready items */
-	if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
-		/* Build result set in userspace */
-		eventcnt = ep_send_events(ep, &txlist, events);
-
-		/* Reinject ready items into the ready list */
-		ep_reinject_items(ep, &txlist);
-	}
-
-	up_read(&ep->sem);
-
-	return eventcnt;
+	return eventcnt == 0 ? error: eventcnt;
 }
 
-
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
@@ -1560,7 +1012,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
 
 retry:
-	write_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->lock, flags);
 
 	res = 0;
 	if (list_empty(&ep->rdllist)) {
@@ -1570,6 +1022,7 @@ retry:
 		 * ep_poll_callback() when events will become available.
 		 */
 		init_waitqueue_entry(&wait, current);
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue(&ep->wq, &wait);
 
 		for (;;) {
@@ -1586,9 +1039,9 @@ retry:
 				break;
 			}
 
-			write_unlock_irqrestore(&ep->lock, flags);
+			spin_unlock_irqrestore(&ep->lock, flags);
 			jtimeout = schedule_timeout(jtimeout);
-			write_lock_irqsave(&ep->lock, flags);
+			spin_lock_irqsave(&ep->lock, flags);
 		}
 		__remove_wait_queue(&ep->wq, &wait);
 
@@ -1598,7 +1051,7 @@ retry:
 	/* Is it worth to try to dig for events ? */
 	eavail = !list_empty(&ep->rdllist);
 
-	write_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->lock, flags);
 
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
@@ -1606,61 +1059,263 @@ retry:
 	 * more luck.
 	 */
 	if (!res && eavail &&
-	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
+	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
 		goto retry;
 
 	return res;
 }
 
-
-static int eventpollfs_delete_dentry(struct dentry *dentry)
+/*
+ * It opens an eventpoll file descriptor. The "size" parameter is there
+ * for historical reasons, when epoll was using an hash instead of an
+ * RB tree. With the current implementation, the "size" parameter is ignored
+ * (besides sanity checks).
+ */
+asmlinkage long sys_epoll_create(int size)
 {
+	int error, fd = -1;
+	struct eventpoll *ep;
+	struct inode *inode;
+	struct file *file;
 
-	return 1;
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
+		     current, size));
+
+	/*
+	 * Sanity check on the size parameter, and create the internal data
+	 * structure ( "struct eventpoll" ).
+	 */
+	error = -EINVAL;
+	if (size <= 0 || (error = ep_alloc(&ep)) != 0)
+		goto error_return;
+
+	/*
+	 * Creates all the items needed to setup an eventpoll file. That is,
+	 * a file structure, and inode and a free file descriptor.
+	 */
+	error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]",
+				 &eventpoll_fops, ep);
+	if (error)
+		goto error_free;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+		     current, size, fd));
+
+	return fd;
+
+error_free:
+	ep_free(ep);
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+		     current, size, error));
+	return error;
 }
 
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
+			      struct epoll_event __user *event)
+{
+	int error;
+	struct file *file, *tfile;
+	struct eventpoll *ep;
+	struct epitem *epi;
+	struct epoll_event epds;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
+		     current, epfd, op, fd, event));
 
-static struct inode *ep_eventpoll_inode(void)
+	error = -EFAULT;
+	if (ep_op_has_event(op) &&
+	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
+		goto error_return;
+
+	/* Get the "struct file *" for the eventpoll file */
+	error = -EBADF;
+	file = fget(epfd);
+	if (!file)
+		goto error_return;
+
+	/* Get the "struct file *" for the target file */
+	tfile = fget(fd);
+	if (!tfile)
+		goto error_fput;
+
+	/* The target file descriptor must support poll */
+	error = -EPERM;
+	if (!tfile->f_op || !tfile->f_op->poll)
+		goto error_tgt_fput;
+
+	/*
+	 * We have to check that the file structure underneath the file descriptor
+	 * the user passed to us _is_ an eventpoll file. And also we do not permit
+	 * adding an epoll file descriptor inside itself.
+	 */
+	error = -EINVAL;
+	if (file == tfile || !is_file_epoll(file))
+		goto error_tgt_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	mutex_lock(&ep->mtx);
+
+	/*
+	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
+	 * above, we can be sure to be able to use the item looked up by
+	 * ep_find() till we release the mutex.
+	 */
+	epi = ep_find(ep, tfile, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		if (!epi) {
+			epds.events |= POLLERR | POLLHUP;
+
+			error = ep_insert(ep, &epds, tfile, fd);
+		} else
+			error = -EEXIST;
+		break;
+	case EPOLL_CTL_DEL:
+		if (epi)
+			error = ep_remove(ep, epi);
+		else
+			error = -ENOENT;
+		break;
+	case EPOLL_CTL_MOD:
+		if (epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_modify(ep, epi, &epds);
+		} else
+			error = -ENOENT;
+		break;
+	}
+	mutex_unlock(&ep->mtx);
+
+error_tgt_fput:
+	fput(tfile);
+error_fput:
+	fput(file);
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
+		     current, epfd, op, fd, event, error));
+
+	return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
+			       int maxevents, int timeout)
 {
-	int error = -ENOMEM;
-	struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
+	int error;
+	struct file *file;
+	struct eventpoll *ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
+		     current, epfd, events, maxevents, timeout));
+
+	/* The maximum number of event must be greater than zero */
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+		return -EINVAL;
 
-	if (!inode)
-		goto eexit_1;
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
+		error = -EFAULT;
+		goto error_return;
+	}
+
+	/* Get the "struct file *" for the eventpoll file */
+	error = -EBADF;
+	file = fget(epfd);
+	if (!file)
+		goto error_return;
 
-	inode->i_fop = &eventpoll_fops;
+	/*
+	 * We have to check that the file structure underneath the fd
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!is_file_epoll(file))
+		goto error_fput;
 
 	/*
-	 * Mark the inode dirty from the very beginning,
-	 * that way it will never be moved to the dirty
-	 * list because mark_inode_dirty() will think
-	 * that it already _is_ on the dirty list.
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
 	 */
-	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IRUSR | S_IWUSR;
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	return inode;
-
-eexit_1:
-	return ERR_PTR(error);
+	ep = file->private_data;
+
+	/* Time to fish for events ... */
+	error = ep_poll(ep, events, maxevents, timeout);
+
+error_fput:
+	fput(file);
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
+		     current, epfd, events, maxevents, timeout, error));
+
+	return error;
 }
 
+#ifdef TIF_RESTORE_SIGMASK
 
-static int
-eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
-		   const char *dev_name, void *data, struct vfsmount *mnt)
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+		int maxevents, int timeout, const sigset_t __user *sigmask,
+		size_t sigsetsize)
 {
-	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
-			     mnt);
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+			       sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
 }
 
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
 
 static int __init eventpoll_init(void)
 {
-	int error;
-
 	mutex_init(&epmutex);
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
@@ -1676,39 +1331,7 @@ static int __init eventpoll_init(void)
 			sizeof(struct eppoll_entry), 0,
 			EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL);
 
-	/*
-	 * Register the virtual file system that will be the source of inodes
-	 * for the eventpoll files
-	 */
-	error = register_filesystem(&eventpoll_fs_type);
-	if (error)
-		goto epanic;
-
-	/* Mount the above commented virtual file system */
-	eventpoll_mnt = kern_mount(&eventpoll_fs_type);
-	error = PTR_ERR(eventpoll_mnt);
-	if (IS_ERR(eventpoll_mnt))
-		goto epanic;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
-			current));
 	return 0;
-
-epanic:
-	panic("eventpoll_init() failed\n");
 }
+fs_initcall(eventpoll_init);
 
-
-static void __exit eventpoll_exit(void)
-{
-	/* Undo all operations done inside eventpoll_init() */
-	unregister_filesystem(&eventpoll_fs_type);
-	mntput(eventpoll_mnt);
-	kmem_cache_destroy(pwq_cache);
-	kmem_cache_destroy(epi_cache);
-}
-
-module_init(eventpoll_init);
-module_exit(eventpoll_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/fs/exec.c b/fs/exec.c
index 3155e915307a..f20561ff4528 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,6 +50,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
+#include <linux/signalfd.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -59,7 +60,7 @@
 #endif
 
 int core_uses_pid;
-char core_pattern[128] = "core";
+char core_pattern[CORENAME_MAX_SIZE] = "core";
 int suid_dumpable = 0;
 
 EXPORT_SYMBOL(suid_dumpable);
@@ -100,6 +101,7 @@ int unregister_binfmt(struct linux_binfmt * fmt)
 	while (*tmp) {
 		if (fmt == *tmp) {
 			*tmp = fmt->next;
+			fmt->next = NULL;
 			write_unlock(&binfmt_lock);
 			return 0;
 		}
@@ -132,6 +134,9 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (error)
 		goto out;
 
+	error = -EACCES;
+	if (nd.mnt->mnt_flags & MNT_NOEXEC)
+		goto exit;
 	error = -EINVAL;
 	if (!S_ISREG(nd.dentry->d_inode->i_mode))
 		goto exit;
@@ -581,6 +586,13 @@ static int de_thread(struct task_struct *tsk)
 	int count;
 
 	/*
+	 * Tell all the sighand listeners that this sighand has
+	 * been detached. The signalfd_detach() function grabs the
+	 * sighand lock, if signal listeners are present on the sighand.
+	 */
+	signalfd_detach(tsk);
+
+	/*
 	 * If we don't share sighandlers, then we aren't sharing anything
 	 * and we can just re-use it all.
 	 */
@@ -701,7 +713,7 @@ static int de_thread(struct task_struct *tsk)
 		 */
 		detach_pid(tsk, PIDTYPE_PID);
 		tsk->pid = leader->pid;
-		attach_pid(tsk, PIDTYPE_PID,  tsk->pid);
+		attach_pid(tsk, PIDTYPE_PID,  find_pid(tsk->pid));
 		transfer_pid(leader, tsk, PIDTYPE_PGID);
 		transfer_pid(leader, tsk, PIDTYPE_SID);
 		list_replace_rcu(&leader->tasks, &tsk->tasks);
@@ -756,8 +768,7 @@ no_thread_group:
 		spin_unlock(&oldsighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 
-		if (atomic_dec_and_test(&oldsighand->count))
-			kmem_cache_free(sighand_cachep, oldsighand);
+		__cleanup_sighand(oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(tsk));
@@ -982,33 +993,51 @@ void compute_creds(struct linux_binprm *bprm)
 	task_unlock(current);
 	security_bprm_post_apply_creds(bprm);
 }
-
 EXPORT_SYMBOL(compute_creds);
 
+/*
+ * Arguments are '\0' separated strings found at the location bprm->p
+ * points to; chop off the first by relocating brpm->p to right after
+ * the first '\0' encountered.
+ */
 void remove_arg_zero(struct linux_binprm *bprm)
 {
 	if (bprm->argc) {
-		unsigned long offset;
-		char * kaddr;
-		struct page *page;
+		char ch;
+
+		do {
+			unsigned long offset;
+			unsigned long index;
+			char *kaddr;
+			struct page *page;
 
-		offset = bprm->p % PAGE_SIZE;
-		goto inside;
+			offset = bprm->p & ~PAGE_MASK;
+			index = bprm->p >> PAGE_SHIFT;
 
-		while (bprm->p++, *(kaddr+offset++)) {
-			if (offset != PAGE_SIZE)
-				continue;
-			offset = 0;
-			kunmap_atomic(kaddr, KM_USER0);
-inside:
-			page = bprm->page[bprm->p/PAGE_SIZE];
+			page = bprm->page[index];
 			kaddr = kmap_atomic(page, KM_USER0);
-		}
-		kunmap_atomic(kaddr, KM_USER0);
+
+			/* run through page until we reach end or find NUL */
+			do {
+				ch = *(kaddr + offset);
+
+				/* discard that character... */
+				bprm->p++;
+				offset++;
+			} while (offset < PAGE_SIZE && ch != '\0');
+
+			kunmap_atomic(kaddr, KM_USER0);
+
+			/* free the old page */
+			if (offset == PAGE_SIZE) {
+				__free_page(page);
+				bprm->page[index] = NULL;
+			}
+		} while (ch != '\0');
+
 		bprm->argc--;
 	}
 }
-
 EXPORT_SYMBOL(remove_arg_zero);
 
 /*
@@ -1238,8 +1267,6 @@ int set_binfmt(struct linux_binfmt *new)
 
 EXPORT_SYMBOL(set_binfmt);
 
-#define CORENAME_MAX_SIZE 64
-
 /* format_corename will inspect the pattern parameter, and output a
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
@@ -1469,6 +1496,8 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	int flag = 0;
 	int ispipe = 0;
 
+	audit_core_dumps(signr);
+
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 93e77c3d2490..e98f6cd7200c 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -2,7 +2,6 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/namei.h>
 
 struct export_operations export_op_default;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index e89bfc8cf957..2bf49d7ef841 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -23,7 +23,6 @@
 
 #include "ext2.h"
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 typedef struct ext2_dir_entry_2 ext2_dirent;
 
@@ -161,10 +160,7 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			ext2_check_page(page);
 		if (PageError(page))
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e2a0ea50af1d..9fd0ec5ba0d0 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,7 @@ extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
+extern void ext2_get_inode_flags(struct ext2_inode_info *);
 
 /* ioctl.c */
 extern int ext2_ioctl (struct inode *, struct file *, unsigned int,
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index 7806b9e8155b..fc66c93fcb5c 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -23,7 +23,6 @@
  */
 
 #include "ext2.h"
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
 
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index dd4e14c221e0..0079b2cd5314 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1055,6 +1055,25 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
+/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
+void ext2_get_inode_flags(struct ext2_inode_info *ei)
+{
+	unsigned int flags = ei->vfs_inode.i_flags;
+
+	ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
+			EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		ei->i_flags |= EXT2_SYNC_FL;
+	if (flags & S_APPEND)
+		ei->i_flags |= EXT2_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		ei->i_flags |= EXT2_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		ei->i_flags |= EXT2_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		ei->i_flags |= EXT2_DIRSYNC_FL;
+}
+
 void ext2_read_inode (struct inode * inode)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1079,9 +1098,9 @@ void ext2_read_inode (struct inode * inode)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
-	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
@@ -1188,6 +1207,7 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
 	if (ei->i_state & EXT2_STATE_NEW)
 		memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
 
+	ext2_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if (!(test_opt(sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 4b099d310712..e85c48218239 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -27,6 +27,7 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 
 	switch (cmd) {
 	case EXT2_IOC_GETFLAGS:
+		ext2_get_inode_flags(ei);
 		flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT2_IOC_SETFLAGS: {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a046a419d8af..16337bff0272 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -160,14 +160,11 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		rwlock_init(&ei->i_meta_lock);
+	rwlock_init(&ei->i_meta_lock);
 #ifdef CONFIG_EXT2_FS_XATTR
-		init_rwsem(&ei->xattr_sem);
+	init_rwsem(&ei->xattr_sem);
 #endif
-		inode_init_once(&ei->vfs_inode);
-	}
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index a26612798471..eaa23d2d5213 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -6,7 +6,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext2_fs.h>
 #include <linux/security.h>
 #include "xattr.h"
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index f28a6a499c96..83ee149f353d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext2_fs.h>
 #include "xattr.h"
 
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 665adee99b31..852869840f24 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -25,7 +25,6 @@
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a5b150f7e8a2..a6cb6171c3af 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,7 +27,6 @@
 #include <linux/time.h>
 #include <linux/ext3_jbd.h>
 #include <linux/jbd.h>
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -1768,7 +1767,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
 	int err = 0;
-	void *kaddr;
 
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
@@ -1780,10 +1778,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	 */
 	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
 	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, 0, length);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, offset, length, KM_USER0);
 		set_page_dirty(page);
 		goto unlock;
 	}
@@ -1836,11 +1831,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 			goto unlock;
 	}
 
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, length);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-
+	zero_user_page(page, offset, length, KM_USER0);
 	BUFFER_TRACE(bh, "zeroed end of block");
 
 	err = 0;
@@ -2581,6 +2572,25 @@ void ext3_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
+/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
+void ext3_get_inode_flags(struct ext3_inode_info *ei)
+{
+	unsigned int flags = ei->vfs_inode.i_flags;
+
+	ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
+			EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		ei->i_flags |= EXT3_SYNC_FL;
+	if (flags & S_APPEND)
+		ei->i_flags |= EXT3_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		ei->i_flags |= EXT3_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		ei->i_flags |= EXT3_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		ei->i_flags |= EXT3_DIRSYNC_FL;
+}
+
 void ext3_read_inode(struct inode * inode)
 {
 	struct ext3_iloc iloc;
@@ -2608,9 +2618,9 @@ void ext3_read_inode(struct inode * inode)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
-	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
 	ei->i_state = 0;
@@ -2736,6 +2746,7 @@ static int ext3_do_update_inode(handle_t *handle,
 	if (ei->i_state & EXT3_STATE_NEW)
 		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
 
+	ext3_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if(!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 9b8090d94e6c..965006dba6be 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -28,6 +28,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 
 	switch (cmd) {
 	case EXT3_IOC_GETFLAGS:
+		ext3_get_inode_flags(ei);
 		flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT3_IOC_SETFLAGS: {
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 49159f13cc1f..9bb046df827a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,7 +36,6 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
-#include <linux/smp_lock.h>
 
 #include "namei.h"
 #include "xattr.h"
@@ -969,6 +968,7 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
 				  (block<<EXT3_BLOCK_SIZE_BITS(sb))
 					  +((char *)de - bh->b_data))) {
 				brelse (bh);
+				*err = ERR_BAD_DX_DIR;
 				goto errout;
 			}
 			*res_dir = de;
@@ -1134,9 +1134,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	char *data1 = (*bh)->b_data, *data2;
 	unsigned split;
 	struct ext3_dir_entry_2 *de = NULL, *de2;
-	int	err;
+	int	err = 0;
 
-	bh2 = ext3_append (handle, dir, &newblock, error);
+	bh2 = ext3_append (handle, dir, &newblock, &err);
 	if (!(bh2)) {
 		brelse(*bh);
 		*bh = NULL;
@@ -1145,14 +1145,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 
 	BUFFER_TRACE(*bh, "get_write_access");
 	err = ext3_journal_get_write_access(handle, *bh);
-	if (err) {
-	journal_error:
-		brelse(*bh);
-		brelse(bh2);
-		*bh = NULL;
-		ext3_std_error(dir->i_sb, err);
-		goto errout;
-	}
+	if (err)
+		goto journal_error;
+
 	BUFFER_TRACE(frame->bh, "get_write_access");
 	err = ext3_journal_get_write_access(handle, frame->bh);
 	if (err)
@@ -1195,8 +1190,16 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 		goto journal_error;
 	brelse (bh2);
 	dxtrace(dx_show_index ("frame", frame->entries));
-errout:
 	return de;
+
+journal_error:
+	brelse(*bh);
+	brelse(bh2);
+	*bh = NULL;
+	ext3_std_error(dir->i_sb, err);
+errout:
+	*error = err;
+	return NULL;
 }
 #endif
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index ecf89904c113..2c97e09c6c6b 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -11,7 +11,6 @@
 
 #define EXT3FS_DEBUG
 
-#include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 
 #include <linux/errno.h>
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 4a4fcd6868c7..6e3062913a92 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -420,7 +420,7 @@ static void ext3_put_super (struct super_block * sb)
 		dump_orphan_list(sb, sbi);
 	J_ASSERT(list_empty(&sbi->s_orphan));
 
-	invalidate_bdev(sb->s_bdev, 0);
+	invalidate_bdev(sb->s_bdev);
 	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
 		/*
 		 * Invalidate the journal device's buffers.  We don't want them
@@ -428,7 +428,7 @@ static void ext3_put_super (struct super_block * sb)
 		 * hotswapped, and it breaks the `ro-after' testing code.
 		 */
 		sync_blockdev(sbi->journal_bdev);
-		invalidate_bdev(sbi->journal_bdev, 0);
+		invalidate_bdev(sbi->journal_bdev);
 		ext3_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
@@ -466,15 +466,12 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->i_orphan);
 #ifdef CONFIG_EXT3_FS_XATTR
-		init_rwsem(&ei->xattr_sem);
+	init_rwsem(&ei->xattr_sem);
 #endif
-		mutex_init(&ei->truncate_mutex);
-		inode_init_once(&ei->vfs_inode);
-	}
+	mutex_init(&ei->truncate_mutex);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int init_inodecache(void)
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index b9c40c15647b..821efaf2b94e 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -6,7 +6,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/security.h>
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index 86d91f1186dc..0327497a55ce 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 #include <linux/ext3_fs.h>
 #include "xattr.h"
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index a85a0a17c4fd..1abd8f92c440 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext3_jbd.h>
 #include <linux/ext3_fs.h>
 #include "xattr.h"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8a23483ca8d0..3b64bb16c727 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,15 +30,15 @@
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 		unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
 {
-        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 	ext4_grpblk_t offset;
 
-        blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
 	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
 	if (offsetp)
 		*offsetp = offset;
 	if (blockgrpp)
-	        *blockgrpp = blocknr;
+		*blockgrpp = blocknr;
 
 }
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index da80368b66f0..e8ad06e28318 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -25,7 +25,6 @@
 #include <linux/jbd2.h>
 #include <linux/ext4_fs.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7916b50f9a13..b9ce24129070 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -34,7 +34,6 @@
 #include <linux/time.h>
 #include <linux/ext4_jbd2.h>
 #include <linux/jbd.h>
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -375,7 +374,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
 				       le32_to_cpu(ix[-1].ei_block));
 			}
 			BUG_ON(k && le32_to_cpu(ix->ei_block)
-				           <= le32_to_cpu(ix[-1].ei_block));
+					   <= le32_to_cpu(ix[-1].ei_block));
 			if (block < le32_to_cpu(ix->ei_block))
 				break;
 			chix = ix;
@@ -424,8 +423,8 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
 
 	path->p_ext = l - 1;
 	ext_debug("  -> %d:%llu:%d ",
-		        le32_to_cpu(path->p_ext->ee_block),
-		        ext_pblock(path->p_ext),
+			le32_to_cpu(path->p_ext->ee_block),
+			ext_pblock(path->p_ext),
 			le16_to_cpu(path->p_ext->ee_len));
 
 #ifdef CHECK_BINSEARCH
@@ -436,7 +435,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
 		chex = ex = EXT_FIRST_EXTENT(eh);
 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
 			BUG_ON(k && le32_to_cpu(ex->ee_block)
-				          <= le32_to_cpu(ex[-1].ee_block));
+					  <= le32_to_cpu(ex[-1].ee_block));
 			if (block < le32_to_cpu(ex->ee_block))
 				break;
 			chex = ex;
@@ -578,7 +577,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
 
 	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
-	                     > le16_to_cpu(curp->p_hdr->eh_max));
+			     > le16_to_cpu(curp->p_hdr->eh_max));
 	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
 
 	err = ext4_ext_dirty(handle, inode, curp);
@@ -622,12 +621,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		border = path[depth].p_ext[1].ee_block;
 		ext_debug("leaf will be split."
 				" next leaf starts at %d\n",
-			          le32_to_cpu(border));
+				  le32_to_cpu(border));
 	} else {
 		border = newext->ee_block;
 		ext_debug("leaf will be added."
 				" next leaf starts at %d\n",
-			        le32_to_cpu(border));
+				le32_to_cpu(border));
 	}
 
 	/*
@@ -685,9 +684,9 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	while (path[depth].p_ext <=
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		ext_debug("move %d:%llu:%d in new leaf %llu\n",
-			        le32_to_cpu(path[depth].p_ext->ee_block),
-			        ext_pblock(path[depth].p_ext),
-			        le16_to_cpu(path[depth].p_ext->ee_len),
+				le32_to_cpu(path[depth].p_ext->ee_block),
+				ext_pblock(path[depth].p_ext),
+				le16_to_cpu(path[depth].p_ext->ee_len),
 				newblock);
 		/*memmove(ex++, path[depth].p_ext++,
 				sizeof(struct ext4_extent));
@@ -766,9 +765,9 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 				EXT_LAST_INDEX(path[i].p_hdr));
 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 			ext_debug("%d: move %d:%d in new index %llu\n", i,
-				        le32_to_cpu(path[i].p_idx->ei_block),
-				        idx_pblock(path[i].p_idx),
-				        newblock);
+					le32_to_cpu(path[i].p_idx->ei_block),
+					idx_pblock(path[i].p_idx),
+					newblock);
 			/*memmove(++fidx, path[i].p_idx++,
 					sizeof(struct ext4_extent_idx));
 			neh->eh_entries++;
@@ -1129,6 +1128,55 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 }
 
 /*
+ * check if a portion of the "newext" extent overlaps with an
+ * existing extent.
+ *
+ * If there is an overlap discovered, it updates the length of the newext
+ * such that there will be no overlap, and then returns 1.
+ * If there is no overlap found, it returns 0.
+ */
+unsigned int ext4_ext_check_overlap(struct inode *inode,
+				    struct ext4_extent *newext,
+				    struct ext4_ext_path *path)
+{
+	unsigned long b1, b2;
+	unsigned int depth, len1;
+	unsigned int ret = 0;
+
+	b1 = le32_to_cpu(newext->ee_block);
+	len1 = le16_to_cpu(newext->ee_len);
+	depth = ext_depth(inode);
+	if (!path[depth].p_ext)
+		goto out;
+	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+
+	/*
+	 * get the next allocated block if the extent in the path
+	 * is before the requested block(s) 
+	 */
+	if (b2 < b1) {
+		b2 = ext4_ext_next_allocated_block(path);
+		if (b2 == EXT_MAX_BLOCK)
+			goto out;
+	}
+
+	/* check for wrap through zero */
+	if (b1 + len1 < b1) {
+		len1 = EXT_MAX_BLOCK - b1;
+		newext->ee_len = cpu_to_le16(len1);
+		ret = 1;
+	}
+
+	/* check for overlap */
+	if (b1 + len1 > b2) {
+		newext->ee_len = cpu_to_le16(b2 - b1);
+		ret = 1;
+	}
+out:
+	return ret;
+}
+
+/*
  * ext4_ext_insert_extent:
  * tries to merge requsted extent into the existing extent or
  * inserts requested extent as new one into the tree,
@@ -1213,12 +1261,12 @@ has_space:
 	if (!nearex) {
 		/* there is no extent in this leaf, create first one */
 		ext_debug("first extent in the leaf: %d:%llu:%d\n",
-			        le32_to_cpu(newext->ee_block),
-			        ext_pblock(newext),
-			        le16_to_cpu(newext->ee_len));
+				le32_to_cpu(newext->ee_block),
+				ext_pblock(newext),
+				le16_to_cpu(newext->ee_len));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
 	} else if (le32_to_cpu(newext->ee_block)
-		           > le32_to_cpu(nearex->ee_block)) {
+			   > le32_to_cpu(nearex->ee_block)) {
 /*		BUG_ON(newext->ee_block == nearex->ee_block); */
 		if (nearex != EXT_LAST_EXTENT(eh)) {
 			len = EXT_MAX_EXTENT(eh) - nearex;
@@ -1226,9 +1274,9 @@ has_space:
 			len = len < 0 ? 0 : len;
 			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
 					"move %d from 0x%p to 0x%p\n",
-				        le32_to_cpu(newext->ee_block),
-				        ext_pblock(newext),
-				        le16_to_cpu(newext->ee_len),
+					le32_to_cpu(newext->ee_block),
+					ext_pblock(newext),
+					le16_to_cpu(newext->ee_len),
 					nearex, len, nearex + 1, nearex + 2);
 			memmove(nearex + 2, nearex + 1, len);
 		}
@@ -1359,9 +1407,9 @@ int ext4_ext_walk_space(struct inode *inode, unsigned long block,
 			cbex.ec_start = 0;
 			cbex.ec_type = EXT4_EXT_CACHE_GAP;
 		} else {
-		        cbex.ec_block = le32_to_cpu(ex->ee_block);
-		        cbex.ec_len = le16_to_cpu(ex->ee_len);
-		        cbex.ec_start = ext_pblock(ex);
+			cbex.ec_block = le32_to_cpu(ex->ee_block);
+			cbex.ec_len = le16_to_cpu(ex->ee_len);
+			cbex.ec_start = ext_pblock(ex);
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
 
@@ -1432,16 +1480,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 		len = le32_to_cpu(ex->ee_block) - block;
 		ext_debug("cache gap(before): %lu [%lu:%lu]",
 				(unsigned long) block,
-			        (unsigned long) le32_to_cpu(ex->ee_block),
-			        (unsigned long) le16_to_cpu(ex->ee_len));
+				(unsigned long) le32_to_cpu(ex->ee_block),
+				(unsigned long) le16_to_cpu(ex->ee_len));
 	} else if (block >= le32_to_cpu(ex->ee_block)
-		            + le16_to_cpu(ex->ee_len)) {
-	        lblock = le32_to_cpu(ex->ee_block)
-		         + le16_to_cpu(ex->ee_len);
+			    + le16_to_cpu(ex->ee_len)) {
+		lblock = le32_to_cpu(ex->ee_block)
+			 + le16_to_cpu(ex->ee_len);
 		len = ext4_ext_next_allocated_block(path);
 		ext_debug("cache gap(after): [%lu:%lu] %lu",
-			        (unsigned long) le32_to_cpu(ex->ee_block),
-			        (unsigned long) le16_to_cpu(ex->ee_len),
+				(unsigned long) le32_to_cpu(ex->ee_block),
+				(unsigned long) le16_to_cpu(ex->ee_len),
 				(unsigned long) block);
 		BUG_ON(len == lblock);
 		len = len - lblock;
@@ -1469,9 +1517,9 @@ ext4_ext_in_cache(struct inode *inode, unsigned long block,
 	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
 			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
 	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
-	        ex->ee_block = cpu_to_le32(cex->ec_block);
+		ex->ee_block = cpu_to_le32(cex->ec_block);
 		ext4_ext_store_pblock(ex, cex->ec_start);
-	        ex->ee_len = cpu_to_le16(cex->ec_len);
+		ex->ee_len = cpu_to_le16(cex->ec_len);
 		ext_debug("%lu cached by %lu:%lu:%llu\n",
 				(unsigned long) block,
 				(unsigned long) cex->ec_block,
@@ -1957,9 +2005,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			/* we should allocate requested block */
 		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
 			/* block is already allocated */
-		        newblock = iblock
-		                   - le32_to_cpu(newex.ee_block)
-			           + ext_pblock(&newex);
+			newblock = iblock
+				   - le32_to_cpu(newex.ee_block)
+				   + ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
 			allocated = le16_to_cpu(newex.ee_len) -
 					(iblock - le32_to_cpu(newex.ee_block));
@@ -1988,7 +2036,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	ex = path[depth].p_ext;
 	if (ex) {
-	        unsigned long ee_block = le32_to_cpu(ex->ee_block);
+		unsigned long ee_block = le32_to_cpu(ex->ee_block);
 		ext4_fsblk_t ee_start = ext_pblock(ex);
 		unsigned short ee_len  = le16_to_cpu(ex->ee_len);
 
@@ -2001,7 +2049,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		if (ee_len > EXT_MAX_LEN)
 			goto out2;
 		/* if found extent covers block, simply return it */
-	        if (iblock >= ee_block && iblock < ee_block + ee_len) {
+		if (iblock >= ee_block && iblock < ee_block + ee_len) {
 			newblock = iblock - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (iblock - ee_block);
@@ -2032,7 +2080,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	/* allocate new block */
 	goal = ext4_ext_find_goal(inode, path, iblock);
-	allocated = max_blocks;
+
+	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
+	newex.ee_block = cpu_to_le32(iblock);
+	newex.ee_len = cpu_to_le16(max_blocks);
+	err = ext4_ext_check_overlap(inode, &newex, path);
+	if (err)
+		allocated = le16_to_cpu(newex.ee_len);
+	else
+		allocated = max_blocks;
 	newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
 	if (!newblock)
 		goto out2;
@@ -2040,12 +2096,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			goal, newblock, allocated);
 
 	/* try to insert new extent into found leaf and return */
-	newex.ee_block = cpu_to_le32(iblock);
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(allocated);
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
-	if (err)
+	if (err) {
+		/* free data blocks we just allocated */
+		ext4_free_blocks(handle, inode, ext_pblock(&newex),
+					le16_to_cpu(newex.ee_len));
 		goto out2;
+	}
 
 	if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
 		EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2158,11 +2217,3 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
 
 	return needed;
 }
-
-EXPORT_SYMBOL(ext4_mark_inode_dirty);
-EXPORT_SYMBOL(ext4_ext_invalidate_cache);
-EXPORT_SYMBOL(ext4_ext_insert_extent);
-EXPORT_SYMBOL(ext4_ext_walk_space);
-EXPORT_SYMBOL(ext4_ext_find_goal);
-EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
-
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 810b6d6474bf..0bcf62a750ff 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -27,7 +27,6 @@
 #include <linux/time.h>
 #include <linux/ext4_jbd2.h>
 #include <linux/jbd2.h>
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -256,8 +255,8 @@ static int verify_chain(Indirect *from, Indirect *to)
  *	@inode: inode in question (we are only interested in its superblock)
  *	@i_block: block number to be parsed
  *	@offsets: array to store the offsets in
- *      @boundary: set this non-zero if the referred-to block is likely to be
- *             followed (on disk) by an indirect block.
+ *	@boundary: set this non-zero if the referred-to block is likely to be
+ *	       followed (on disk) by an indirect block.
  *
  *	To store the locations of file's data ext4 uses a data structure common
  *	for UNIX filesystems - tree of pointers anchored in the inode, with
@@ -2611,9 +2610,9 @@ void ext4_read_inode(struct inode * inode)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 	inode->i_size = le32_to_cpu(raw_inode->i_size);
-	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
-	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
-	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
 	ei->i_state = 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e7e1d79a7d75..2811e5720ad0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -36,7 +36,6 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
-#include <linux/smp_lock.h>
 
 #include "namei.h"
 #include "xattr.h"
@@ -47,7 +46,7 @@
  */
 #define NAMEI_RA_CHUNKS  2
 #define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
 #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
 static struct buffer_head *ext4_append(handle_t *handle,
@@ -242,7 +241,7 @@ static inline unsigned dx_node_limit (struct inode *dir)
 static void dx_show_index (char * label, struct dx_entry *entries)
 {
 	int i, n = dx_get_count (entries);
-        printk("%s index ", label);
+	printk("%s index ", label);
 	for (i = 0; i < n; i++) {
 		printk("%x->%u ", i? dx_get_hash(entries + i) :
 				0, dx_get_block(entries + i));
@@ -967,6 +966,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 				  (block<<EXT4_BLOCK_SIZE_BITS(sb))
 					  +((char *)de - bh->b_data))) {
 				brelse (bh);
+				*err = ERR_BAD_DX_DIR;
 				goto errout;
 			}
 			*res_dir = de;
@@ -1132,9 +1132,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	char *data1 = (*bh)->b_data, *data2;
 	unsigned split;
 	struct ext4_dir_entry_2 *de = NULL, *de2;
-	int	err;
+	int	err = 0;
 
-	bh2 = ext4_append (handle, dir, &newblock, error);
+	bh2 = ext4_append (handle, dir, &newblock, &err);
 	if (!(bh2)) {
 		brelse(*bh);
 		*bh = NULL;
@@ -1143,14 +1143,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 
 	BUFFER_TRACE(*bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, *bh);
-	if (err) {
-	journal_error:
-		brelse(*bh);
-		brelse(bh2);
-		*bh = NULL;
-		ext4_std_error(dir->i_sb, err);
-		goto errout;
-	}
+	if (err)
+		goto journal_error;
+
 	BUFFER_TRACE(frame->bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, frame->bh);
 	if (err)
@@ -1193,8 +1188,16 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 		goto journal_error;
 	brelse (bh2);
 	dxtrace(dx_show_index ("frame", frame->entries));
-errout:
 	return de;
+
+journal_error:
+	brelse(*bh);
+	brelse(bh2);
+	*bh = NULL;
+	ext4_std_error(dir->i_sb, err);
+errout:
+	*error = err;
+	return NULL;
 }
 #endif
 
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ea99f6c97f56..aa11d7dbe970 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -11,7 +11,6 @@
 
 #define EXT4FS_DEBUG
 
-#include <linux/smp_lock.h>
 #include <linux/ext4_jbd2.h>
 
 #include <linux/errno.h>
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61c4718e4a53..175b68c60968 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -470,7 +470,7 @@ static void ext4_put_super (struct super_block * sb)
 		dump_orphan_list(sb, sbi);
 	J_ASSERT(list_empty(&sbi->s_orphan));
 
-	invalidate_bdev(sb->s_bdev, 0);
+	invalidate_bdev(sb->s_bdev);
 	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
 		/*
 		 * Invalidate the journal device's buffers.  We don't want them
@@ -478,7 +478,7 @@ static void ext4_put_super (struct super_block * sb)
 		 * hotswapped, and it breaks the `ro-after' testing code.
 		 */
 		sync_blockdev(sbi->journal_bdev);
-		invalidate_bdev(sbi->journal_bdev, 0);
+		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
@@ -517,15 +517,12 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->i_orphan);
 #ifdef CONFIG_EXT4DEV_FS_XATTR
-		init_rwsem(&ei->xattr_sem);
+	init_rwsem(&ei->xattr_sem);
 #endif
-		mutex_init(&ei->truncate_mutex);
-		inode_init_once(&ei->vfs_inode);
-	}
+	mutex_init(&ei->truncate_mutex);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int init_inodecache(void)
@@ -1988,7 +1985,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
-		        "EXT4: failed to claim external journal device.\n");
+			"EXT4: failed to claim external journal device.\n");
 		blkdev_put(bdev);
 		return NULL;
 	}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index b6a6861951f9..f17eaf2321b9 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -6,7 +6,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext4_jbd2.h>
 #include <linux/ext4_fs.h>
 #include <linux/security.h>
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index b76f2dbc82da..e0f05acdafec 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext4_jbd2.h>
 #include <linux/ext4_fs.h>
 #include "xattr.h"
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c53cded0761a..7ed3d8ebf096 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/ext4_jbd2.h>
 #include <linux/ext4_fs.h>
 #include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 05c2941c74f2..3c9c8a15ec73 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -40,9 +40,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct fat_cache *cache = (struct fat_cache *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		INIT_LIST_HEAD(&cache->cache_list);
+	INIT_LIST_HEAD(&cache->cache_list);
 }
 
 int __init fat_cache_init(void)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c16af246d245..ccf161dffb63 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -422,7 +422,7 @@ EODir:
 EXPORT_SYMBOL_GPL(fat_search_long);
 
 struct fat_ioctl_filldir_callback {
-	struct dirent __user *dirent;
+	void __user *dirent;
 	int result;
 	/* for dir ioctl */
 	const char *longname;
@@ -647,62 +647,85 @@ static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
 }
 
-static int fat_ioctl_filldir(void *__buf, const char *name, int name_len,
-			     loff_t offset, u64 ino, unsigned int d_type)
+#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type)			   \
+static int func(void *__buf, const char *name, int name_len,		   \
+			     loff_t offset, u64 ino, unsigned int d_type)  \
+{									   \
+	struct fat_ioctl_filldir_callback *buf = __buf;			   \
+	struct dirent_type __user *d1 = buf->dirent;			   \
+	struct dirent_type __user *d2 = d1 + 1;				   \
+									   \
+	if (buf->result)						   \
+		return -EINVAL;						   \
+	buf->result++;							   \
+									   \
+	if (name != NULL) {						   \
+		/* dirent has only short name */			   \
+		if (name_len >= sizeof(d1->d_name))			   \
+			name_len = sizeof(d1->d_name) - 1;		   \
+									   \
+		if (put_user(0, d2->d_name)			||	   \
+		    put_user(0, &d2->d_reclen)			||	   \
+		    copy_to_user(d1->d_name, name, name_len)	||	   \
+		    put_user(0, d1->d_name + name_len)		||	   \
+		    put_user(name_len, &d1->d_reclen))			   \
+			goto efault;					   \
+	} else {							   \
+		/* dirent has short and long name */			   \
+		const char *longname = buf->longname;			   \
+		int long_len = buf->long_len;				   \
+		const char *shortname = buf->shortname;			   \
+		int short_len = buf->short_len;				   \
+									   \
+		if (long_len >= sizeof(d1->d_name))			   \
+			long_len = sizeof(d1->d_name) - 1;		   \
+		if (short_len >= sizeof(d1->d_name))			   \
+			short_len = sizeof(d1->d_name) - 1;		   \
+									   \
+		if (copy_to_user(d2->d_name, longname, long_len)	|| \
+		    put_user(0, d2->d_name + long_len)			|| \
+		    put_user(long_len, &d2->d_reclen)			|| \
+		    put_user(ino, &d2->d_ino)				|| \
+		    put_user(offset, &d2->d_off)			|| \
+		    copy_to_user(d1->d_name, shortname, short_len)	|| \
+		    put_user(0, d1->d_name + short_len)			|| \
+		    put_user(short_len, &d1->d_reclen))			   \
+			goto efault;					   \
+	}								   \
+	return 0;							   \
+efault:									   \
+	buf->result = -EFAULT;						   \
+	return -EFAULT;							   \
+}
+
+FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent)
+
+static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
+			     void __user *dirent, filldir_t filldir,
+			     int short_only, int both)
 {
-	struct fat_ioctl_filldir_callback *buf = __buf;
-	struct dirent __user *d1 = buf->dirent;
-	struct dirent __user *d2 = d1 + 1;
-
-	if (buf->result)
-		return -EINVAL;
-	buf->result++;
-
-	if (name != NULL) {
-		/* dirent has only short name */
-		if (name_len >= sizeof(d1->d_name))
-			name_len = sizeof(d1->d_name) - 1;
-
-		if (put_user(0, d2->d_name)			||
-		    put_user(0, &d2->d_reclen)			||
-		    copy_to_user(d1->d_name, name, name_len)	||
-		    put_user(0, d1->d_name + name_len)		||
-		    put_user(name_len, &d1->d_reclen))
-			goto efault;
-	} else {
-		/* dirent has short and long name */
-		const char *longname = buf->longname;
-		int long_len = buf->long_len;
-		const char *shortname = buf->shortname;
-		int short_len = buf->short_len;
-
-		if (long_len >= sizeof(d1->d_name))
-			long_len = sizeof(d1->d_name) - 1;
-		if (short_len >= sizeof(d1->d_name))
-			short_len = sizeof(d1->d_name) - 1;
-
-		if (copy_to_user(d2->d_name, longname, long_len)	||
-		    put_user(0, d2->d_name + long_len)			||
-		    put_user(long_len, &d2->d_reclen)			||
-		    put_user(ino, &d2->d_ino)				||
-		    put_user(offset, &d2->d_off)			||
-		    copy_to_user(d1->d_name, shortname, short_len)	||
-		    put_user(0, d1->d_name + short_len)			||
-		    put_user(short_len, &d1->d_reclen))
-			goto efault;
+	struct fat_ioctl_filldir_callback buf;
+	int ret;
+
+	buf.dirent = dirent;
+	buf.result = 0;
+	mutex_lock(&inode->i_mutex);
+	ret = -ENOENT;
+	if (!IS_DEADDIR(inode)) {
+		ret = __fat_readdir(inode, filp, &buf, filldir,
+				    short_only, both);
 	}
-	return 0;
-efault:
-	buf->result = -EFAULT;
-	return -EFAULT;
+	mutex_unlock(&inode->i_mutex);
+	if (ret >= 0)
+		ret = buf.result;
+	return ret;
 }
 
-static int fat_dir_ioctl(struct inode * inode, struct file * filp,
-		  unsigned int cmd, unsigned long arg)
+static int fat_dir_ioctl(struct inode *inode, struct file *filp,
+			 unsigned int cmd, unsigned long arg)
 {
-	struct fat_ioctl_filldir_callback buf;
-	struct dirent __user *d1;
-	int ret, short_only, both;
+	struct dirent __user *d1 = (struct dirent __user *)arg;
+	int short_only, both;
 
 	switch (cmd) {
 	case VFAT_IOCTL_READDIR_SHORT:
@@ -717,7 +740,6 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
 		return fat_generic_ioctl(inode, filp, cmd, arg);
 	}
 
-	d1 = (struct dirent __user *)arg;
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2])))
 		return -EFAULT;
 	/*
@@ -728,69 +750,48 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
 	if (put_user(0, &d1->d_reclen))
 		return -EFAULT;
 
-	buf.dirent = d1;
-	buf.result = 0;
-	mutex_lock(&inode->i_mutex);
-	ret = -ENOENT;
-	if (!IS_DEADDIR(inode)) {
-		ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir,
-				    short_only, both);
-	}
-	mutex_unlock(&inode->i_mutex);
-	if (ret >= 0)
-		ret = buf.result;
-	return ret;
+	return fat_ioctl_readdir(inode, filp, d1, fat_ioctl_filldir,
+				 short_only, both);
 }
 
 #ifdef CONFIG_COMPAT
 #define	VFAT_IOCTL_READDIR_BOTH32	_IOR('r', 1, struct compat_dirent[2])
 #define	VFAT_IOCTL_READDIR_SHORT32	_IOR('r', 2, struct compat_dirent[2])
 
-static long fat_compat_put_dirent32(struct dirent *d,
-				    struct compat_dirent __user *d32)
-{
-        if (!access_ok(VERIFY_WRITE, d32, sizeof(struct compat_dirent)))
-                return -EFAULT;
-
-        __put_user(d->d_ino, &d32->d_ino);
-        __put_user(d->d_off, &d32->d_off);
-        __put_user(d->d_reclen, &d32->d_reclen);
-        if (__copy_to_user(d32->d_name, d->d_name, d->d_reclen))
-		return -EFAULT;
+FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent)
 
-        return 0;
-}
-
-static long fat_compat_dir_ioctl(struct file *file, unsigned cmd,
+static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 				 unsigned long arg)
 {
-	struct compat_dirent __user *p = compat_ptr(arg);
-	int ret;
-	mm_segment_t oldfs = get_fs();
-	struct dirent d[2];
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct compat_dirent __user *d1 = compat_ptr(arg);
+	int short_only, both;
 
 	switch (cmd) {
-	case VFAT_IOCTL_READDIR_BOTH32:
-		cmd = VFAT_IOCTL_READDIR_BOTH;
-		break;
 	case VFAT_IOCTL_READDIR_SHORT32:
-		cmd = VFAT_IOCTL_READDIR_SHORT;
+		short_only = 1;
+		both = 0;
+		break;
+	case VFAT_IOCTL_READDIR_BOTH32:
+		short_only = 0;
+		both = 1;
 		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
 
-	set_fs(KERNEL_DS);
-	lock_kernel();
-	ret = fat_dir_ioctl(file->f_path.dentry->d_inode, file,
-			    cmd, (unsigned long) &d);
-	unlock_kernel();
-	set_fs(oldfs);
-	if (ret >= 0) {
-		ret |= fat_compat_put_dirent32(&d[0], p);
-		ret |= fat_compat_put_dirent32(&d[1], p + 1);
-	}
-	return ret;
+	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
+		return -EFAULT;
+	/*
+	 * Yes, we don't need this put_user() absolutely. However old
+	 * code didn't return the right value. So, app use this value,
+	 * in order to check whether it is EOF.
+	 */
+	if (put_user(0, &d1->d_reclen))
+		return -EFAULT;
+
+	return fat_ioctl_readdir(inode, filp, d1, fat_compat_ioctl_filldir,
+				 short_only, both);
 }
 #endif /* CONFIG_COMPAT */
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9bfe607c892e..479722d89667 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -25,6 +25,7 @@
 #include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
+#include <linux/log2.h>
 #include <asm/unaligned.h>
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -499,15 +500,12 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		spin_lock_init(&ei->cache_lru_lock);
-		ei->nr_caches = 0;
-		ei->cache_valid_id = FAT_CACHE_VALID + 1;
-		INIT_LIST_HEAD(&ei->cache_lru);
-		INIT_HLIST_NODE(&ei->i_fat_hash);
-		inode_init_once(&ei->vfs_inode);
-	}
+	spin_lock_init(&ei->cache_lru_lock);
+	ei->nr_caches = 0;
+	ei->cache_valid_id = FAT_CACHE_VALID + 1;
+	INIT_LIST_HEAD(&ei->cache_lru);
+	INIT_HLIST_NODE(&ei->i_fat_hash);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int __init fat_init_inodecache(void)
@@ -825,6 +823,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 	}
 	if (opts->name_check != 'n')
 		seq_printf(m, ",check=%c", opts->name_check);
+	if (opts->usefree)
+		seq_puts(m, ",usefree");
 	if (opts->quiet)
 		seq_puts(m, ",quiet");
 	if (opts->showexec)
@@ -850,7 +850,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 enum {
 	Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid,
-	Opt_umask, Opt_dmask, Opt_fmask, Opt_codepage, Opt_nocase,
+	Opt_umask, Opt_dmask, Opt_fmask, Opt_codepage, Opt_usefree, Opt_nocase,
 	Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable,
 	Opt_dots, Opt_nodots,
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
@@ -872,6 +872,7 @@ static match_table_t fat_tokens = {
 	{Opt_dmask, "dmask=%o"},
 	{Opt_fmask, "fmask=%o"},
 	{Opt_codepage, "codepage=%u"},
+	{Opt_usefree, "usefree"},
 	{Opt_nocase, "nocase"},
 	{Opt_quiet, "quiet"},
 	{Opt_showexec, "showexec"},
@@ -951,7 +952,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
 	opts->utf8 = opts->unicode_xlate = 0;
 	opts->numtail = 1;
-	opts->nocase = 0;
+	opts->usefree = opts->nocase = 0;
 	*debug = 0;
 
 	if (!options)
@@ -979,6 +980,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_check_n:
 			opts->name_check = 'n';
 			break;
+		case Opt_usefree:
+			opts->usefree = 1;
+			break;
 		case Opt_nocase:
 			if (!is_vfat)
 				opts->nocase = 1;
@@ -1218,8 +1222,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	}
 	logical_sector_size =
 		le16_to_cpu(get_unaligned((__le16 *)&b->sector_size));
-	if (!logical_sector_size
-	    || (logical_sector_size & (logical_sector_size - 1))
+	if (!is_power_of_2(logical_sector_size)
 	    || (logical_sector_size < 512)
 	    || (PAGE_CACHE_SIZE < logical_sector_size)) {
 		if (!silent)
@@ -1229,8 +1232,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 		goto out_invalid;
 	}
 	sbi->sec_per_clus = b->sec_per_clus;
-	if (!sbi->sec_per_clus
-	    || (sbi->sec_per_clus & (sbi->sec_per_clus - 1))) {
+	if (!is_power_of_2(sbi->sec_per_clus)) {
 		if (!silent)
 			printk(KERN_ERR "FAT: bogus sectors per cluster %u\n",
 			       sbi->sec_per_clus);
@@ -1306,7 +1308,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 			       le32_to_cpu(fsinfo->signature2),
 			       sbi->fsinfo_sector);
 		} else {
-			sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters);
+			if (sbi->options.usefree)
+				sbi->free_clusters =
+					le32_to_cpu(fsinfo->free_clusters);
 			sbi->prev_free = le32_to_cpu(fsinfo->next_cluster);
 		}
 
diff --git a/fs/fifo.c b/fs/fifo.c
index 49035b174b48..9785e36f81e7 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -11,8 +11,8 @@
 
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
 
 static void wait_for_partner(struct inode* inode, unsigned int *cnt)
diff --git a/fs/file_table.c b/fs/file_table.c
index 4c17a18d8c10..d17fd691b832 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -10,7 +10,6 @@
 #include <linux/file.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/security.h>
 #include <linux/eventpoll.h>
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 7a4f61aa05f8..f37f87262837 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -41,11 +41,12 @@ void put_filesystem(struct file_system_type *fs)
 	module_put(fs->owner);
 }
 
-static struct file_system_type **find_filesystem(const char *name)
+static struct file_system_type **find_filesystem(const char *name, unsigned len)
 {
 	struct file_system_type **p;
 	for (p=&file_systems; *p; p=&(*p)->next)
-		if (strcmp((*p)->name,name) == 0)
+		if (strlen((*p)->name) == len &&
+		    strncmp((*p)->name, name, len) == 0)
 			break;
 	return p;
 }
@@ -68,11 +69,12 @@ int register_filesystem(struct file_system_type * fs)
 	int res = 0;
 	struct file_system_type ** p;
 
+	BUG_ON(strchr(fs->name, '.'));
 	if (fs->next)
 		return -EBUSY;
 	INIT_LIST_HEAD(&fs->fs_supers);
 	write_lock(&file_systems_lock);
-	p = find_filesystem(fs->name);
+	p = find_filesystem(fs->name, strlen(fs->name));
 	if (*p)
 		res = -EBUSY;
 	else
@@ -215,19 +217,26 @@ int get_filesystem_list(char * buf)
 struct file_system_type *get_fs_type(const char *name)
 {
 	struct file_system_type *fs;
+	const char *dot = strchr(name, '.');
+	unsigned len = dot ? dot - name : strlen(name);
 
 	read_lock(&file_systems_lock);
-	fs = *(find_filesystem(name));
+	fs = *(find_filesystem(name, len));
 	if (fs && !try_module_get(fs->owner))
 		fs = NULL;
 	read_unlock(&file_systems_lock);
-	if (!fs && (request_module("%s", name) == 0)) {
+	if (!fs && (request_module("%.*s", len, name) == 0)) {
 		read_lock(&file_systems_lock);
-		fs = *(find_filesystem(name));
+		fs = *(find_filesystem(name, len));
 		if (fs && !try_module_get(fs->owner))
 			fs = NULL;
 		read_unlock(&file_systems_lock);
 	}
+
+	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
+		put_filesystem(fs);
+		fs = NULL;
+	}
 	return fs;
 }
 
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index 2d71128bd8d6..f86fd3cacd5a 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -137,7 +137,7 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
 
 		bp = sb_bread(ip->i_sb,
 				indir + (i / VXFS_TYPED_PER_BLOCK(ip->i_sb)));
-		if (!buffer_mapped(bp))
+		if (!bp || !buffer_mapped(bp))
 			return 0;
 
 		typ = ((struct vxfs_typed *)bp->b_data) +
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 098a915fd9a1..d1f7c5b5b3c3 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -99,7 +99,7 @@ vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino)
 	offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE);
 	bp = sb_bread(sbp, block);
 
-	if (buffer_mapped(bp)) {
+	if (bp && buffer_mapped(bp)) {
 		struct vxfs_inode_info	*vip;
 		struct vxfs_dinode	*dip;
 
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index decac62efe57..ed8f0b0dd880 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -74,10 +74,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 	pp = read_mapping_page(mapping, n, NULL);
 
 	if (!IS_ERR(pp)) {
-		wait_on_page_locked(pp);
 		kmap(pp);
-		if (!PageUptodate(pp))
-			goto fail;
 		/** if (!PageChecked(pp)) **/
 			/** vxfs_check_page(pp); **/
 		if (PageError(pp))
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8890eba1db52..bd5a772d8ccf 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -485,7 +485,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
 		       struct nameidata *nd)
 {
-	if (nd && (nd->flags & LOOKUP_CREATE)) {
+	if (nd && (nd->flags & LOOKUP_OPEN)) {
 		int err = fuse_create_open(dir, entry, mode, nd);
 		if (err != -ENOSYS)
 			return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2fd06927e851..adf7995232b8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -11,6 +11,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 
 static const struct file_operations fuse_direct_io_file_operations;
 
@@ -609,7 +610,9 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 	ssize_t res;
 	/* Don't allow parallel writes to the same file */
 	mutex_lock(&inode->i_mutex);
-	res = fuse_direct_io(file, buf, count, ppos, 1);
+	res = generic_write_checks(file, ppos, &count, 0);
+	if (!res)
+		res = fuse_direct_io(file, buf, count, ppos, 1);
 	mutex_unlock(&inode->i_mutex);
 	return res;
 }
@@ -738,8 +741,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (cmd == F_GETLK) {
 		if (fc->no_lock) {
-			if (!posix_test_lock(file, fl, fl))
-				fl->fl_type = F_UNLCK;
+			posix_test_lock(file, fl);
 			err = 0;
 		} else
 			err = fuse_getlk(file, fl);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 608db81219a0..9804c0cdcb42 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -17,6 +17,7 @@
 #include <linux/parser.h>
 #include <linux/statfs.h>
 #include <linux/random.h>
+#include <linux/sched.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -453,6 +454,7 @@ static const struct super_operations fuse_super_operations = {
 	.destroy_inode  = fuse_destroy_inode,
 	.read_inode	= fuse_read_inode,
 	.clear_inode	= fuse_clear_inode,
+	.drop_inode	= generic_delete_inode,
 	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
 	.umount_begin	= fuse_umount_begin,
@@ -636,6 +638,7 @@ static int fuse_get_sb(struct file_system_type *fs_type,
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
+	.fs_flags	= FS_HAS_SUBTYPE,
 	.get_sb		= fuse_get_sb,
 	.kill_sb	= kill_anon_super,
 };
@@ -652,6 +655,7 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type,
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
+	.fs_flags	= FS_HAS_SUBTYPE,
 	.get_sb		= fuse_get_sb_blk,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
@@ -685,9 +689,7 @@ static void fuse_inode_init_once(void *foo, struct kmem_cache *cachep,
 {
 	struct inode * inode = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(inode);
+	inode_init_once(inode);
 }
 
 static int __init fuse_fs_init(void)
@@ -731,12 +733,12 @@ static int fuse_sysfs_init(void)
 {
 	int err;
 
-	kset_set_kset_s(&fuse_subsys, fs_subsys);
+	kobj_set_kset_s(&fuse_subsys, fs_subsys);
 	err = subsystem_register(&fuse_subsys);
 	if (err)
 		goto out_err;
 
-	kset_set_kset_s(&connections_subsys, fuse_subsys);
+	kobj_set_kset_s(&connections_subsys, fuse_subsys);
 	err = subsystem_register(&connections_subsys);
 	if (err)
 		goto out_fuse_unregister;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 82a1ac7895a2..a96fa07b3f3b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1262,9 +1262,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 			      u64 leaf_no)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
-	unsigned entries = 0;
+	unsigned entries = 0, entries2 = 0;
 	unsigned leaves = 0;
 	const struct gfs2_dirent **darr, *dent;
 	struct dirent_gather g;
@@ -1290,7 +1291,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		return 0;
 
 	error = -ENOMEM;
-	larr = vmalloc((leaves + entries) * sizeof(void *));
+	/*
+	 * The extra 99 entries are not normally used, but are a buffer
+	 * zone in case the number of entries in the leaf is corrupt.
+	 * 99 is the maximum number of entries that can fit in a single
+	 * leaf block.
+	 */
+	larr = vmalloc((leaves + entries + 99) * sizeof(void *));
 	if (!larr)
 		goto out;
 	darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1305,10 +1312,20 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
+			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
 			error = PTR_ERR(dent);
-			if (IS_ERR(dent)) {
+			if (IS_ERR(dent))
+				goto out_kfree;
+			if (entries2 != g.offset) {
+				fs_warn(sdp, "Number of entries corrupt in dir "
+						"leaf %llu, entries2 (%u) != "
+						"g.offset (%u)\n",
+					(unsigned long long)bh->b_blocknr,
+					entries2, g.offset);
+					
+				error = -EIO;
 				goto out_kfree;
 			}
 			error = 0;
@@ -1318,6 +1335,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 		}
 	} while(lfn);
 
+	BUG_ON(entries2 != entries);
 	error = do_filldir_main(ip, offset, opaque, filldir, darr,
 				entries, copied);
 out_kfree:
@@ -1401,6 +1419,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		  filldir_t filldir)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
 	const struct gfs2_dirent **darr, *dent;
 	struct buffer_head *dibh;
@@ -1423,8 +1442,8 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		return error;
 
 	error = -ENOMEM;
-	darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
-		       GFP_KERNEL);
+	/* 96 is max number of dirents which can be stuffed into an inode */
+	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL);
 	if (darr) {
 		g.pdent = darr;
 		g.offset = 0;
@@ -1434,6 +1453,15 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			error = PTR_ERR(dent);
 			goto out;
 		}
+		if (dip->i_di.di_entries != g.offset) {
+			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
+				"ip->i_di.di_entries (%u) != g.offset (%u)\n",
+				(unsigned long long)dip->i_num.no_addr,
+				dip->i_di.di_entries,
+				g.offset);
+			error = -EIO;
+			goto out;
+		}
 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
 					dip->i_di.di_entries, &copied);
 out:
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12accb08fe02..1815429a2978 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -23,6 +23,10 @@
 #include <linux/module.h>
 #include <linux/rwsem.h>
 #include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -40,20 +44,30 @@ struct gfs2_gl_hash_bucket {
         struct hlist_head hb_list;
 };
 
+struct glock_iter {
+	int hash;                     /* hash bucket index         */
+	struct gfs2_sbd *sdp;         /* incore superblock         */
+	struct gfs2_glock *gl;        /* current glock struct      */
+	struct hlist_head *hb_list;   /* current hash bucket ptr   */
+	struct seq_file *seq;         /* sequence file for debugfs */
+	char string[512];             /* scratch space             */
+};
+
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 
 static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct gfs2_glock *gl);
-static int dump_inode(struct gfs2_inode *ip);
-static void gfs2_glock_xmote_th(struct gfs2_holder *gh);
+static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
+static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
+static struct dentry *gfs2_root;
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 
 static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+static struct dentry *gfs2_root;
 
 /*
  * Despite what you might think, the numbers below are not arbitrary :-)
@@ -202,7 +216,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 		gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
 		gfs2_assert(sdp, list_empty(&gl->gl_holders));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
-		gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
 		gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
 		glock_free(gl);
 		rv = 1;
@@ -303,7 +316,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	atomic_set(&gl->gl_ref, 1);
 	gl->gl_state = LM_ST_UNLOCKED;
 	gl->gl_hash = hash;
-	gl->gl_owner = NULL;
+	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	gl->gl_ops = glops;
 	gl->gl_req_gh = NULL;
@@ -367,7 +380,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 	INIT_LIST_HEAD(&gh->gh_list);
 	gh->gh_gl = gl;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	gh->gh_owner = current;
+	gh->gh_owner_pid = current->pid;
 	gh->gh_state = state;
 	gh->gh_flags = flags;
 	gh->gh_error = 0;
@@ -389,7 +402,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 {
 	gh->gh_state = state;
 	gh->gh_flags = flags;
-	gh->gh_iflags &= 1 << HIF_ALLOCED;
+	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
 }
 
@@ -406,54 +419,8 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
 	gh->gh_ip = 0;
 }
 
-/**
- * gfs2_holder_get - get a struct gfs2_holder structure
- * @gl: the glock
- * @state: the state we're requesting
- * @flags: the modifier flags
- * @gfp_flags:
- *
- * Figure out how big an impact this function has.  Either:
- * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
- * 2) Leave it like it is
- *
- * Returns: the holder structure, NULL on ENOMEM
- */
-
-static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
-					   unsigned int state,
-					   int flags, gfp_t gfp_flags)
-{
-	struct gfs2_holder *gh;
-
-	gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
-	if (!gh)
-		return NULL;
-
-	gfs2_holder_init(gl, state, flags, gh);
-	set_bit(HIF_ALLOCED, &gh->gh_iflags);
-	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	return gh;
-}
-
-/**
- * gfs2_holder_put - get rid of a struct gfs2_holder structure
- * @gh: the holder structure
- *
- */
-
-static void gfs2_holder_put(struct gfs2_holder *gh)
+static void gfs2_holder_wake(struct gfs2_holder *gh)
 {
-	gfs2_holder_uninit(gh);
-	kfree(gh);
-}
-
-static void gfs2_holder_dispose_or_wake(struct gfs2_holder *gh)
-{
-	if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) {
-		gfs2_holder_put(gh);
-		return;
-	}
 	clear_bit(HIF_WAIT, &gh->gh_iflags);
 	smp_mb();
 	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
@@ -519,7 +486,7 @@ static int rq_promote(struct gfs2_holder *gh)
 				gfs2_reclaim_glock(sdp);
 			}
 
-			gfs2_glock_xmote_th(gh);
+			gfs2_glock_xmote_th(gh->gh_gl, gh);
 			spin_lock(&gl->gl_spin);
 		}
 		return 1;
@@ -542,7 +509,7 @@ static int rq_promote(struct gfs2_holder *gh)
 	gh->gh_error = 0;
 	set_bit(HIF_HOLDER, &gh->gh_iflags);
 
-	gfs2_holder_dispose_or_wake(gh);
+	gfs2_holder_wake(gh);
 
 	return 0;
 }
@@ -554,32 +521,24 @@ static int rq_promote(struct gfs2_holder *gh)
  * Returns: 1 if the queue is blocked
  */
 
-static int rq_demote(struct gfs2_holder *gh)
+static int rq_demote(struct gfs2_glock *gl)
 {
-	struct gfs2_glock *gl = gh->gh_gl;
-
 	if (!list_empty(&gl->gl_holders))
 		return 1;
 
-	if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
-		list_del_init(&gh->gh_list);
-		gh->gh_error = 0;
-		spin_unlock(&gl->gl_spin);
-		gfs2_holder_dispose_or_wake(gh);
-		spin_lock(&gl->gl_spin);
-	} else {
-		gl->gl_req_gh = gh;
-		set_bit(GLF_LOCK, &gl->gl_flags);
-		spin_unlock(&gl->gl_spin);
-
-		if (gh->gh_state == LM_ST_UNLOCKED ||
-		    gl->gl_state != LM_ST_EXCLUSIVE)
-			gfs2_glock_drop_th(gl);
-		else
-			gfs2_glock_xmote_th(gh);
-
-		spin_lock(&gl->gl_spin);
+	if (gl->gl_state == gl->gl_demote_state ||
+	    gl->gl_state == LM_ST_UNLOCKED) {
+		clear_bit(GLF_DEMOTE, &gl->gl_flags);
+		return 0;
 	}
+	set_bit(GLF_LOCK, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+	if (gl->gl_demote_state == LM_ST_UNLOCKED ||
+	    gl->gl_state != LM_ST_EXCLUSIVE)
+		gfs2_glock_drop_th(gl);
+	else
+		gfs2_glock_xmote_th(gl, NULL);
+	spin_lock(&gl->gl_spin);
 
 	return 0;
 }
@@ -607,16 +566,8 @@ static void run_queue(struct gfs2_glock *gl)
 			else
 				gfs2_assert_warn(gl->gl_sbd, 0);
 
-		} else if (!list_empty(&gl->gl_waiters2) &&
-			   !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
-			gh = list_entry(gl->gl_waiters2.next,
-					struct gfs2_holder, gh_list);
-
-			if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
-				blocked = rq_demote(gh);
-			else
-				gfs2_assert_warn(gl->gl_sbd, 0);
-
+		} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+			blocked = rq_demote(gl);
 		} else if (!list_empty(&gl->gl_waiters3)) {
 			gh = list_entry(gl->gl_waiters3.next,
 					struct gfs2_holder, gh_list);
@@ -654,7 +605,7 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 		list_add_tail(&gh.gh_list, &gl->gl_waiters1);
 	} else {
-		gl->gl_owner = current;
+		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 		clear_bit(HIF_WAIT, &gh.gh_iflags);
 		smp_mb();
@@ -681,7 +632,7 @@ static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 		acquired = 0;
 	} else {
-		gl->gl_owner = current;
+		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
 	}
 	spin_unlock(&gl->gl_spin);
@@ -699,7 +650,7 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 {
 	spin_lock(&gl->gl_spin);
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	gl->gl_owner = NULL;
+	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	run_queue(gl);
 	BUG_ON(!spin_is_locked(&gl->gl_spin));
@@ -707,50 +658,24 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 }
 
 /**
- * handle_callback - add a demote request to a lock's queue
+ * handle_callback - process a demote request
  * @gl: the glock
  * @state: the state the caller wants us to change to
  *
- * Note: This may fail sliently if we are out of memory.
+ * There are only two requests that we are going to see in actual
+ * practise: LM_ST_SHARED and LM_ST_UNLOCKED
  */
 
 static void handle_callback(struct gfs2_glock *gl, unsigned int state)
 {
-	struct gfs2_holder *gh, *new_gh = NULL;
-
-restart:
 	spin_lock(&gl->gl_spin);
-
-	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
-		if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
-		    gl->gl_req_gh != gh) {
-			if (gh->gh_state != state)
-				gh->gh_state = LM_ST_UNLOCKED;
-			goto out;
-		}
-	}
-
-	if (new_gh) {
-		list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
-		new_gh = NULL;
-	} else {
-		spin_unlock(&gl->gl_spin);
-
-		new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_NOFS);
-		if (!new_gh)
-			return;
-		set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
-		set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
-		set_bit(HIF_WAIT, &new_gh->gh_iflags);
-
-		goto restart;
+	if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
+		gl->gl_demote_state = state;
+		gl->gl_demote_time = jiffies;
+	} else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
+		gl->gl_demote_state = state;
 	}
-
-out:
 	spin_unlock(&gl->gl_spin);
-
-	if (new_gh)
-		gfs2_holder_put(new_gh);
 }
 
 /**
@@ -810,56 +735,37 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 
 	/*  Deal with each possible exit condition  */
 
-	if (!gh)
+	if (!gh) {
 		gl->gl_stamp = jiffies;
-	else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		if (ret & LM_OUT_CANCELED)
+			op_done = 0;
+		else
+			clear_bit(GLF_DEMOTE, &gl->gl_flags);
+	} else {
 		spin_lock(&gl->gl_spin);
 		list_del_init(&gh->gh_list);
 		gh->gh_error = -EIO;
-		spin_unlock(&gl->gl_spin);
-	} else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		if (gl->gl_state == gh->gh_state ||
-		    gl->gl_state == LM_ST_UNLOCKED) {
+		if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 
+			goto out;
+		gh->gh_error = GLR_CANCELED;
+		if (ret & LM_OUT_CANCELED) 
+			goto out;
+		if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+			list_add_tail(&gh->gh_list, &gl->gl_holders);
 			gh->gh_error = 0;
-		} else {
-			if (gfs2_assert_warn(sdp, gh->gh_flags &
-					(LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
-				fs_warn(sdp, "ret = 0x%.8X\n", ret);
-			gh->gh_error = GLR_TRYFAILED;
+			set_bit(HIF_HOLDER, &gh->gh_iflags);
+			set_bit(HIF_FIRST, &gh->gh_iflags);
+			op_done = 0;
+			goto out;
 		}
-		spin_unlock(&gl->gl_spin);
-
-		if (ret & LM_OUT_CANCELED)
-			handle_callback(gl, LM_ST_UNLOCKED);
-
-	} else if (ret & LM_OUT_CANCELED) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
-		gh->gh_error = GLR_CANCELED;
-		spin_unlock(&gl->gl_spin);
-
-	} else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
-		spin_lock(&gl->gl_spin);
-		list_move_tail(&gh->gh_list, &gl->gl_holders);
-		gh->gh_error = 0;
-		set_bit(HIF_HOLDER, &gh->gh_iflags);
-		spin_unlock(&gl->gl_spin);
-
-		set_bit(HIF_FIRST, &gh->gh_iflags);
-
-		op_done = 0;
-
-	} else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
-		spin_lock(&gl->gl_spin);
-		list_del_init(&gh->gh_list);
 		gh->gh_error = GLR_TRYFAILED;
-		spin_unlock(&gl->gl_spin);
-
-	} else {
+		if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+			goto out;
+		gh->gh_error = -EINVAL;
 		if (gfs2_assert_withdraw(sdp, 0) == -1)
 			fs_err(sdp, "ret = 0x%.8X\n", ret);
+out:
+		spin_unlock(&gl->gl_spin);
 	}
 
 	if (glops->go_xmote_bh)
@@ -877,7 +783,7 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_glock_put(gl);
 
 	if (gh)
-		gfs2_holder_dispose_or_wake(gh);
+		gfs2_holder_wake(gh);
 }
 
 /**
@@ -888,12 +794,11 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
  *
  */
 
-void gfs2_glock_xmote_th(struct gfs2_holder *gh)
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
-	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int flags = gh->gh_flags;
-	unsigned state = gh->gh_state;
+	int flags = gh ? gh->gh_flags : 0;
+	unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
 				 LM_FLAG_NOEXP | LM_FLAG_ANY |
@@ -943,6 +848,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_assert_warn(sdp, !ret);
 
 	state_change(gl, LM_ST_UNLOCKED);
+	clear_bit(GLF_DEMOTE, &gl->gl_flags);
 
 	if (glops->go_inval)
 		glops->go_inval(gl, DIO_METADATA);
@@ -964,7 +870,7 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_glock_put(gl);
 
 	if (gh)
-		gfs2_holder_dispose_or_wake(gh);
+		gfs2_holder_wake(gh);
 }
 
 /**
@@ -1097,18 +1003,32 @@ static int glock_wait_internal(struct gfs2_holder *gh)
 }
 
 static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+find_holder_by_owner(struct list_head *head, pid_t pid)
 {
 	struct gfs2_holder *gh;
 
 	list_for_each_entry(gh, head, gh_list) {
-		if (gh->gh_owner == owner)
+		if (gh->gh_owner_pid == pid)
 			return gh;
 	}
 
 	return NULL;
 }
 
+static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	if (gi) {
+		vsprintf(gi->string, fmt, args);
+		seq_printf(gi->seq, gi->string);
+	}
+	else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
 /**
  * add_to_queue - Add a holder to the wait queue (but look for recursion)
  * @gh: the holder structure to add
@@ -1120,24 +1040,24 @@ static void add_to_queue(struct gfs2_holder *gh)
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_holder *existing;
 
-	BUG_ON(!gh->gh_owner);
+	BUG_ON(!gh->gh_owner_pid);
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
 		BUG();
 
-	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid);
 	if (existing) {
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
-		printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+		printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 				existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-		printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+		printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
 		printk(KERN_INFO "lock type : %d lock state : %d\n",
 				gl->gl_name.ln_type, gl->gl_state);
 		BUG();
 	}
 
-	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid);
 	if (existing) {
 		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
 		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
@@ -1267,9 +1187,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		if (glops->go_unlock)
 			glops->go_unlock(gh);
 
-		gl->gl_stamp = jiffies;
-
 		spin_lock(&gl->gl_spin);
+		gl->gl_stamp = jiffies;
 	}
 
 	clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1841,6 +1760,15 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
  *  Diagnostic routines to help debug distributed deadlock
  */
 
+static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
+                              unsigned long address)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(buffer, address);
+	print_dbg(gi, fmt, buffer);
+}
+
 /**
  * dump_holder - print information about a glock holder
  * @str: a string naming the type of holder
@@ -1849,31 +1777,37 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_holder(char *str, struct gfs2_holder *gh)
+static int dump_holder(struct glock_iter *gi, char *str,
+		       struct gfs2_holder *gh)
 {
 	unsigned int x;
-	int error = -ENOBUFS;
-
-	printk(KERN_INFO "  %s\n", str);
-	printk(KERN_INFO "    owner = %ld\n",
-		   (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
-	printk(KERN_INFO "    gh_state = %u\n", gh->gh_state);
-	printk(KERN_INFO "    gh_flags =");
+	struct task_struct *gh_owner;
+
+	print_dbg(gi, "  %s\n", str);
+	if (gh->gh_owner_pid) {
+		print_dbg(gi, "    owner = %ld ", (long)gh->gh_owner_pid);
+		gh_owner = find_task_by_pid(gh->gh_owner_pid);
+		if (gh_owner)
+			print_dbg(gi, "(%s)\n", gh_owner->comm);
+		else
+			print_dbg(gi, "(ended)\n");
+	} else
+		print_dbg(gi, "    owner = -1\n");
+	print_dbg(gi, "    gh_state = %u\n", gh->gh_state);
+	print_dbg(gi, "    gh_flags =");
 	for (x = 0; x < 32; x++)
 		if (gh->gh_flags & (1 << x))
-			printk(" %u", x);
-	printk(" \n");
-	printk(KERN_INFO "    error = %d\n", gh->gh_error);
-	printk(KERN_INFO "    gh_iflags =");
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+	print_dbg(gi, "    error = %d\n", gh->gh_error);
+	print_dbg(gi, "    gh_iflags =");
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &gh->gh_iflags))
-			printk(" %u", x);
-	printk(" \n");
-	print_symbol(KERN_INFO "    initialized at: %s\n", gh->gh_ip);
-
-	error = 0;
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+        gfs2_print_symbol(gi, "    initialized at: %s\n", gh->gh_ip);
 
-	return error;
+	return 0;
 }
 
 /**
@@ -1883,25 +1817,20 @@ static int dump_holder(char *str, struct gfs2_holder *gh)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_inode(struct gfs2_inode *ip)
+static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
 {
 	unsigned int x;
-	int error = -ENOBUFS;
 
-	printk(KERN_INFO "  Inode:\n");
-	printk(KERN_INFO "    num = %llu %llu\n",
-		    (unsigned long long)ip->i_num.no_formal_ino,
-		    (unsigned long long)ip->i_num.no_addr);
-	printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_inode.i_mode));
-	printk(KERN_INFO "    i_flags =");
+	print_dbg(gi, "  Inode:\n");
+	print_dbg(gi, "    num = %llu/%llu\n",
+		    ip->i_num.no_formal_ino, ip->i_num.no_addr);
+	print_dbg(gi, "    type = %u\n", IF2DT(ip->i_inode.i_mode));
+	print_dbg(gi, "    i_flags =");
 	for (x = 0; x < 32; x++)
 		if (test_bit(x, &ip->i_flags))
-			printk(" %u", x);
-	printk(" \n");
-
-	error = 0;
-
-	return error;
+			print_dbg(gi, " %u", x);
+	print_dbg(gi, " \n");
+	return 0;
 }
 
 /**
@@ -1912,74 +1841,86 @@ static int dump_inode(struct gfs2_inode *ip)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_glock(struct gfs2_glock *gl)
+static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
 {
 	struct gfs2_holder *gh;
 	unsigned int x;
 	int error = -ENOBUFS;
+	struct task_struct *gl_owner;
 
 	spin_lock(&gl->gl_spin);
 
-	printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
-	       (unsigned long long)gl->gl_name.ln_number);
-	printk(KERN_INFO "  gl_flags =");
+	print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+		   (unsigned long long)gl->gl_name.ln_number);
+	print_dbg(gi, "  gl_flags =");
 	for (x = 0; x < 32; x++) {
 		if (test_bit(x, &gl->gl_flags))
-			printk(" %u", x);
+			print_dbg(gi, " %u", x);
 	}
-	printk(" \n");
-	printk(KERN_INFO "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
-	printk(KERN_INFO "  gl_state = %u\n", gl->gl_state);
-	printk(KERN_INFO "  gl_owner = %s\n", gl->gl_owner->comm);
-	print_symbol(KERN_INFO "  gl_ip = %s\n", gl->gl_ip);
-	printk(KERN_INFO "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
-	printk(KERN_INFO "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
-	printk(KERN_INFO "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
-	printk(KERN_INFO "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-	printk(KERN_INFO "  le = %s\n",
+	if (!test_bit(GLF_LOCK, &gl->gl_flags))
+		print_dbg(gi, " (unlocked)");
+	print_dbg(gi, " \n");
+	print_dbg(gi, "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
+	print_dbg(gi, "  gl_state = %u\n", gl->gl_state);
+	if (gl->gl_owner_pid) {
+		gl_owner = find_task_by_pid(gl->gl_owner_pid);
+		if (gl_owner)
+			print_dbg(gi, "  gl_owner = pid %d (%s)\n",
+				  gl->gl_owner_pid, gl_owner->comm);
+		else
+			print_dbg(gi, "  gl_owner = %d (ended)\n",
+				  gl->gl_owner_pid);
+	} else
+		print_dbg(gi, "  gl_owner = -1\n");
+	print_dbg(gi, "  gl_ip = %lu\n", gl->gl_ip);
+	print_dbg(gi, "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+	print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+	print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+	print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
+	print_dbg(gi, "  le = %s\n",
 		   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
-	printk(KERN_INFO "  reclaim = %s\n",
-		    (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+	print_dbg(gi, "  reclaim = %s\n",
+		   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
 	if (gl->gl_aspace)
-		printk(KERN_INFO "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
-		       gl->gl_aspace->i_mapping->nrpages);
+		print_dbg(gi, "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+			   gl->gl_aspace->i_mapping->nrpages);
 	else
-		printk(KERN_INFO "  aspace = no\n");
-	printk(KERN_INFO "  ail = %d\n", atomic_read(&gl->gl_ail_count));
+		print_dbg(gi, "  aspace = no\n");
+	print_dbg(gi, "  ail = %d\n", atomic_read(&gl->gl_ail_count));
 	if (gl->gl_req_gh) {
-		error = dump_holder("Request", gl->gl_req_gh);
+		error = dump_holder(gi, "Request", gl->gl_req_gh);
 		if (error)
 			goto out;
 	}
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		error = dump_holder("Holder", gh);
+		error = dump_holder(gi, "Holder", gh);
 		if (error)
 			goto out;
 	}
 	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
-		error = dump_holder("Waiter1", gh);
-		if (error)
-			goto out;
-	}
-	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
-		error = dump_holder("Waiter2", gh);
+		error = dump_holder(gi, "Waiter1", gh);
 		if (error)
 			goto out;
 	}
 	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
-		error = dump_holder("Waiter3", gh);
+		error = dump_holder(gi, "Waiter3", gh);
 		if (error)
 			goto out;
 	}
+	if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+		print_dbg(gi, "  Demotion req to state %u (%llu uS ago)\n",
+			  gl->gl_demote_state,
+			  (u64)(jiffies - gl->gl_demote_time)*(1000000/HZ));
+	}
 	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
 		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
-		    list_empty(&gl->gl_holders)) {
-			error = dump_inode(gl->gl_object);
+			list_empty(&gl->gl_holders)) {
+			error = dump_inode(gi, gl->gl_object);
 			if (error)
 				goto out;
 		} else {
 			error = -ENOBUFS;
-			printk(KERN_INFO "  Inode: busy\n");
+			print_dbg(gi, "  Inode: busy\n");
 		}
 	}
 
@@ -2014,7 +1955,7 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
 			if (gl->gl_sbd != sdp)
 				continue;
 
-			error = dump_glock(gl);
+			error = dump_glock(NULL, gl);
 			if (error)
 				break;
 		}
@@ -2043,3 +1984,189 @@ int __init gfs2_glock_init(void)
 	return 0;
 }
 
+static int gfs2_glock_iter_next(struct glock_iter *gi)
+{
+	read_lock(gl_lock_addr(gi->hash));
+	while (1) {
+		if (!gi->hb_list) {  /* If we don't have a hash bucket yet */
+			gi->hb_list = &gl_hash_table[gi->hash].hb_list;
+			if (hlist_empty(gi->hb_list)) {
+				read_unlock(gl_lock_addr(gi->hash));
+				gi->hash++;
+				read_lock(gl_lock_addr(gi->hash));
+				gi->hb_list = NULL;
+				if (gi->hash >= GFS2_GL_HASH_SIZE) {
+					read_unlock(gl_lock_addr(gi->hash));
+					return 1;
+				}
+				else
+					continue;
+			}
+			if (!hlist_empty(gi->hb_list)) {
+				gi->gl = list_entry(gi->hb_list->first,
+						    struct gfs2_glock,
+						    gl_list);
+			}
+		} else {
+			if (gi->gl->gl_list.next == NULL) {
+				read_unlock(gl_lock_addr(gi->hash));
+				gi->hash++;
+				read_lock(gl_lock_addr(gi->hash));
+				gi->hb_list = NULL;
+				continue;
+			}
+			gi->gl = list_entry(gi->gl->gl_list.next,
+					    struct gfs2_glock, gl_list);
+		}
+		if (gi->gl)
+			break;
+	}
+	read_unlock(gl_lock_addr(gi->hash));
+	return 0;
+}
+
+static void gfs2_glock_iter_free(struct glock_iter *gi)
+{
+	kfree(gi);
+}
+
+static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
+{
+	struct glock_iter *gi;
+
+	gi = kmalloc(sizeof (*gi), GFP_KERNEL);
+	if (!gi)
+		return NULL;
+
+	gi->sdp = sdp;
+	gi->hash = 0;
+	gi->gl = NULL;
+	gi->hb_list = NULL;
+	gi->seq = NULL;
+	memset(gi->string, 0, sizeof(gi->string));
+
+	if (gfs2_glock_iter_next(gi)) {
+		gfs2_glock_iter_free(gi);
+		return NULL;
+	}
+
+	return gi;
+}
+
+static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct glock_iter *gi;
+	loff_t n = *pos;
+
+	gi = gfs2_glock_iter_init(file->private);
+	if (!gi)
+		return NULL;
+
+	while (n--) {
+		if (gfs2_glock_iter_next(gi)) {
+			gfs2_glock_iter_free(gi);
+			return NULL;
+		}
+	}
+
+	return gi;
+}
+
+static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+				 loff_t *pos)
+{
+	struct glock_iter *gi = iter_ptr;
+
+	(*pos)++;
+
+	if (gfs2_glock_iter_next(gi)) {
+		gfs2_glock_iter_free(gi);
+		return NULL;
+	}
+
+	return gi;
+}
+
+static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct glock_iter *gi = iter_ptr;
+
+	gi->seq = file;
+	dump_glock(gi, gi->gl);
+
+	return 0;
+}
+
+static struct seq_operations gfs2_glock_seq_ops = {
+	.start = gfs2_glock_seq_start,
+	.next  = gfs2_glock_seq_next,
+	.stop  = gfs2_glock_seq_stop,
+	.show  = gfs2_glock_seq_show,
+};
+
+static int gfs2_debugfs_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &gfs2_glock_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations gfs2_debug_fops = {
+	.owner   = THIS_MODULE,
+	.open    = gfs2_debugfs_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
+{
+	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+	if (!sdp->debugfs_dir)
+		return -ENOMEM;
+	sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
+							 S_IFREG | S_IRUGO,
+							 sdp->debugfs_dir, sdp,
+							 &gfs2_debug_fops);
+	if (!sdp->debugfs_dentry_glocks)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
+{
+	if (sdp && sdp->debugfs_dir) {
+		if (sdp->debugfs_dentry_glocks) {
+			debugfs_remove(sdp->debugfs_dentry_glocks);
+			sdp->debugfs_dentry_glocks = NULL;
+		}
+		debugfs_remove(sdp->debugfs_dir);
+		sdp->debugfs_dir = NULL;
+	}
+}
+
+int gfs2_register_debugfs(void)
+{
+	gfs2_root = debugfs_create_dir("gfs2", NULL);
+	return gfs2_root ? 0 : -ENOMEM;
+}
+
+void gfs2_unregister_debugfs(void)
+{
+	debugfs_remove(gfs2_root);
+	gfs2_root = NULL;
+}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f50e40ceca43..b3e152db70c8 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -10,6 +10,7 @@
 #ifndef __GLOCK_DOT_H__
 #define __GLOCK_DOT_H__
 
+#include <linux/sched.h>
 #include "incore.h"
 
 /* Flags for lock requests; used in gfs2_holder gh_flag field.
@@ -38,7 +39,7 @@ static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 	/* Look in glock's list of holders for one with current task as owner */
 	spin_lock(&gl->gl_spin);
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		if (gh->gh_owner == current) {
+		if (gh->gh_owner_pid == current->pid) {
 			locked = 1;
 			break;
 		}
@@ -67,7 +68,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 {
 	int ret;
 	spin_lock(&gl->gl_spin);
-	ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
+	ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
 	spin_unlock(&gl->gl_spin);
 	return ret;
 }
@@ -135,5 +136,9 @@ void gfs2_scand_internal(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
 
 int __init gfs2_glock_init(void);
+int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+int gfs2_register_debugfs(void);
+void gfs2_unregister_debugfs(void);
 
 #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 39c8ae23bd9c..7b82657a9910 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -163,10 +163,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 		if (ip) {
 			struct address_space *mapping = ip->i_inode.i_mapping;
 			int error = filemap_fdatawait(mapping);
-			if (error == -ENOSPC)
-				set_bit(AS_ENOSPC, &mapping->flags);
-			else if (error)
-				set_bit(AS_EIO, &mapping->flags);
+			mapping_set_error(mapping, error);
 		}
 		clear_bit(GLF_DIRTY, &gl->gl_flags);
 		gfs2_ail_empty_gl(gl);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 49f0dbf40d86..d995441373ab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -115,11 +115,8 @@ enum {
 	/* Actions */
 	HIF_MUTEX		= 0,
 	HIF_PROMOTE		= 1,
-	HIF_DEMOTE		= 2,
 
 	/* States */
-	HIF_ALLOCED		= 4,
-	HIF_DEALLOC		= 5,
 	HIF_HOLDER		= 6,
 	HIF_FIRST		= 7,
 	HIF_ABORTED		= 9,
@@ -130,7 +127,7 @@ struct gfs2_holder {
 	struct list_head gh_list;
 
 	struct gfs2_glock *gh_gl;
-	struct task_struct *gh_owner;
+	pid_t gh_owner_pid;
 	unsigned int gh_state;
 	unsigned gh_flags;
 
@@ -142,8 +139,8 @@ struct gfs2_holder {
 enum {
 	GLF_LOCK		= 1,
 	GLF_STICKY		= 2,
+	GLF_DEMOTE		= 3,
 	GLF_DIRTY		= 5,
-	GLF_SKIP_WAITERS2	= 6,
 };
 
 struct gfs2_glock {
@@ -156,11 +153,12 @@ struct gfs2_glock {
 
 	unsigned int gl_state;
 	unsigned int gl_hash;
-	struct task_struct *gl_owner;
+	unsigned int gl_demote_state; /* state requested by remote node */
+	unsigned long gl_demote_time; /* time of first demote request */
+	pid_t gl_owner_pid;
 	unsigned long gl_ip;
 	struct list_head gl_holders;
 	struct list_head gl_waiters1;	/* HIF_MUTEX */
-	struct list_head gl_waiters2;	/* HIF_DEMOTE */
 	struct list_head gl_waiters3;	/* HIF_PROMOTE */
 
 	const struct gfs2_glock_operations *gl_ops;
@@ -611,6 +609,8 @@ struct gfs2_sbd {
 
 	unsigned long sd_last_warning;
 	struct vfsmount *sd_gfs2mnt;
+	struct dentry *debugfs_dir;    /* debugfs directory */
+	struct dentry *debugfs_dentry_glocks; /* for debugfs */
 };
 
 #endif /* __INCORE_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index b167addf9fd1..c305255bfe8a 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -151,7 +151,7 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
 
 /* make_strname - convert GFS lock numbers to a string */
 
-static inline void make_strname(struct lm_lockname *lockname,
+static inline void make_strname(const struct lm_lockname *lockname,
 				struct gdlm_strname *str)
 {
 	sprintf(str->name, "%8x%16llx", lockname->ln_type,
@@ -169,6 +169,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
 		return -ENOMEM;
 
 	lp->lockname = *name;
+	make_strname(name, &lp->strname);
 	lp->ls = ls;
 	lp->cur = DLM_LOCK_IV;
 	lp->lvb = NULL;
@@ -227,7 +228,6 @@ void gdlm_put_lock(void *lock)
 unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 {
 	struct gdlm_ls *ls = lp->ls;
-	struct gdlm_strname str;
 	int error, bast = 1;
 
 	/*
@@ -249,8 +249,6 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 	if (test_bit(LFL_NOBAST, &lp->flags))
 		bast = 0;
 
-	make_strname(&lp->lockname, &str);
-
 	set_bit(LFL_ACTIVE, &lp->flags);
 
 	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
@@ -258,8 +256,8 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 		  lp->cur, lp->req, lp->lkf);
 
 	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
-			 str.name, str.namelen, 0, gdlm_ast, lp,
-			 bast ? gdlm_bast : NULL);
+			 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
+			 lp, bast ? gdlm_bast : NULL);
 
 	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
 		lp->lksb.sb_status = -EAGAIN;
@@ -268,7 +266,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
 	}
 
 	if (error) {
-		log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
+		log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  lp->cur, lp->req, lp->lkf, lp->flags);
@@ -296,7 +294,7 @@ static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
 	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
 
 	if (error) {
-		log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
+		log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
 			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
 			  (unsigned long long)lp->lockname.ln_number, error,
 			  lp->cur, lp->req, lp->lkf, lp->flags);
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a87c7bf3c568..d074c6e6f9bf 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -36,7 +36,7 @@
 
 #define GDLM_STRNAME_BYTES	24
 #define GDLM_LVB_SIZE		32
-#define GDLM_DROP_COUNT		200000
+#define GDLM_DROP_COUNT		0
 #define GDLM_DROP_PERIOD	60
 #define GDLM_NAME_LEN		128
 
@@ -106,6 +106,7 @@ enum {
 struct gdlm_lock {
 	struct gdlm_ls		*ls;
 	struct lm_lockname	lockname;
+	struct gdlm_strname	strname;
 	char			*lvb;
 	struct dlm_lksb		lksb;
 
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1dd4215b83d0..f82495e18c2d 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -25,6 +25,15 @@ struct plock_op {
 	struct gdlm_plock_info info;
 };
 
+struct plock_xop {
+	struct plock_op xop;
+	void *callback;
+	void *fl;
+	void *file;
+	struct file_lock flc;
+};
+
+
 static inline void set_version(struct gdlm_plock_info *info)
 {
 	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
@@ -64,12 +73,14 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 {
 	struct gdlm_ls *ls = lockspace;
 	struct plock_op *op;
+	struct plock_xop *xop;
 	int rv;
 
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
-	if (!op)
+	xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+	if (!xop)
 		return -ENOMEM;
 
+	op = &xop->xop;
 	op->info.optype		= GDLM_PLOCK_OP_LOCK;
 	op->info.pid		= fl->fl_pid;
 	op->info.ex		= (fl->fl_type == F_WRLCK);
@@ -79,9 +90,21 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
 	op->info.owner		= (__u64)(long) fl->fl_owner;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+		xop->callback	= fl->fl_lmops->fl_grant;
+		locks_init_lock(&xop->flc);
+		locks_copy_lock(&xop->flc, fl);
+		xop->fl		= fl;
+		xop->file	= file;
+	} else
+		xop->callback	= NULL;
 
 	send_op(op);
-	wait_event(recv_wq, (op->done != 0));
+
+	if (xop->callback == NULL)
+		wait_event(recv_wq, (op->done != 0));
+	else
+		return -EINPROGRESS;
 
 	spin_lock(&ops_lock);
 	if (!list_empty(&op->list)) {
@@ -99,7 +122,63 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 				  (unsigned long long)name->ln_number);
 	}
 
-	kfree(op);
+	kfree(xop);
+	return rv;
+}
+
+/* Returns failure iff a succesful lock operation should be canceled */
+static int gdlm_plock_callback(struct plock_op *op)
+{
+	struct file *file;
+	struct file_lock *fl;
+	struct file_lock *flc;
+	int (*notify)(void *, void *, int) = NULL;
+	struct plock_xop *xop = (struct plock_xop *)op;
+	int rv = 0;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		printk(KERN_INFO "plock op on list\n");
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	/* check if the following 2 are still valid or make a copy */
+	file = xop->file;
+	flc = &xop->flc;
+	fl = xop->fl;
+	notify = xop->callback;
+
+	if (op->info.rv) {
+		notify(flc, NULL, op->info.rv);
+		goto out;
+	}
+
+	/* got fs lock; bookkeep locally as well: */
+	flc->fl_flags &= ~FL_SLEEP;
+	if (posix_lock_file(file, flc, NULL)) {
+		/*
+		 * This can only happen in the case of kmalloc() failure.
+		 * The filesystem's own lock is the authoritative lock,
+		 * so a failure to get the lock locally is not a disaster.
+		 * As long as GFS cannot reliably cancel locks (especially
+		 * in a low-memory situation), we're better off ignoring
+		 * this failure than trying to recover.
+		 */
+		log_error("gdlm_plock: vfs lock error file %p fl %p",
+				file, fl);
+	}
+
+	rv = notify(flc, NULL, 0);
+	if (rv) {
+		/* XXX: We need to cancel the fs lock here: */
+		printk("gfs2 lock granted after lock request failed;"
+						" dangling lock!\n");
+		goto out;
+	}
+
+out:
+	kfree(xop);
 	return rv;
 }
 
@@ -138,6 +217,9 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
 
 	rv = op->info.rv;
 
+	if (rv == -ENOENT)
+		rv = 0;
+
 	kfree(op);
 	return rv;
 }
@@ -161,6 +243,7 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
 
+
 	send_op(op);
 	wait_event(recv_wq, (op->done != 0));
 
@@ -173,9 +256,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
 
 	rv = op->info.rv;
 
-	if (rv == 0)
-		fl->fl_type = F_UNLCK;
-	else if (rv > 0) {
+	fl->fl_type = F_UNLCK;
+	if (rv == -ENOENT)
+		rv = 0;
+	else if (rv == 0 && op->info.pid != fl->fl_pid) {
 		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
 		fl->fl_pid = op->info.pid;
 		fl->fl_start = op->info.start;
@@ -243,9 +327,14 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 	}
 	spin_unlock(&ops_lock);
 
-	if (found)
-		wake_up(&recv_wq);
-	else
+	if (found) {
+		struct plock_xop *xop;
+		xop = (struct plock_xop *)op;
+		if (xop->callback)
+			count = gdlm_plock_callback(op);
+		else
+			wake_up(&recv_wq);
+	} else
 		printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
 			(unsigned long long)info.number);
 	return count;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4746b884662d..d9fe3ca40e18 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -190,7 +190,6 @@ static struct kobj_type gdlm_ktype = {
 };
 
 static struct kset gdlm_kset = {
-	.subsys = &kernel_subsys,
 	.kobj   = {.name = "lock_dlm",},
 	.ktype  = &gdlm_ktype,
 };
@@ -225,6 +224,7 @@ int gdlm_sysfs_init(void)
 {
 	int error;
 
+	kobj_set_kset_s(&gdlm_kset, kernel_subsys);
 	error = kset_register(&gdlm_kset);
 	if (error)
 		printk("lock_dlm: cannot register kset %d\n", error);
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index acfbc941f319..0d149c8c493a 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -13,7 +13,6 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/lm_interface.h>
 
 struct nolock_lockspace {
@@ -164,13 +163,7 @@ static void nolock_unhold_lvb(void *lock, char *lvb)
 static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
 			    struct file *file, struct file_lock *fl)
 {
-	struct file_lock tmp;
-	int ret;
-
-	ret = posix_test_lock(file, fl, &tmp);
-	fl->fl_type = F_UNLCK;
-	if (ret)
-		memcpy(fl, &tmp, sizeof(struct file_lock));
+	posix_test_lock(file, fl);
 
 	return 0;
 }
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 16bb4b4561ae..f82d84d05d23 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -33,16 +33,17 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 	tr->tr_touched = 1;
 
-	if (!list_empty(&le->le_list))
-		return;
-
 	gl = container_of(le, struct gfs2_glock, gl_le);
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
 		return;
-	gfs2_glock_hold(gl);
-	set_bit(GLF_DIRTY, &gl->gl_flags);
 
 	gfs2_log_lock(sdp);
+	if (!list_empty(&le->le_list)){
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	gfs2_glock_hold(gl);
+	set_bit(GLF_DIRTY, &gl->gl_flags);
 	sdp->sd_log_num_gl++;
 	list_add(&le->le_list, &sdp->sd_log_le_gl);
 	gfs2_log_unlock(sdp);
@@ -415,13 +416,14 @@ static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 	tr->tr_touched = 1;
 
-	if (!list_empty(&le->le_list))
-		return;
-
 	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
-	gfs2_rgrp_bh_hold(rgd);
 
 	gfs2_log_lock(sdp);
+	if (!list_empty(&le->le_list)){
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	gfs2_rgrp_bh_hold(rgd);
 	sdp->sd_log_num_rg++;
 	list_add(&le->le_list, &sdp->sd_log_le_rg);
 	gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 6e8a59809abf..787a0edef100 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -27,32 +27,27 @@
 static void gfs2_init_inode_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct gfs2_inode *ip = foo;
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		inode_init_once(&ip->i_inode);
-		spin_lock_init(&ip->i_spin);
-		init_rwsem(&ip->i_rw_mutex);
-		memset(ip->i_cache, 0, sizeof(ip->i_cache));
-	}
+
+	inode_init_once(&ip->i_inode);
+	spin_lock_init(&ip->i_spin);
+	init_rwsem(&ip->i_rw_mutex);
+	memset(ip->i_cache, 0, sizeof(ip->i_cache));
 }
 
 static void gfs2_init_glock_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct gfs2_glock *gl = foo;
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		INIT_HLIST_NODE(&gl->gl_list);
-		spin_lock_init(&gl->gl_spin);
-		INIT_LIST_HEAD(&gl->gl_holders);
-		INIT_LIST_HEAD(&gl->gl_waiters1);
-		INIT_LIST_HEAD(&gl->gl_waiters2);
-		INIT_LIST_HEAD(&gl->gl_waiters3);
-		gl->gl_lvb = NULL;
-		atomic_set(&gl->gl_lvb_count, 0);
-		INIT_LIST_HEAD(&gl->gl_reclaim);
-		INIT_LIST_HEAD(&gl->gl_ail_list);
-		atomic_set(&gl->gl_ail_count, 0);
-	}
+
+	INIT_HLIST_NODE(&gl->gl_list);
+	spin_lock_init(&gl->gl_spin);
+	INIT_LIST_HEAD(&gl->gl_holders);
+	INIT_LIST_HEAD(&gl->gl_waiters1);
+	INIT_LIST_HEAD(&gl->gl_waiters3);
+	gl->gl_lvb = NULL;
+	atomic_set(&gl->gl_lvb_count, 0);
+	INIT_LIST_HEAD(&gl->gl_reclaim);
+	INIT_LIST_HEAD(&gl->gl_ail_list);
+	atomic_set(&gl->gl_ail_count, 0);
 }
 
 /**
@@ -103,6 +98,8 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
+	gfs2_register_debugfs();
+
 	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
 
 	return 0;
@@ -130,6 +127,7 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
+	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 32caecd20300..4864659555d4 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -13,6 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/parser.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -20,6 +21,52 @@
 #include "sys.h"
 #include "util.h"
 
+enum {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_nodebug,
+	Opt_upgrade,
+	Opt_num_glockd,
+	Opt_acl,
+	Opt_noacl,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+	Opt_suiddir,
+	Opt_nosuiddir,
+	Opt_data_writeback,
+	Opt_data_ordered,
+};
+
+static match_table_t tokens = {
+	{Opt_lockproto, "lockproto=%s"},
+	{Opt_locktable, "locktable=%s"},
+	{Opt_hostdata, "hostdata=%s"},
+	{Opt_spectator, "spectator"},
+	{Opt_ignore_local_fs, "ignore_local_fs"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_localcaching, "localcaching"},
+	{Opt_debug, "debug"},
+	{Opt_nodebug, "nodebug"},
+	{Opt_upgrade, "upgrade"},
+	{Opt_num_glockd, "num_glockd=%d"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_quota_off, "quota=off"},
+	{Opt_quota_account, "quota=account"},
+	{Opt_quota_on, "quota=on"},
+	{Opt_suiddir, "suiddir"},
+	{Opt_nosuiddir, "nosuiddir"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_ordered, "data=ordered"}
+};
+
 /**
  * gfs2_mount_args - Parse mount options
  * @sdp:
@@ -54,146 +101,150 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	   process them */
 
 	for (options = data; (o = strsep(&options, ",")); ) {
+		int token, option;
+		substring_t tmp[MAX_OPT_ARGS];
+
 		if (!*o)
 			continue;
 
-		v = strchr(o, '=');
-		if (v)
-			*v++ = 0;
+		token = match_token(o, tokens, tmp);
+		switch (token) {
+		case Opt_lockproto:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for lockproto\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
-		if (!strcmp(o, "lockproto")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_lockproto))
+			if (remount && strcmp(v, args->ar_lockproto)) {
+				kfree(v);
 				goto cant_remount;
+			}
+			
 			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
 			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
+			kfree(v);
+			break;
+		case Opt_locktable:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for locktable\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
-		else if (!strcmp(o, "locktable")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_locktable))
+			if (remount && strcmp(v, args->ar_locktable)) {
+				kfree(v);
 				goto cant_remount;
+			}
+
 			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
-			args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
+			args->ar_locktable[GFS2_LOCKNAME_LEN - 1]  = 0;
+			kfree(v);
+			break;
+		case Opt_hostdata:
+			v = match_strdup(&tmp[0]);
+			if (!v) {
+				fs_info(sdp, "no memory for hostdata\n");
+				error = -ENOMEM;
+				goto out_error;
+			}
 
-		else if (!strcmp(o, "hostdata")) {
-			if (!v)
-				goto need_value;
-			if (remount && strcmp(v, args->ar_hostdata))
+			if (remount && strcmp(v, args->ar_hostdata)) {
+				kfree(v);
 				goto cant_remount;
+			}
+
 			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
 			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
-		}
-
-		else if (!strcmp(o, "spectator")) {
+			kfree(v);
+			break;
+		case Opt_spectator:
 			if (remount && !args->ar_spectator)
 				goto cant_remount;
 			args->ar_spectator = 1;
 			sdp->sd_vfs->s_flags |= MS_RDONLY;
-		}
-
-		else if (!strcmp(o, "ignore_local_fs")) {
+			break;
+		case Opt_ignore_local_fs:
 			if (remount && !args->ar_ignore_local_fs)
 				goto cant_remount;
 			args->ar_ignore_local_fs = 1;
-		}
-
-		else if (!strcmp(o, "localflocks")) {
+			break;
+		case Opt_localflocks:
 			if (remount && !args->ar_localflocks)
 				goto cant_remount;
 			args->ar_localflocks = 1;
-		}
-
-		else if (!strcmp(o, "localcaching")) {
+			break;
+		case Opt_localcaching:
 			if (remount && !args->ar_localcaching)
 				goto cant_remount;
 			args->ar_localcaching = 1;
-		}
-
-		else if (!strcmp(o, "debug"))
+			break;
+		case Opt_debug:
 			args->ar_debug = 1;
-
-		else if (!strcmp(o, "nodebug"))
+			break;
+		case Opt_nodebug:
 			args->ar_debug = 0;
-
-		else if (!strcmp(o, "upgrade")) {
+			break;
+		case Opt_upgrade:
 			if (remount && !args->ar_upgrade)
 				goto cant_remount;
 			args->ar_upgrade = 1;
-		}
+			break;
+		case Opt_num_glockd:
+			if ((error = match_int(&tmp[0], &option))) {
+				fs_info(sdp, "problem getting num_glockd\n");
+				goto out_error;
+			}
 
-		else if (!strcmp(o, "num_glockd")) {
-			unsigned int x;
-			if (!v)
-				goto need_value;
-			sscanf(v, "%u", &x);
-			if (remount && x != args->ar_num_glockd)
+			if (remount && option != args->ar_num_glockd)
 				goto cant_remount;
-			if (!x || x > GFS2_GLOCKD_MAX) {
+			if (!option || option > GFS2_GLOCKD_MAX) {
 				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
-				        GFS2_GLOCKD_MAX, x);
+				        GFS2_GLOCKD_MAX, option);
 				error = -EINVAL;
-				break;
+				goto out_error;
 			}
-			args->ar_num_glockd = x;
-		}
-
-		else if (!strcmp(o, "acl")) {
+			args->ar_num_glockd = option;
+			break;
+		case Opt_acl:
 			args->ar_posix_acl = 1;
 			sdp->sd_vfs->s_flags |= MS_POSIXACL;
-		}
-
-		else if (!strcmp(o, "noacl")) {
+			break;
+		case Opt_noacl:
 			args->ar_posix_acl = 0;
 			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
-		}
-
-		else if (!strcmp(o, "quota")) {
-			if (!v)
-				goto need_value;
-			if (!strcmp(v, "off"))
-				args->ar_quota = GFS2_QUOTA_OFF;
-			else if (!strcmp(v, "account"))
-				args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			else if (!strcmp(v, "on"))
-				args->ar_quota = GFS2_QUOTA_ON;
-			else {
-				fs_info(sdp, "invalid value for quota\n");
-				error = -EINVAL;
-				break;
-			}
-		}
-
-		else if (!strcmp(o, "suiddir"))
+			break;
+		case Opt_quota_off:
+			args->ar_quota = GFS2_QUOTA_OFF;
+			break;
+		case Opt_quota_account:
+			args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			break;
+		case Opt_quota_on:
+			args->ar_quota = GFS2_QUOTA_ON;
+			break;
+		case Opt_suiddir:
 			args->ar_suiddir = 1;
-
-		else if (!strcmp(o, "nosuiddir"))
+			break;
+		case Opt_nosuiddir:
 			args->ar_suiddir = 0;
-
-		else if (!strcmp(o, "data")) {
-			if (!v)
-				goto need_value;
-			if (!strcmp(v, "writeback"))
-				args->ar_data = GFS2_DATA_WRITEBACK;
-			else if (!strcmp(v, "ordered"))
-				args->ar_data = GFS2_DATA_ORDERED;
-			else {
-				fs_info(sdp, "invalid value for data\n");
-				error = -EINVAL;
-				break;
-			}
-		}
-
-		else {
+			break;
+		case Opt_data_writeback:
+			args->ar_data = GFS2_DATA_WRITEBACK;
+			break;
+		case Opt_data_ordered:
+			args->ar_data = GFS2_DATA_ORDERED;
+			break;
+		default:
 			fs_info(sdp, "unknown option: %s\n", o);
 			error = -EINVAL;
-			break;
+			goto out_error;
 		}
 	}
 
+out_error:
 	if (error)
 		fs_info(sdp, "invalid mount option(s)\n");
 
@@ -202,10 +253,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 
 	return error;
 
-need_value:
-	fs_info(sdp, "need value for option %s\n", o);
-	return -EINVAL;
-
 cant_remount:
 	fs_info(sdp, "can't remount with option %s\n", o);
 	return -EINVAL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index b3b7e8475359..30c15622174f 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -197,7 +197,19 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	void *kaddr;
 	int error;
 
-	BUG_ON(page->index);
+	/*
+	 * Due to the order of unstuffing files and ->nopage(), we can be
+	 * asked for a zero page in the case of a stuffed file being extended,
+	 * so we need to supply one here. It doesn't happen often.
+	 */
+	if (unlikely(page->index)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr, 0, PAGE_CACHE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		return 0;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
@@ -208,9 +220,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	       ip->i_di.di_size);
 	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
 	kunmap_atomic(kaddr, KM_USER0);
-
+	flush_dcache_page(page);
 	brelse(dibh);
-
 	SetPageUptodate(page);
 
 	return 0;
@@ -507,7 +518,9 @@ static int gfs2_commit_write(struct file *file, struct page *page,
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 	}
+	unlock_page(page);
 	gfs2_glock_dq_m(1, &ip->i_gh);
+	lock_page(page);
 	gfs2_holder_uninit(&ip->i_gh);
 	return 0;
 
@@ -520,7 +533,9 @@ fail_endtrans:
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 	}
+	unlock_page(page);
 	gfs2_glock_dq_m(1, &ip->i_gh);
+	lock_page(page);
 	gfs2_holder_uninit(&ip->i_gh);
 fail_nounlock:
 	ClearPageUptodate(page);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index c6bac6b69420..a6fdc52f554a 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -11,7 +11,6 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index b50180e22779..064df8804582 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,7 +15,6 @@
 #include <linux/uio.h>
 #include <linux/blkdev.h>
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
@@ -513,18 +512,18 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (sdp->sd_args.ar_localflocks) {
 		if (IS_GETLK(cmd)) {
-			struct file_lock tmp;
-			int ret;
-			ret = posix_test_lock(file, fl, &tmp);
-			fl->fl_type = F_UNLCK;
-			if (ret)
-				memcpy(fl, &tmp, sizeof(struct file_lock));
+			posix_test_lock(file, fl);
 			return 0;
 		} else {
 			return posix_lock_file_wait(file, fl);
 		}
 	}
 
+	if (cmd == F_CANCELLK) {
+		/* Hack: */
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
 	if (IS_GETLK(cmd))
 		return gfs2_lm_plock_get(sdp, &name, file, fl);
 	else if (fl->fl_type == F_UNLCK)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ee54cb667083..2c5f8e7def0d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -690,6 +690,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	if (error)
 		goto fail;
 
+	gfs2_create_debugfs_file(sdp);
+
 	error = gfs2_sys_fs_add(sdp);
 	if (error)
 		goto fail;
@@ -754,6 +756,7 @@ fail_lm:
 fail_sys:
 	gfs2_sys_fs_del(sdp);
 fail:
+	gfs2_delete_debugfs_file(sdp);
 	kfree(sdp);
 	sb->s_fs_info = NULL;
 	return error;
@@ -896,6 +899,7 @@ error:
 
 static void gfs2_kill_sb(struct super_block *sb)
 {
+	gfs2_delete_debugfs_file(sb->s_fs_info);
 	kill_block_super(sb);
 }
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index b89999d3a767..485ce3d49923 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -284,6 +284,31 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 }
 
 /**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then its because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+static void gfs2_drop_inode(struct inode *inode)
+{
+	if (inode->i_private && inode->i_nlink) {
+		struct gfs2_inode *ip = GFS2_I(inode);
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+			clear_nlink(inode);
+	}
+	generic_drop_inode(inode);
+}
+
+/**
  * gfs2_clear_inode - Deallocate an inode when VFS is done with it
  * @inode: The VFS inode
  *
@@ -441,7 +466,7 @@ out_unlock:
 out_uninit:
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
-	if (error)
+	if (error && error != GLR_TRYFAILED)
 		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
@@ -481,6 +506,7 @@ const struct super_operations gfs2_super_ops = {
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
 	.clear_inode		= gfs2_clear_inode,
+	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8d9c08b5c4b6..1727f5012efe 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -27,6 +27,7 @@
 #include "trans.h"
 #include "ops_file.h"
 #include "util.h"
+#include "log.h"
 
 #define BFITNOENT ((u32)~0)
 
@@ -697,8 +698,6 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
  * @al: the struct gfs2_alloc structure describing the reservation
  *
  * If there's room for the requested blocks to be allocated from the RG:
- *   Sets the $al_reserved_data field in @al.
- *   Sets the $al_reserved_meta field in @al.
  *   Sets the $al_rgd field in @al.
  *
  * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
@@ -709,6 +708,9 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	int ret = 0;
 
+	if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC)
+		return 0;
+
 	spin_lock(&sdp->sd_rindex_spin);
 	if (rgd->rd_free_clone >= al->al_requested) {
 		al->al_rgd = rgd;
@@ -941,9 +943,13 @@ static int get_local_rgrp(struct gfs2_inode *ip)
 			rgd = gfs2_rgrpd_get_first(sdp);
 
 		if (rgd == begin) {
-			if (++loops >= 2 || !skipped)
+			if (++loops >= 3)
 				return -ENOSPC;
+			if (!skipped)
+				loops++;
 			flags = 0;
+			if (loops == 2)
+				gfs2_log_flush(sdp, NULL);
 		}
 	}
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d01f9f0fda26..c26c21b53c19 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -222,7 +222,6 @@ static struct kobj_type gfs2_ktype = {
 };
 
 static struct kset gfs2_kset = {
-	.subsys = &fs_subsys,
 	.kobj   = {.name = "gfs2"},
 	.ktype  = &gfs2_ktype,
 };
@@ -554,6 +553,7 @@ int gfs2_sys_init(void)
 {
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
+	kobj_set_kset_s(&gfs2_kset, fs_subsys);
 	return kset_register(&gfs2_kset);
 }
 
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 5fd0ed71f923..8a3a650abc87 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/log2.h>
 
 #include "btree.h"
 
@@ -76,7 +77,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	tree->depth = be16_to_cpu(head->depth);
 
 	size = tree->node_size;
-	if (!size || size & (size - 1))
+	if (!is_power_of_2(size))
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index fafcba593871..9a934db0bd8a 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -13,6 +13,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
+#include <linux/sched.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 623f509f1d47..92cf8751e428 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -434,8 +434,7 @@ static void hfs_init_once(void *p, struct kmem_cache *cachep, unsigned long flag
 {
 	struct hfs_inode_info *i = p;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&i->vfs_inode);
+	inode_init_once(&i->vfs_inode);
 }
 
 static int __init init_hfs_fs(void)
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index a9b9e872e29a..90ebab753d30 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -10,6 +10,7 @@
 
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/log2.h>
 
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
@@ -69,7 +70,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	}
 
 	size = tree->node_size;
-	if (!size || size & (size - 1))
+	if (!is_power_of_2(size))
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 642012ac3370..45dab5d6cc10 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
+#include <linux/sched.h>
 
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1a97f9293447..ebd1b380cbbc 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -470,8 +470,7 @@ static void hfsplus_init_once(void *p, struct kmem_cache *cachep, unsigned long
 {
 	struct hfsplus_inode_info *i = p;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&i->vfs_inode);
+	inode_init_once(&i->vfs_inode);
 }
 
 static int __init init_hfsplus_fs(void)
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 70543b17e4c7..06e5930515fe 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -55,7 +55,7 @@ extern int stat_file(const char *path, unsigned long long *inode_out,
 		     int *mode_out, int *nlink_out, int *uid_out, int *gid_out,
 		     unsigned long long *size_out, struct timespec *atime_out,
 		     struct timespec *mtime_out, struct timespec *ctime_out,
-		     int *blksize_out, unsigned long long *blocks_out);
+		     int *blksize_out, unsigned long long *blocks_out, int fd);
 extern int access_file(char *path, int r, int w, int x);
 extern int open_file(char *path, int r, int w, int append);
 extern int file_type(const char *path, int *maj, int *min);
@@ -71,7 +71,7 @@ extern int lseek_file(int fd, long long offset, int whence);
 extern int fsync_file(int fd, int datasync);
 extern int file_create(char *name, int ur, int uw, int ux, int gr,
 		       int gw, int gx, int or, int ow, int ox);
-extern int set_attr(const char *file, struct hostfs_iattr *attrs);
+extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd);
 extern int make_symlink(const char *from, const char *to);
 extern int unlink_file(const char *file);
 extern int do_mkdir(const char *file, int mode);
@@ -87,14 +87,3 @@ extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long *spare_out);
 
 #endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fd301a910122..8286491dbf31 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  *
  * Ported the filesystem routines to 2.5.
@@ -31,14 +31,14 @@ struct hostfs_inode_info {
 
 static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 {
-	return(list_entry(inode, struct hostfs_inode_info, vfs_inode));
+	return list_entry(inode, struct hostfs_inode_info, vfs_inode);
 }
 
 #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
 
 int hostfs_d_delete(struct dentry *dentry)
 {
-	return(1);
+	return 1;
 }
 
 struct dentry_operations hostfs_dentry_ops = {
@@ -79,7 +79,7 @@ static int __init hostfs_args(char *options, int *add)
 		}
 		options = ptr;
 	}
-	return(0);
+	return 0;
 }
 
 __uml_setup("hostfs=", hostfs_args,
@@ -110,7 +110,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	root = HOSTFS_I(parent->d_inode)->host_filename;
 	len += strlen(root);
 	name = kmalloc(len + extra + 1, GFP_KERNEL);
-	if(name == NULL) return(NULL);
+	if(name == NULL)
+		return NULL;
 
 	name[len] = '\0';
 	parent = dentry;
@@ -122,7 +123,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 		parent = parent->d_parent;
 	}
 	strncpy(name, root, strlen(root));
-	return(name);
+	return name;
 }
 
 static char *inode_name(struct inode *ino, int extra)
@@ -130,7 +131,7 @@ static char *inode_name(struct inode *ino, int extra)
 	struct dentry *dentry;
 
 	dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias);
-	return(dentry_name(dentry, extra));
+	return dentry_name(dentry, extra);
 }
 
 static int read_name(struct inode *ino, char *name)
@@ -147,16 +148,16 @@ static int read_name(struct inode *ino, char *name)
 
 	err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid,
 			&ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime,
-			&ino->i_ctime, &i_blksize, &i_blocks);
+			&ino->i_ctime, &i_blksize, &i_blocks, -1);
 	if(err)
-		return(err);
+		return err;
 
 	ino->i_ino = i_ino;
 	ino->i_mode = i_mode;
 	ino->i_nlink = i_nlink;
 	ino->i_size = i_size;
 	ino->i_blocks = i_blocks;
-	return(0);
+	return 0;
 }
 
 static char *follow_link(char *link)
@@ -181,11 +182,11 @@ static char *follow_link(char *link)
 		goto out_free;
 
 	if(*name == '/')
-		return(name);
+		return name;
 
 	end = strrchr(link, '/');
 	if(end == NULL)
-		return(name);
+		return name;
 
 	*(end + 1) = '\0';
 	len = strlen(link) + strlen(name) + 1;
@@ -199,12 +200,12 @@ static char *follow_link(char *link)
 	sprintf(resolved, "%s%s", link, name);
 	kfree(name);
 	kfree(link);
-	return(resolved);
+	return resolved;
 
  out_free:
 	kfree(name);
  out:
-	return(ERR_PTR(n));
+	return ERR_PTR(n);
 }
 
 static int read_inode(struct inode *ino)
@@ -234,7 +235,7 @@ static int read_inode(struct inode *ino)
 	err = read_name(ino, name);
 	kfree(name);
  out:
-	return(err);
+	return err;
 }
 
 int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -254,14 +255,15 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
 			&sf->f_namelen, sf->f_spare);
-	if(err) return(err);
+	if(err)
+		return err;
 	sf->f_blocks = f_blocks;
 	sf->f_bfree = f_bfree;
 	sf->f_bavail = f_bavail;
 	sf->f_files = f_files;
 	sf->f_ffree = f_ffree;
 	sf->f_type = HOSTFS_SUPER_MAGIC;
-	return(0);
+	return 0;
 }
 
 static struct inode *hostfs_alloc_inode(struct super_block *sb)
@@ -270,13 +272,13 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 
 	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
 	if(hi == NULL)
-		return(NULL);
+		return NULL;
 
 	*hi = ((struct hostfs_inode_info) { .host_filename	= NULL,
 					    .fd			= -1,
 					    .mode		= 0 });
 	inode_init_once(&hi->vfs_inode);
-	return(&hi->vfs_inode);
+	return &hi->vfs_inode;
 }
 
 static void hostfs_delete_inode(struct inode *inode)
@@ -325,10 +327,12 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 	int error, len;
 
 	name = dentry_name(file->f_path.dentry, 0);
-	if(name == NULL) return(-ENOMEM);
+	if(name == NULL)
+		return -ENOMEM;
 	dir = open_dir(name, &error);
 	kfree(name);
-	if(dir == NULL) return(-error);
+	if(dir == NULL)
+		return -error;
 	next = file->f_pos;
 	while((name = read_dir(dir, &next, &ino, &len)) != NULL){
 		error = (*filldir)(ent, name, len, file->f_pos,
@@ -337,7 +341,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 		file->f_pos = next;
 	}
 	close_dir(dir);
-	return(0);
+	return 0;
 }
 
 int hostfs_file_open(struct inode *ino, struct file *file)
@@ -347,7 +351,7 @@ int hostfs_file_open(struct inode *ino, struct file *file)
 
 	mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
 	if((mode & HOSTFS_I(ino)->mode) == mode)
-		return(0);
+		return 0;
 
 	/* The file may already have been opened, but with the wrong access,
 	 * so this resets things and reopens the file with the new access.
@@ -367,14 +371,15 @@ int hostfs_file_open(struct inode *ino, struct file *file)
 
 	name = dentry_name(file->f_path.dentry, 0);
 	if(name == NULL)
-		return(-ENOMEM);
+		return -ENOMEM;
 
 	fd = open_file(name, r, w, append);
 	kfree(name);
-	if(fd < 0) return(fd);
+	if(fd < 0)
+		return fd;
 	FILE_HOSTFS_I(file)->fd = fd;
 
-	return(0);
+	return 0;
 }
 
 int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
@@ -458,7 +463,7 @@ int hostfs_readpage(struct file *file, struct page *page)
  out:
 	kunmap(page);
 	unlock_page(page);
-	return(err);
+	return err;
 }
 
 int hostfs_prepare_write(struct file *file, struct page *page,
@@ -485,7 +490,7 @@ int hostfs_prepare_write(struct file *file, struct page *page,
 	err = 0;
  out:
 	kunmap(page);
-	return(err);
+	return err;
 }
 
 int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
@@ -511,7 +516,7 @@ int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
 		inode->i_size = start;
 
 	kunmap(page);
-	return(err);
+	return err;
 }
 
 static const struct address_space_operations hostfs_aops = {
@@ -569,7 +574,7 @@ static int init_inode(struct inode *inode, struct dentry *dentry)
 		break;
 	}
  out:
-	return(err);
+	return err;
 }
 
 int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
@@ -607,16 +612,16 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	HOSTFS_I(inode)->fd = fd;
 	HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE;
 	d_instantiate(dentry, inode);
-	return(0);
+	return 0;
 
  out_put:
 	iput(inode);
  out:
-	return(error);
+	return error;
 }
 
 struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
-                            struct nameidata *nd)
+			     struct nameidata *nd)
 {
 	struct inode *inode;
 	char *name;
@@ -647,44 +652,45 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
 
 	d_add(dentry, inode);
 	dentry->d_op = &hostfs_dentry_ops;
-	return(NULL);
+	return NULL;
 
  out_put:
 	iput(inode);
  out:
-	return(ERR_PTR(err));
+	return ERR_PTR(err);
 }
 
 static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
 {
-        char *file;
+	char *file;
 	int len;
 
 	file = inode_name(ino, dentry->d_name.len + 1);
-	if(file == NULL) return(NULL);
-        strcat(file, "/");
+	if(file == NULL)
+		return NULL;
+	strcat(file, "/");
 	len = strlen(file);
-        strncat(file, dentry->d_name.name, dentry->d_name.len);
+	strncat(file, dentry->d_name.name, dentry->d_name.len);
 	file[len + dentry->d_name.len] = '\0';
-        return(file);
+	return file;
 }
 
 int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
 {
-        char *from_name, *to_name;
-        int err;
+	char *from_name, *to_name;
+	int err;
 
-        if((from_name = inode_dentry_name(ino, from)) == NULL)
-                return(-ENOMEM);
-        to_name = dentry_name(to, 0);
+	if((from_name = inode_dentry_name(ino, from)) == NULL)
+		return -ENOMEM;
+	to_name = dentry_name(to, 0);
 	if(to_name == NULL){
 		kfree(from_name);
-		return(-ENOMEM);
+		return -ENOMEM;
 	}
-        err = link_file(to_name, from_name);
-        kfree(from_name);
-        kfree(to_name);
-        return(err);
+	err = link_file(to_name, from_name);
+	kfree(from_name);
+	kfree(to_name);
+	return err;
 }
 
 int hostfs_unlink(struct inode *ino, struct dentry *dentry)
@@ -692,13 +698,14 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
 	char *file;
 	int err;
 
-	if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
+	if((file = inode_dentry_name(ino, dentry)) == NULL)
+		return -ENOMEM;
 	if(append)
-		return(-EPERM);
+		return -EPERM;
 
 	err = unlink_file(file);
 	kfree(file);
-	return(err);
+	return err;
 }
 
 int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
@@ -706,10 +713,11 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
 	char *file;
 	int err;
 
-	if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
+	if((file = inode_dentry_name(ino, dentry)) == NULL)
+		return -ENOMEM;
 	err = make_symlink(file, to);
 	kfree(file);
-	return(err);
+	return err;
 }
 
 int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
@@ -717,10 +725,11 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
 	char *file;
 	int err;
 
-	if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
+	if((file = inode_dentry_name(ino, dentry)) == NULL)
+		return -ENOMEM;
 	err = do_mkdir(file, mode);
 	kfree(file);
-	return(err);
+	return err;
 }
 
 int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
@@ -728,10 +737,11 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
 	char *file;
 	int err;
 
-	if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
+	if((file = inode_dentry_name(ino, dentry)) == NULL)
+		return -ENOMEM;
 	err = do_rmdir(file);
 	kfree(file);
-	return(err);
+	return err;
 }
 
 int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
@@ -764,14 +774,14 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 		goto out_put;
 
 	d_instantiate(dentry, inode);
-	return(0);
+	return 0;
 
  out_free:
 	kfree(name);
  out_put:
 	iput(inode);
  out:
-	return(err);
+	return err;
 }
 
 int hostfs_rename(struct inode *from_ino, struct dentry *from,
@@ -781,15 +791,15 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	int err;
 
 	if((from_name = inode_dentry_name(from_ino, from)) == NULL)
-		return(-ENOMEM);
+		return -ENOMEM;
 	if((to_name = inode_dentry_name(to_ino, to)) == NULL){
 		kfree(from_name);
-		return(-ENOMEM);
+		return -ENOMEM;
 	}
 	err = rename_file(from_name, to_name);
 	kfree(from_name);
 	kfree(to_name);
-	return(err);
+	return err;
 }
 
 int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
@@ -801,7 +811,8 @@ int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
 	if (desired & MAY_WRITE) w = 1;
 	if (desired & MAY_EXEC) x = 1;
 	name = inode_name(ino, 0);
-	if (name == NULL) return(-ENOMEM);
+	if (name == NULL)
+		return -ENOMEM;
 
 	if (S_ISCHR(ino->i_mode) || S_ISBLK(ino->i_mode) ||
 			S_ISFIFO(ino->i_mode) || S_ISSOCK(ino->i_mode))
@@ -820,6 +831,8 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 	char *name;
 	int err;
 
+	int fd = HOSTFS_I(dentry->d_inode)->fd;
+
 	err = inode_change_ok(dentry->d_inode, attr);
 	if (err)
 		return err;
@@ -863,20 +876,21 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
 	}
 	name = dentry_name(dentry, 0);
-	if(name == NULL) return(-ENOMEM);
-	err = set_attr(name, &attrs);
+	if(name == NULL)
+		return -ENOMEM;
+	err = set_attr(name, &attrs, fd);
 	kfree(name);
 	if(err)
-		return(err);
+		return err;
 
-	return(inode_setattr(dentry->d_inode, attr));
+	return inode_setattr(dentry->d_inode, attr);
 }
 
 int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	   struct kstat *stat)
 {
 	generic_fillattr(dentry->d_inode, stat);
-	return(0);
+	return 0;
 }
 
 static const struct inode_operations hostfs_iops = {
@@ -915,7 +929,8 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 
 	buffer = kmap(page);
 	name = inode_name(page->mapping->host, 0);
-	if(name == NULL) return(-ENOMEM);
+	if(name == NULL)
+		return -ENOMEM;
 	err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
 	kfree(name);
 	if(err == PAGE_CACHE_SIZE)
@@ -928,7 +943,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	}
 	kunmap(page);
 	unlock_page(page);
-	return(err);
+	return err;
 }
 
 static const struct address_space_operations hostfs_link_aops = {
@@ -978,20 +993,20 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 
 	err = read_inode(root_inode);
 	if(err){
-                /* No iput in this case because the dput does that for us */
-                dput(sb->s_root);
-                sb->s_root = NULL;
+		/* No iput in this case because the dput does that for us */
+		dput(sb->s_root);
+		sb->s_root = NULL;
 		goto out;
-        }
+	}
 
-	return(0);
+	return 0;
 
- out_put:
-        iput(root_inode);
- out_free:
+out_put:
+	iput(root_inode);
+out_free:
 	kfree(host_root_path);
- out:
-	return(err);
+out:
+	return err;
 }
 
 static int hostfs_read_sb(struct file_system_type *type,
@@ -1011,7 +1026,7 @@ static struct file_system_type hostfs_type = {
 
 static int __init init_hostfs(void)
 {
-	return(register_filesystem(&hostfs_type));
+	return register_filesystem(&hostfs_type);
 }
 
 static void __exit exit_hostfs(void)
@@ -1022,14 +1037,3 @@ static void __exit exit_hostfs(void)
 module_init(init_hostfs)
 module_exit(exit_hostfs)
 MODULE_LICENSE("GPL");
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 1ed5ea389f15..5625e2481dd3 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
 
@@ -21,12 +21,16 @@ int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
 	      int *nlink_out, int *uid_out, int *gid_out,
 	      unsigned long long *size_out, struct timespec *atime_out,
 	      struct timespec *mtime_out, struct timespec *ctime_out,
-	      int *blksize_out, unsigned long long *blocks_out)
+	      int *blksize_out, unsigned long long *blocks_out, int fd)
 {
 	struct stat64 buf;
 
-	if(lstat64(path, &buf) < 0)
-		return(-errno);
+	if(fd >= 0) {
+		if (fstat64(fd, &buf) < 0)
+			return -errno;
+	} else if(lstat64(path, &buf) < 0) {
+		return -errno;
+	}
 
 	if(inode_out != NULL) *inode_out = buf.st_ino;
 	if(mode_out != NULL) *mode_out = buf.st_mode;
@@ -48,7 +52,7 @@ int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
 	}
 	if(blksize_out != NULL) *blksize_out = buf.st_blksize;
 	if(blocks_out != NULL) *blocks_out = buf.st_blocks;
-	return(0);
+	return 0;
 }
 
 int file_type(const char *path, int *maj, int *min)
@@ -56,7 +60,7 @@ int file_type(const char *path, int *maj, int *min)
  	struct stat64 buf;
 
 	if(lstat64(path, &buf) < 0)
-		return(-errno);
+		return -errno;
 	/*We cannot pass rdev as is because glibc and the kernel disagree
 	 *about its definition.*/
 	if(maj != NULL)
@@ -64,13 +68,13 @@ int file_type(const char *path, int *maj, int *min)
 	if(min != NULL)
 		*min = minor(buf.st_rdev);
 
-	if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR);
-	else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK);
-	else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV);
-	else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV);
-	else if(S_ISFIFO(buf.st_mode))return(OS_TYPE_FIFO);
-	else if(S_ISSOCK(buf.st_mode))return(OS_TYPE_SOCK);
-	else return(OS_TYPE_FILE);
+	if(S_ISDIR(buf.st_mode)) return OS_TYPE_DIR;
+	else if(S_ISLNK(buf.st_mode)) return OS_TYPE_SYMLINK;
+	else if(S_ISCHR(buf.st_mode)) return OS_TYPE_CHARDEV;
+	else if(S_ISBLK(buf.st_mode)) return OS_TYPE_BLOCKDEV;
+	else if(S_ISFIFO(buf.st_mode))return OS_TYPE_FIFO;
+	else if(S_ISSOCK(buf.st_mode))return OS_TYPE_SOCK;
+	else return OS_TYPE_FILE;
 }
 
 int access_file(char *path, int r, int w, int x)
@@ -80,8 +84,9 @@ int access_file(char *path, int r, int w, int x)
 	if(r) mode = R_OK;
 	if(w) mode |= W_OK;
 	if(x) mode |= X_OK;
-	if(access(path, mode) != 0) return(-errno);
-	else return(0);
+	if(access(path, mode) != 0)
+		return -errno;
+	else return 0;
 }
 
 int open_file(char *path, int r, int w, int append)
@@ -99,8 +104,9 @@ int open_file(char *path, int r, int w, int append)
 	if(append)
 		mode |= O_APPEND;
 	fd = open64(path, mode);
-	if(fd < 0) return(-errno);
-	else return(fd);
+	if(fd < 0)
+		return -errno;
+	else return fd;
 }
 
 void *open_dir(char *path, int *err_out)
@@ -109,8 +115,9 @@ void *open_dir(char *path, int *err_out)
 
 	dir = opendir(path);
 	*err_out = errno;
-	if(dir == NULL) return(NULL);
-	return(dir);
+	if(dir == NULL)
+		return NULL;
+	return dir;
 }
 
 char *read_dir(void *stream, unsigned long long *pos,
@@ -121,11 +128,12 @@ char *read_dir(void *stream, unsigned long long *pos,
 
 	seekdir(dir, *pos);
 	ent = readdir(dir);
-	if(ent == NULL) return(NULL);
+	if(ent == NULL)
+		return NULL;
 	*len_out = strlen(ent->d_name);
 	*ino_out = ent->d_ino;
 	*pos = telldir(dir);
-	return(ent->d_name);
+	return ent->d_name;
 }
 
 int read_file(int fd, unsigned long long *offset, char *buf, int len)
@@ -133,9 +141,10 @@ int read_file(int fd, unsigned long long *offset, char *buf, int len)
 	int n;
 
 	n = pread64(fd, buf, len, *offset);
-	if(n < 0) return(-errno);
+	if(n < 0)
+		return -errno;
 	*offset += n;
-	return(n);
+	return n;
 }
 
 int write_file(int fd, unsigned long long *offset, const char *buf, int len)
@@ -143,9 +152,10 @@ int write_file(int fd, unsigned long long *offset, const char *buf, int len)
 	int n;
 
 	n = pwrite64(fd, buf, len, *offset);
-	if(n < 0) return(-errno);
+	if(n < 0)
+		return -errno;
 	*offset += n;
-	return(n);
+	return n;
 }
 
 int lseek_file(int fd, long long offset, int whence)
@@ -154,8 +164,8 @@ int lseek_file(int fd, long long offset, int whence)
 
 	ret = lseek64(fd, offset, whence);
 	if(ret < 0)
-		return(-errno);
-	return(0);
+		return -errno;
+	return 0;
 }
 
 int fsync_file(int fd, int datasync)
@@ -198,65 +208,90 @@ int file_create(char *name, int ur, int uw, int ux, int gr,
 	mode |= ox ? S_IXOTH : 0;
 	fd = open64(name, O_CREAT | O_RDWR, mode);
 	if(fd < 0)
-		return(-errno);
-	return(fd);
+		return -errno;
+	return fd;
 }
 
-int set_attr(const char *file, struct hostfs_iattr *attrs)
+int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
 {
-	struct utimbuf buf;
+	struct timeval times[2];
+	struct timespec atime_ts, mtime_ts;
 	int err, ma;
 
-	if(attrs->ia_valid & HOSTFS_ATTR_MODE){
-		if(chmod(file, attrs->ia_mode) != 0) return(-errno);
-	}
-	if(attrs->ia_valid & HOSTFS_ATTR_UID){
-		if(chown(file, attrs->ia_uid, -1)) return(-errno);
+	if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
+		if (fd >= 0) {
+			if (fchmod(fd, attrs->ia_mode) != 0)
+				return (-errno);
+		} else if (chmod(file, attrs->ia_mode) != 0) {
+			return -errno;
+		}
 	}
-	if(attrs->ia_valid & HOSTFS_ATTR_GID){
-		if(chown(file, -1, attrs->ia_gid)) return(-errno);
+	if (attrs->ia_valid & HOSTFS_ATTR_UID) {
+		if (fd >= 0) {
+			if (fchown(fd, attrs->ia_uid, -1))
+				return -errno;
+		} else if(chown(file, attrs->ia_uid, -1)) {
+			return -errno;
+		}
 	}
-	if(attrs->ia_valid & HOSTFS_ATTR_SIZE){
-		if(truncate(file, attrs->ia_size)) return(-errno);
+	if (attrs->ia_valid & HOSTFS_ATTR_GID) {
+		if (fd >= 0) {
+			if (fchown(fd, -1, attrs->ia_gid))
+				return -errno;
+		} else if (chown(file, -1, attrs->ia_gid)) {
+			return -errno;
+		}
 	}
-	ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET;
-	if((attrs->ia_valid & ma) == ma){
-		buf.actime = attrs->ia_atime.tv_sec;
-		buf.modtime = attrs->ia_mtime.tv_sec;
-		if(utime(file, &buf) != 0) return(-errno);
+	if (attrs->ia_valid & HOSTFS_ATTR_SIZE) {
+		if (fd >= 0) {
+			if (ftruncate(fd, attrs->ia_size))
+				return -errno;
+		} else if (truncate(file, attrs->ia_size)) {
+			return -errno;
+		}
 	}
-	else {
-		struct timespec ts;
-
-		if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){
-			err = stat_file(file, NULL, NULL, NULL, NULL, NULL,
-					NULL, NULL, &ts, NULL, NULL, NULL);
-			if(err != 0)
-				return(err);
-			buf.actime = attrs->ia_atime.tv_sec;
-			buf.modtime = ts.tv_sec;
-			if(utime(file, &buf) != 0)
-				return(-errno);
+
+	/* Update accessed and/or modified time, in two parts: first set
+	 * times according to the changes to perform, and then call futimes()
+	 * or utimes() to apply them. */
+	ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET);
+	if (attrs->ia_valid & ma) {
+		err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL,
+				&atime_ts, &mtime_ts, NULL, NULL, NULL, fd);
+		if (err != 0)
+			return err;
+
+		times[0].tv_sec = atime_ts.tv_sec;
+		times[0].tv_usec = atime_ts.tv_nsec * 1000;
+		times[1].tv_sec = mtime_ts.tv_sec;
+		times[1].tv_usec = mtime_ts.tv_nsec * 1000;
+
+		if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
+			times[0].tv_sec = attrs->ia_atime.tv_sec;
+			times[0].tv_usec = attrs->ia_atime.tv_nsec * 1000;
+		}
+		if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) {
+			times[1].tv_sec = attrs->ia_mtime.tv_sec;
+			times[1].tv_usec = attrs->ia_mtime.tv_nsec * 1000;
 		}
-		if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){
-			err = stat_file(file, NULL, NULL, NULL, NULL, NULL,
-					NULL, &ts, NULL, NULL, NULL, NULL);
-			if(err != 0)
-				return(err);
-			buf.actime = ts.tv_sec;
-			buf.modtime = attrs->ia_mtime.tv_sec;
-			if(utime(file, &buf) != 0)
-				return(-errno);
+
+		if (fd >= 0) {
+			if (futimes(fd, times) != 0)
+				return -errno;
+		} else if (utimes(file, times) != 0) {
+			return -errno;
 		}
 	}
+
 	if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
 	if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){
 		err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL,
 				&attrs->ia_atime, &attrs->ia_mtime, NULL,
-				NULL, NULL);
-		if(err != 0) return(err);
+				NULL, NULL, fd);
+		if(err != 0)
+			return err;
 	}
-	return(0);
+	return 0;
 }
 
 int make_symlink(const char *from, const char *to)
@@ -264,8 +299,9 @@ int make_symlink(const char *from, const char *to)
 	int err;
 
 	err = symlink(to, from);
-	if(err) return(-errno);
-	return(0);
+	if(err)
+		return -errno;
+	return 0;
 }
 
 int unlink_file(const char *file)
@@ -273,8 +309,9 @@ int unlink_file(const char *file)
 	int err;
 
 	err = unlink(file);
-	if(err) return(-errno);
-	return(0);
+	if(err)
+		return -errno;
+	return 0;
 }
 
 int do_mkdir(const char *file, int mode)
@@ -282,8 +319,9 @@ int do_mkdir(const char *file, int mode)
 	int err;
 
 	err = mkdir(file, mode);
-	if(err) return(-errno);
-	return(0);
+	if(err)
+		return -errno;
+	return 0;
 }
 
 int do_rmdir(const char *file)
@@ -291,8 +329,9 @@ int do_rmdir(const char *file)
 	int err;
 
 	err = rmdir(file);
-	if(err) return(-errno);
-	return(0);
+	if(err)
+		return -errno;
+	return 0;
 }
 
 int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
@@ -300,8 +339,9 @@ int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
 	int err;
 
 	err = mknod(file, mode, makedev(major, minor));
-	if(err) return(-errno);
-	return(0);
+	if(err)
+		return -errno;
+	return 0;
 }
 
 int link_file(const char *to, const char *from)
@@ -309,8 +349,9 @@ int link_file(const char *to, const char *from)
 	int err;
 
 	err = link(to, from);
-	if(err) return(-errno);
-	return(0);
+	if(err)
+		return -errno;
+	return 0;
 }
 
 int do_readlink(char *file, char *buf, int size)
@@ -319,10 +360,10 @@ int do_readlink(char *file, char *buf, int size)
 
 	n = readlink(file, buf, size);
 	if(n < 0)
-		return(-errno);
+		return -errno;
 	if(n < size)
 		buf[n] = '\0';
-	return(n);
+	return n;
 }
 
 int rename_file(char *from, char *to)
@@ -330,8 +371,9 @@ int rename_file(char *from, char *to)
 	int err;
 
 	err = rename(from, to);
-	if(err < 0) return(-errno);
-	return(0);
+	if(err < 0)
+		return -errno;
+	return 0;
 }
 
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
@@ -344,7 +386,9 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	int err;
 
 	err = statfs64(root, &buf);
-	if(err < 0) return(-errno);
+	if(err < 0)
+		return -errno;
+
 	*bsize_out = buf.f_bsize;
 	*blocks_out = buf.f_blocks;
 	*bfree_out = buf.f_bfree;
@@ -360,16 +404,5 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	spare_out[2] = buf.f_spare[2];
 	spare_out[3] = buf.f_spare[3];
 	spare_out[4] = buf.f_spare[4];
-	return(0);
+	return 0;
 }
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b52b7381d10f..b6fca543544c 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -5,7 +5,7 @@
  *
  *  general buffer i/o
  */
-
+#include <linux/sched.h>
 #include "hpfs_fn.h"
 
 void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9953cf9a2f16..d256559b4104 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -5,7 +5,7 @@
  *
  *  adding & removing files & directories
  */
-
+#include <linux/sched.h>
 #include "hpfs_fn.h"
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e0174e338526..29cc34abb2ea 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/statfs.h>
 #include <linux/magic.h>
+#include <linux/sched.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -176,12 +177,9 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		mutex_init(&ei->i_mutex);
-		mutex_init(&ei->i_parent_mutex);
-		inode_init_once(&ei->vfs_inode);
-	}
+	mutex_init(&ei->i_mutex);
+	mutex_init(&ei->i_parent_mutex);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3d413f..aa083dd34e92 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/hugetlb.h>
 #include <linux/pagevec.h>
+#include <linux/mman.h>
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/dnotify.h>
@@ -98,10 +99,7 @@ out:
  * Called under down_write(mmap_sem).
  */
 
-#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags);
-#else
+#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 static unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -115,6 +113,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
+	if (flags & MAP_FIXED) {
+		if (prepare_hugepage_range(addr, len, pgoff))
+			return -EINVAL;
+		return addr;
+	}
+
 	if (addr) {
 		addr = ALIGN(addr, HPAGE_SIZE);
 		vma = find_vma(mm, addr);
@@ -453,7 +457,7 @@ static int hugetlbfs_symlink(struct inode *dir,
  */
 static int hugetlbfs_set_page_dirty(struct page *page)
 {
-	struct page *head = (struct page *)page_private(page);
+	struct page *head = compound_head(page);
 
 	SetPageDirty(head);
 	return 0;
@@ -552,9 +556,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
 
 const struct file_operations hugetlbfs_file_operations = {
@@ -744,6 +746,9 @@ struct file *hugetlb_zero_setup(size_t size)
 	char buf[16];
 	static atomic_t counter;
 
+	if (!hugetlbfs_vfsmount)
+		return ERR_PTR(-ENOENT);
+
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
diff --git a/fs/inode.c b/fs/inode.c
index 5abb097ab1b0..9a012cc5b6cd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -213,9 +213,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct inode * inode = (struct inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(inode);
+	inode_init_once(inode);
 }
 
 /*
@@ -251,7 +249,7 @@ void clear_inode(struct inode *inode)
 	BUG_ON(inode->i_state & I_CLEAR);
 	wait_on_inode(inode);
 	DQUOT_DROP(inode);
-	if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+	if (inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
 		bd_forget(inode);
@@ -276,7 +274,7 @@ static void dispose_list(struct list_head *head)
 	while (!list_empty(head)) {
 		struct inode *inode;
 
-		inode = list_entry(head->next, struct inode, i_list);
+		inode = list_first_entry(head, struct inode, i_list);
 		list_del(&inode->i_list);
 
 		if (inode->i_data.nrpages)
@@ -525,7 +523,12 @@ repeat:
  */
 struct inode *new_inode(struct super_block *sb)
 {
-	static unsigned long last_ino;
+	/*
+	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+	 * error if st_ino won't fit in target struct field. Use 32bit counter
+	 * here to attempt to avoid that.
+	 */
+	static unsigned int last_ino;
 	struct inode * inode;
 
 	spin_lock_prefetch(&inode_lock);
@@ -684,27 +687,28 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
  */
 ino_t iunique(struct super_block *sb, ino_t max_reserved)
 {
-	static ino_t counter;
+	/*
+	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+	 * error if st_ino won't fit in target struct field. Use 32bit counter
+	 * here to attempt to avoid that.
+	 */
+	static unsigned int counter;
 	struct inode *inode;
-	struct hlist_head * head;
+	struct hlist_head *head;
 	ino_t res;
+
 	spin_lock(&inode_lock);
-retry:
-	if (counter > max_reserved) {
-		head = inode_hashtable + hash(sb,counter);
+	do {
+		if (counter <= max_reserved)
+			counter = max_reserved + 1;
 		res = counter++;
+		head = inode_hashtable + hash(sb, res);
 		inode = find_inode_fast(sb, head, res);
-		if (!inode) {
-			spin_unlock(&inode_lock);
-			return res;
-		}
-	} else {
-		counter = max_reserved + 1;
-	}
-	goto retry;
-	
-}
+	} while (inode != NULL);
+	spin_unlock(&inode_lock);
 
+	return res;
+}
 EXPORT_SYMBOL(iunique);
 
 struct inode *igrab(struct inode *inode)
@@ -1041,7 +1045,7 @@ static void generic_forget_inode(struct inode *inode)
 		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
 			list_move(&inode->i_list, &inode_unused);
 		inodes_stat.nr_unused++;
-		if (!sb || (sb->s_flags & MS_ACTIVE)) {
+		if (sb->s_flags & MS_ACTIVE) {
 			spin_unlock(&inode_lock);
 			return;
 		}
diff --git a/fs/inotify.c b/fs/inotify.c
index f5099d86fd91..7457501b9565 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -509,7 +509,7 @@ void inotify_destroy(struct inotify_handle *ih)
 			mutex_unlock(&ih->mutex);
 			break;
 		}
-		watch = list_entry(watches->next, struct inotify_watch, h_list);
+		watch = list_first_entry(watches, struct inotify_watch, h_list);
 		get_inotify_watch(watch);
 		mutex_unlock(&ih->mutex);
 
diff --git a/fs/internal.h b/fs/internal.h
index ea00126c9a59..392e8ccd6fc4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,8 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/ioctl32.h>
-
 struct super_block;
 
 /*
@@ -42,14 +40,6 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 extern void __init chrdev_init(void);
 
 /*
- * compat_ioctl.c
- */
-#ifdef CONFIG_COMPAT
-extern struct ioctl_trans ioctl_start[];
-extern int ioctl_table_size;
-#endif
-
-/*
  * namespace.c
  */
 extern int copy_mount_options(const void __user *, unsigned long *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index ff61772ceedd..8c90cbc903fa 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/security.h>
 #include <linux/module.h>
+#include <linux/kallsyms.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -20,6 +21,7 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
 	int error = -ENOTTY;
+	void *f;
 
 	if (!filp->f_op)
 		goto out;
@@ -29,10 +31,16 @@ static long do_ioctl(struct file *filp, unsigned int cmd,
 		if (error == -ENOIOCTLCMD)
 			error = -EINVAL;
 		goto out;
-	} else if (filp->f_op->ioctl) {
+	} else if ((f = filp->f_op->ioctl)) {
 		lock_kernel();
-		error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
-					  filp, cmd, arg);
+		if (!filp->f_op->ioctl) {
+			printk("%s: ioctl %p disappeared\n", __FUNCTION__, f);
+			print_symbol("symbol: %s\n", (unsigned long)f);
+			dump_stack();
+		} else {
+			error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
+						  filp, cmd, arg);
+		}
 		unlock_kernel();
 	}
 
@@ -67,8 +75,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
 			return put_user(res, p);
 		}
 		case FIGETBSZ:
-			if (inode->i_sb == NULL)
-				return -EBADF;
 			return put_user(inode->i_sb->s_blocksize, p);
 		case FIONREAD:
 			return put_user(i_size_read(inode) - filp->f_pos, p);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 64a96cdfe3a4..5c3eecf7542e 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -77,9 +77,7 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
 {
 	struct iso_inode_info *ei = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 0208cc7ac5d0..47552d4a6324 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/checkpoint.c
+ * linux/fs/jbd/checkpoint.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index be4648bc7a2f..1facfaff97cb 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,7 +20,6 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 10fff9443938..46fe7439fb91 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -28,7 +28,6 @@
 #include <linux/jbd.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/freezer.h>
@@ -211,10 +210,16 @@ end_loop:
 	return 0;
 }
 
-static void journal_start_thread(journal_t *journal)
+static int journal_start_thread(journal_t *journal)
 {
-	kthread_run(kjournald, journal, "kjournald");
+	struct task_struct *t;
+
+	t = kthread_run(kjournald, journal, "kjournald");
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
 	wait_event(journal->j_wait_done_commit, journal->j_task != 0);
+	return 0;
 }
 
 static void journal_kill_thread(journal_t *journal)
@@ -840,8 +845,7 @@ static int journal_reset(journal_t *journal)
 
 	/* Add the dynamic fields and write it to disk. */
 	journal_update_superblock(journal, 1);
-	journal_start_thread(journal);
-	return 0;
+	return journal_start_thread(journal);
 }
 
 /**
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 11563fe2a52b..2a5f4b833e35 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/recovery.c
+ * linux/fs/jbd/recovery.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index d204ab394f36..824e3b7d4ec1 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/revoke.c
+ * linux/fs/jbd/revoke.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
  *
@@ -66,7 +66,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #endif
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index cceaf57e3778..772b6531a2a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/transaction.c
+ * linux/fs/jbd/transaction.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
@@ -23,7 +23,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
-#include <linux/smp_lock.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 68039fa9a566..3fccde7ba008 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/checkpoint.c
+ * linux/fs/jbd2/checkpoint.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6bd8005e3d34..2856e1100a5f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -20,7 +20,6 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 44fc32bfd7f1..78d63b818f0b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -28,7 +28,6 @@
 #include <linux/jbd2.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/freezer.h>
@@ -211,10 +210,16 @@ end_loop:
 	return 0;
 }
 
-static void jbd2_journal_start_thread(journal_t *journal)
+static int jbd2_journal_start_thread(journal_t *journal)
 {
-	kthread_run(kjournald2, journal, "kjournald2");
+	struct task_struct *t;
+
+	t = kthread_run(kjournald2, journal, "kjournald2");
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
 	wait_event(journal->j_wait_done_commit, journal->j_task != 0);
+	return 0;
 }
 
 static void journal_kill_thread(journal_t *journal)
@@ -840,8 +845,7 @@ static int journal_reset(journal_t *journal)
 
 	/* Add the dynamic fields and write it to disk. */
 	jbd2_journal_update_superblock(journal, 1);
-	jbd2_journal_start_thread(journal);
-	return 0;
+	return jbd2_journal_start_thread(journal);
 }
 
 /**
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9f10acafaf70..395c92a04ac9 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/recovery.c
+ * linux/fs/jbd2/recovery.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f506646ad0ff..9246e763da78 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/revoke.c
+ * linux/fs/jbd2/revoke.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
  *
@@ -66,7 +66,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/list.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #endif
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 3a8700153cb0..7946ff43fc40 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/transaction.c
+ * linux/fs/jbd2/transaction.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
@@ -23,7 +23,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
-#include <linux/smp_lock.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index e3c69e659efb..e220d3bd610d 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -47,11 +47,8 @@ static void jffs2_i_init_once(void * foo, struct kmem_cache * cachep, unsigned l
 {
 	struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		init_MUTEX(&ei->sem);
-		inode_init_once(&ei->vfs_inode);
-	}
+	init_MUTEX(&ei->sem);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int jffs2_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index e285022f006c..3467dde27e5a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -55,7 +55,6 @@ void jfs_read_inode(struct inode *inode)
 		inode->i_op = &jfs_file_inode_operations;
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 	}
-	jfs_set_inode_flags(inode);
 }
 
 /*
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index ed814b1ff4d9..fe063af6fd2f 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -59,6 +59,7 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
 
 	switch (cmd) {
 	case JFS_IOC_GETFLAGS:
+		jfs_get_inode_flags(jfs_inode);
 		flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
 		flags = jfs_map_ext2(flags, 0);
 		return put_user(flags, (int __user *) arg);
@@ -78,6 +79,7 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~JFS_DIRSYNC_FL;
 
+		jfs_get_inode_flags(jfs_inode);
 		oldflags = jfs_inode->mode2;
 
 		/*
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 82b0544bd76d..f3b1ebb22280 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1507,7 +1507,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 		if (l2nb < budmin) {
 
 			/* search the lower level dmap control pages to get
-			 * the starting block number of the the dmap that
+			 * the starting block number of the dmap that
 			 * contains or starts off the free space.
 			 */
 			if ((rc =
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index aa5124b643b1..c6530227cda6 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
 		return -EIO;
 	}
 
-	/* locate the the disk inode requested */
+	/* locate the disk inode requested */
 	dp = (struct dinode *) mp->data;
 	dp += rel_inode;
 
@@ -1407,7 +1407,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
 	inum = pip->i_ino + 1;
 	ino = inum & (INOSPERIAG - 1);
 
-	/* back off the the hint if it is outside of the iag */
+	/* back off the hint if it is outside of the iag */
 	if (ino == 0)
 		inum = pip->i_ino;
 
@@ -3078,6 +3078,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
 
 	jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
 	jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
+	jfs_set_inode_flags(ip);
 
 	ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
 	if (sbi->umask != -1) {
@@ -3174,6 +3175,7 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
 		dip->di_gid = cpu_to_le32(ip->i_gid);
 	else
 		dip->di_gid = cpu_to_le32(jfs_ip->saved_gid);
+	jfs_get_inode_flags(jfs_ip);
 	/*
 	 * mode2 is only needed for storing the higher order bits.
 	 * Trust i_mode for the lower order ones
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 4c67ed97682b..ed6574bee51a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -45,6 +45,24 @@ void jfs_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_SYNC;
 }
 
+void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
+{
+	unsigned int flags = jfs_ip->vfs_inode.i_flags;
+
+	jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL |
+			   JFS_DIRSYNC_FL | JFS_SYNC_FL);
+	if (flags & S_IMMUTABLE)
+		jfs_ip->mode2 |= JFS_IMMUTABLE_FL;
+	if (flags & S_APPEND)
+		jfs_ip->mode2 |= JFS_APPEND_FL;
+	if (flags & S_NOATIME)
+		jfs_ip->mode2 |= JFS_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		jfs_ip->mode2 |= JFS_DIRSYNC_FL;
+	if (flags & S_SYNC)
+		jfs_ip->mode2 |= JFS_SYNC_FL;
+}
+
 /*
  * NAME:	ialloc()
  *
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 6802837f757e..2374b595f2e1 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -31,6 +31,7 @@ extern void jfs_truncate(struct inode *);
 extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
+extern void jfs_get_inode_flags(struct jfs_inode_info *);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
index df48ece4b7a3..ecf04882265e 100644
--- a/fs/jfs/jfs_lock.h
+++ b/fs/jfs/jfs_lock.h
@@ -45,7 +45,7 @@ do {							\
 		io_schedule();				\
 		lock_cmd;				\
 	}						\
-	current->state = TASK_RUNNING;			\
+	__set_current_state(TASK_RUNNING);			\
 	remove_wait_queue(&wq, &__wait);		\
 } while (0)
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 5065baa530b6..44a2f33cb98d 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -62,7 +62,6 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
-#include <linux/smp_lock.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
 #include <linux/buffer_head.h>		/* for sync_blockdev() */
@@ -1590,7 +1589,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		LOGGC_UNLOCK(log);
 		schedule();
-		current->state = TASK_RUNNING;
+		__set_current_state(TASK_RUNNING);
 		LOGGC_LOCK(log);
 		remove_wait_queue(&target->gcwait, &__wait);
 	}
@@ -1961,7 +1960,7 @@ static void lbmfree(struct lbuf * bp)
 /*
  * NAME:	lbmRedrive
  *
- * FUNCTION:	add a log buffer to the the log redrive list
+ * FUNCTION:	add a log buffer to the log redrive list
  *
  * PARAMETER:
  *     bp	- log buffer
@@ -2354,14 +2353,15 @@ int jfsIOWait(void *arg)
 			lbmStartIO(bp);
 			spin_lock_irq(&log_redrive_lock);
 		}
-		spin_unlock_irq(&log_redrive_lock);
 
 		if (freezing(current)) {
+			spin_unlock_irq(&log_redrive_lock);
 			refrigerator();
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&log_redrive_lock);
 			schedule();
-			current->state = TASK_RUNNING;
+			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 58deae007507..43d4f69afbec 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -184,17 +184,14 @@ static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct metapage *mp = (struct metapage *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		mp->lid = 0;
-		mp->lsn = 0;
-		mp->flag = 0;
-		mp->data = NULL;
-		mp->clsn = 0;
-		mp->log = NULL;
-		set_bit(META_free, &mp->flag);
-		init_waitqueue_head(&mp->wait);
-	}
+	mp->lid = 0;
+	mp->lsn = 0;
+	mp->flag = 0;
+	mp->data = NULL;
+	mp->clsn = 0;
+	mp->log = NULL;
+	set_bit(META_free, &mp->flag);
+	init_waitqueue_head(&mp->wait);
 }
 
 static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 03893acbfda4..25430d0b0d59 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -44,7 +44,6 @@
 
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
-#include <linux/smp_lock.h>
 #include <linux/completion.h>
 #include <linux/freezer.h>
 #include <linux/module.h>
@@ -136,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	TXN_UNLOCK();
 	io_schedule();
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(event, &wait);
 }
 
@@ -2798,7 +2797,7 @@ int jfs_lazycommit(void *arg)
 			set_current_state(TASK_INTERRUPTIBLE);
 			LAZY_UNLOCK(flags);
 			schedule();
-			current->state = TASK_RUNNING;
+			__set_current_state(TASK_RUNNING);
 			remove_wait_queue(&jfs_commit_thread_wait, &wq);
 		}
 	} while (!kthread_should_stop());
@@ -2990,7 +2989,7 @@ int jfs_sync(void *arg)
 			set_current_state(TASK_INTERRUPTIBLE);
 			TXN_UNLOCK();
 			schedule();
-			current->state = TASK_RUNNING;
+			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 52d73d54a931..20e4ac1c79a3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -752,21 +752,18 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
 {
 	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
-		INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
-		init_rwsem(&jfs_ip->rdwrlock);
-		mutex_init(&jfs_ip->commit_mutex);
-		init_rwsem(&jfs_ip->xattr_sem);
-		spin_lock_init(&jfs_ip->ag_lock);
-		jfs_ip->active_ag = -1;
+	memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
+	INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
+	init_rwsem(&jfs_ip->rdwrlock);
+	mutex_init(&jfs_ip->commit_mutex);
+	init_rwsem(&jfs_ip->xattr_sem);
+	spin_lock_init(&jfs_ip->ag_lock);
+	jfs_ip->active_ag = -1;
 #ifdef CONFIG_JFS_POSIX_ACL
-		jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
-		jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
+	jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
+	jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
 #endif
-		inode_init_once(&jfs_ip->vfs_inode);
-	}
+	inode_init_once(&jfs_ip->vfs_inode);
 }
 
 static int __init init_jfs_fs(void)
diff --git a/fs/libfs.c b/fs/libfs.c
index d93842d3c0a0..5294de1f40c4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -159,7 +159,10 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 					continue;
 
 				spin_unlock(&dcache_lock);
-				if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0)
+				if (filldir(dirent, next->d_name.name, 
+					    next->d_name.len, filp->f_pos, 
+					    next->d_inode->i_ino, 
+					    dt_type(next->d_inode)) < 0)
 					return 0;
 				spin_lock(&dcache_lock);
 				/* next is still alive */
@@ -220,6 +223,12 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	root = new_inode(s);
 	if (!root)
 		goto Enomem;
+	/*
+	 * since this is the first inode, make it number 1. New inodes created
+	 * after this must take care not to collide with it (by passing
+	 * max_reserved of 1 to iunique).
+	 */
+	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
 	root->i_uid = root->i_gid = 0;
 	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
@@ -360,6 +369,11 @@ int simple_commit_write(struct file *file, struct page *page,
 	return 0;
 }
 
+/*
+ * the inodes created here are not hashed. If you use iunique to generate
+ * unique inode values later for this filesystem, then you must take care
+ * to pass it an appropriate max_reserved value to avoid collisions.
+ */
 int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
 {
 	struct inode *inode;
@@ -376,6 +390,11 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 	inode = new_inode(s);
 	if (!inode)
 		return -ENOMEM;
+	/*
+	 * because the root inode is 1, the files array must not contain an
+	 * entry at index 1
+	 */
+	inode->i_ino = 1;
 	inode->i_mode = S_IFDIR | 0755;
 	inode->i_uid = inode->i_gid = 0;
 	inode->i_blocks = 0;
@@ -391,6 +410,13 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 	for (i = 0; !files->name || files->name[0]; i++, files++) {
 		if (!files->name)
 			continue;
+
+		/* warn if it tries to conflict with the root inode */
+		if (unlikely(i == 1))
+			printk(KERN_WARNING "%s: %s passed in a files array"
+				"with an index of 1!\n", __func__,
+				s->s_type->name);
+
 		dentry = d_alloc_name(root, files->name);
 		if (!dentry)
 			goto out;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index f4d45d4d835b..d070b18e539d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -153,7 +153,7 @@ nlmclnt_recovery(struct nlm_host *host)
 	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
-		if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
+		if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0)
 			module_put(THIS_MODULE);
 	}
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index a5c019e1a447..a10343bed160 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -12,7 +12,6 @@
 #include <linux/fs.h>
 #include <linux/nfs_fs.h>
 #include <linux/utsname.h>
-#include <linux/smp_lock.h>
 #include <linux/freezer.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ad21c0713efa..96070bff93fc 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -221,7 +221,7 @@ nlm_bind_host(struct nlm_host *host)
 					host->h_nextrebind - jiffies);
 		}
 	} else {
-		unsigned long increment = nlmsvc_timeout * HZ;
+		unsigned long increment = nlmsvc_timeout;
 		struct rpc_timeout timeparms = {
 			.to_initval	= increment,
 			.to_increment	= increment,
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index eb243edf8932..2102e2d0134d 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -225,16 +225,13 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 #define SM_monres_sz	2
 #define SM_unmonres_sz	1
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 static struct rpc_procinfo	nsm_procedures[] = {
 [SM_MON] = {
 		.p_proc		= SM_MON,
 		.p_encode	= (kxdrproc_t) xdr_encode_mon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat_res,
-		.p_bufsiz	= MAX(SM_mon_sz, SM_monres_sz) << 2,
+		.p_arglen	= SM_mon_sz,
+		.p_replen	= SM_monres_sz,
 		.p_statidx	= SM_MON,
 		.p_name		= "MONITOR",
 	},
@@ -242,7 +239,8 @@ static struct rpc_procinfo	nsm_procedures[] = {
 		.p_proc		= SM_UNMON,
 		.p_encode	= (kxdrproc_t) xdr_encode_unmon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat,
-		.p_bufsiz	= MAX(SM_mon_id_sz, SM_unmonres_sz) << 2,
+		.p_arglen	= SM_mon_id_sz,
+		.p_replen	= SM_unmonres_sz,
 		.p_statidx	= SM_UNMON,
 		.p_name		= "UNMONITOR",
 	},
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 47a66aa5d55b..bf27b6c6cb6b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -99,7 +99,9 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
-	resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock);
+	resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: TEST4          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
@@ -143,6 +145,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now try to lock the file */
 	resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
 					argp->block, &argp->cookie);
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf51f849e76c..b3efa4536cc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -173,7 +173,7 @@ found:
  */
 static inline struct nlm_block *
 nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
-				struct nlm_lock *lock, struct nlm_cookie *cookie)
+		struct nlm_lock *lock, struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block;
 	struct nlm_host		*host;
@@ -210,6 +210,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	block->b_daemon = rqstp->rq_server;
 	block->b_host   = host;
 	block->b_file   = file;
+	block->b_fl = NULL;
 	file->f_count++;
 
 	/* Add to file's list of blocks */
@@ -261,6 +262,7 @@ static void nlmsvc_free_block(struct kref *kref)
 	nlmsvc_freegrantargs(block->b_call);
 	nlm_release_call(block->b_call);
 	nlm_release_file(block->b_file);
+	kfree(block->b_fl);
 	kfree(block);
 }
 
@@ -331,6 +333,31 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
 }
 
 /*
+ * Deferred lock request handling for non-blocking lock
+ */
+static u32
+nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
+{
+	u32 status = nlm_lck_denied_nolocks;
+
+	block->b_flags |= B_QUEUED;
+
+	nlmsvc_insert_block(block, NLM_TIMEOUT);
+
+	block->b_cache_req = &rqstp->rq_chandle;
+	if (rqstp->rq_chandle.defer) {
+		block->b_deferred_req =
+			rqstp->rq_chandle.defer(block->b_cache_req);
+		if (block->b_deferred_req != NULL)
+			status = nlm_drop_reply;
+	}
+	dprintk("lockd: nlmsvc_defer_lock_rqst block %p flags %d status %d\n",
+		block, block->b_flags, status);
+
+	return status;
+}
+
+/*
  * Attempt to establish a lock, and if it can't be granted, block it
  * if required.
  */
@@ -338,7 +365,7 @@ __be32
 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
 {
-	struct nlm_block	*block, *newblock = NULL;
+	struct nlm_block	*block = NULL;
 	int			error;
 	__be32			ret;
 
@@ -351,29 +378,58 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				wait);
 
 
-	lock->fl.fl_flags &= ~FL_SLEEP;
-again:
 	/* Lock file against concurrent access */
 	mutex_lock(&file->f_mutex);
-	/* Get existing block (in case client is busy-waiting) */
+	/* Get existing block (in case client is busy-waiting)
+	 * or create new block
+	 */
 	block = nlmsvc_lookup_block(file, lock);
 	if (block == NULL) {
-		if (newblock != NULL)
-			lock = &newblock->b_call->a_args.lock;
-	} else
+		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		ret = nlm_lck_denied_nolocks;
+		if (block == NULL)
+			goto out;
 		lock = &block->b_call->a_args.lock;
+	} else
+		lock->fl.fl_flags &= ~FL_SLEEP;
 
-	error = posix_lock_file(file->f_file, &lock->fl);
-	lock->fl.fl_flags &= ~FL_SLEEP;
+	if (block->b_flags & B_QUEUED) {
+		dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n",
+							block, block->b_flags);
+		if (block->b_granted) {
+			nlmsvc_unlink_block(block);
+			ret = nlm_granted;
+			goto out;
+		}
+		if (block->b_flags & B_TIMED_OUT) {
+			nlmsvc_unlink_block(block);
+			ret = nlm_lck_denied;
+			goto out;
+		}
+		ret = nlm_drop_reply;
+		goto out;
+	}
 
-	dprintk("lockd: posix_lock_file returned %d\n", error);
+	if (!wait)
+		lock->fl.fl_flags &= ~FL_SLEEP;
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
+	lock->fl.fl_flags &= ~FL_SLEEP;
 
+	dprintk("lockd: vfs_lock_file returned %d\n", error);
 	switch(error) {
 		case 0:
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
+			ret = nlm_lck_denied;
 			break;
+		case -EINPROGRESS:
+			if (wait)
+				break;
+			/* Filesystem lock operation is in progress
+			   Add it to the queue waiting for callback */
+			ret = nlmsvc_defer_lock_rqst(rqstp, block);
+			goto out;
 		case -EDEADLK:
 			ret = nlm_deadlock;
 			goto out;
@@ -387,26 +443,11 @@ again:
 		goto out;
 
 	ret = nlm_lck_blocked;
-	if (block != NULL)
-		goto out;
-
-	/* If we don't have a block, create and initialize it. Then
-	 * retry because we may have slept in kmalloc. */
-	/* We have to release f_mutex as nlmsvc_create_block may try to
-	 * to claim it while doing host garbage collection */
-	if (newblock == NULL) {
-		mutex_unlock(&file->f_mutex);
-		dprintk("lockd: blocking on this lock (allocating).\n");
-		if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie)))
-			return nlm_lck_denied_nolocks;
-		goto again;
-	}
 
 	/* Append to list of blocked */
-	nlmsvc_insert_block(newblock, NLM_NEVER);
+	nlmsvc_insert_block(block, NLM_NEVER);
 out:
 	mutex_unlock(&file->f_mutex);
-	nlmsvc_release_block(newblock);
 	nlmsvc_release_block(block);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
 	return ret;
@@ -416,9 +457,14 @@ out:
  * Test for presence of a conflicting lock.
  */
 __be32
-nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
-				       struct nlm_lock *conflock)
+nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+		struct nlm_lock *lock, struct nlm_lock *conflock,
+		struct nlm_cookie *cookie)
 {
+	struct nlm_block 	*block = NULL;
+	int			error;
+	__be32			ret;
+
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				file->f_file->f_path.dentry->d_inode->i_sb->s_id,
 				file->f_file->f_path.dentry->d_inode->i_ino,
@@ -426,19 +472,70 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
-	if (posix_test_lock(file->f_file, &lock->fl, &conflock->fl)) {
-		dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
-				conflock->fl.fl_type,
-				(long long)conflock->fl.fl_start,
-				(long long)conflock->fl.fl_end);
-		conflock->caller = "somehost";	/* FIXME */
-		conflock->len = strlen(conflock->caller);
-		conflock->oh.len = 0;		/* don't return OH info */
-		conflock->svid = conflock->fl.fl_pid;
-		return nlm_lck_denied;
+	/* Get existing block (in case client is busy-waiting) */
+	block = nlmsvc_lookup_block(file, lock);
+
+	if (block == NULL) {
+		struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+
+		if (conf == NULL)
+			return nlm_granted;
+		block = nlmsvc_create_block(rqstp, file, lock, cookie);
+		if (block == NULL) {
+			kfree(conf);
+			return nlm_granted;
+		}
+		block->b_fl = conf;
+	}
+	if (block->b_flags & B_QUEUED) {
+		dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n",
+			block, block->b_flags, block->b_fl);
+		if (block->b_flags & B_TIMED_OUT) {
+			nlmsvc_unlink_block(block);
+			return nlm_lck_denied;
+		}
+		if (block->b_flags & B_GOT_CALLBACK) {
+			if (block->b_fl != NULL
+					&& block->b_fl->fl_type != F_UNLCK) {
+				lock->fl = *block->b_fl;
+				goto conf_lock;
+			}
+			else {
+				nlmsvc_unlink_block(block);
+				return nlm_granted;
+			}
+		}
+		return nlm_drop_reply;
 	}
 
-	return nlm_granted;
+	error = vfs_test_lock(file->f_file, &lock->fl);
+	if (error == -EINPROGRESS)
+		return nlmsvc_defer_lock_rqst(rqstp, block);
+	if (error) {
+		ret = nlm_lck_denied_nolocks;
+		goto out;
+	}
+	if (lock->fl.fl_type == F_UNLCK) {
+		ret = nlm_granted;
+		goto out;
+	}
+
+conf_lock:
+	dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
+		lock->fl.fl_type, (long long)lock->fl.fl_start,
+		(long long)lock->fl.fl_end);
+	conflock->caller = "somehost";	/* FIXME */
+	conflock->len = strlen(conflock->caller);
+	conflock->oh.len = 0;		/* don't return OH info */
+	conflock->svid = lock->fl.fl_pid;
+	conflock->fl.fl_type = lock->fl.fl_type;
+	conflock->fl.fl_start = lock->fl.fl_start;
+	conflock->fl.fl_end = lock->fl.fl_end;
+	ret = nlm_lck_denied;
+out:
+	if (block)
+		nlmsvc_release_block(block);
+	return ret;
 }
 
 /*
@@ -464,7 +561,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
 	nlmsvc_cancel_blocked(file, lock);
 
 	lock->fl.fl_type = F_UNLCK;
-	error = posix_lock_file(file->f_file, &lock->fl);
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
 
 	return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
 }
@@ -493,6 +590,8 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 	block = nlmsvc_lookup_block(file, lock);
 	mutex_unlock(&file->f_mutex);
 	if (block != NULL) {
+		vfs_cancel_lock(block->b_file->f_file,
+				&block->b_call->a_args.lock.fl);
 		status = nlmsvc_unlink_block(block);
 		nlmsvc_release_block(block);
 	}
@@ -500,6 +599,63 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 }
 
 /*
+ * This is a callback from the filesystem for VFS file lock requests.
+ * It will be used if fl_grant is defined and the filesystem can not
+ * respond to the request immediately.
+ * For GETLK request it will copy the reply to the nlm_block.
+ * For SETLK or SETLKW request it will get the local posix lock.
+ * In all cases it will move the block to the head of nlm_blocked q where
+ * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the
+ * deferred rpc for GETLK and SETLK.
+ */
+static void
+nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf,
+			     int result)
+{
+	block->b_flags |= B_GOT_CALLBACK;
+	if (result == 0)
+		block->b_granted = 1;
+	else
+		block->b_flags |= B_TIMED_OUT;
+	if (conf) {
+		if (block->b_fl)
+			locks_copy_lock(block->b_fl, conf);
+	}
+}
+
+static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
+					int result)
+{
+	struct nlm_block *block;
+	int rc = -ENOENT;
+
+	lock_kernel();
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
+			dprintk("lockd: nlmsvc_notify_blocked block %p flags %d\n",
+							block, block->b_flags);
+			if (block->b_flags & B_QUEUED) {
+				if (block->b_flags & B_TIMED_OUT) {
+					rc = -ENOLCK;
+					break;
+				}
+				nlmsvc_update_deferred_block(block, conf, result);
+			} else if (result == 0)
+				block->b_granted = 1;
+
+			nlmsvc_insert_block(block, 0);
+			svc_wake_up(block->b_daemon);
+			rc = 0;
+			break;
+		}
+	}
+	unlock_kernel();
+	if (rc == -ENOENT)
+		printk(KERN_WARNING "lockd: grant for unknown block\n");
+	return rc;
+}
+
+/*
  * Unblock a blocked lock request. This is a callback invoked from the
  * VFS layer when a lock on which we blocked is removed.
  *
@@ -531,6 +687,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 struct lock_manager_operations nlmsvc_lock_operations = {
 	.fl_compare_owner = nlmsvc_same_owner,
 	.fl_notify = nlmsvc_notify_blocked,
+	.fl_grant = nlmsvc_grant_deferred,
 };
 
 /*
@@ -553,6 +710,8 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 
 	dprintk("lockd: grant blocked lock %p\n", block);
 
+	kref_get(&block->b_count);
+
 	/* Unlink block request from list */
 	nlmsvc_unlink_block(block);
 
@@ -566,20 +725,23 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 
 	/* Try the lock operation again */
 	lock->fl.fl_flags |= FL_SLEEP;
-	error = posix_lock_file(file->f_file, &lock->fl);
+	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
 	lock->fl.fl_flags &= ~FL_SLEEP;
 
 	switch (error) {
 	case 0:
 		break;
 	case -EAGAIN:
-		dprintk("lockd: lock still blocked\n");
+	case -EINPROGRESS:
+		dprintk("lockd: lock still blocked error %d\n", error);
 		nlmsvc_insert_block(block, NLM_NEVER);
+		nlmsvc_release_block(block);
 		return;
 	default:
 		printk(KERN_WARNING "lockd: unexpected error %d in %s!\n",
 				-error, __FUNCTION__);
 		nlmsvc_insert_block(block, 10 * HZ);
+		nlmsvc_release_block(block);
 		return;
 	}
 
@@ -592,7 +754,6 @@ callback:
 	nlmsvc_insert_block(block, 30 * HZ);
 
 	/* Call the client */
-	kref_get(&block->b_count);
 	nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG, &nlmsvc_grant_ops);
 }
 
@@ -665,6 +826,23 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
 	nlmsvc_release_block(block);
 }
 
+/* Helper function to handle retry of a deferred block.
+ * If it is a blocking lock, call grant_blocked.
+ * For a non-blocking lock or test lock, revisit the request.
+ */
+static void
+retry_deferred_block(struct nlm_block *block)
+{
+	if (!(block->b_flags & B_GOT_CALLBACK))
+		block->b_flags |= B_TIMED_OUT;
+	nlmsvc_insert_block(block, NLM_TIMEOUT);
+	dprintk("revisit block %p flags %d\n",	block, block->b_flags);
+	if (block->b_deferred_req) {
+		block->b_deferred_req->revisit(block->b_deferred_req, 0);
+		block->b_deferred_req = NULL;
+	}
+}
+
 /*
  * Retry all blocked locks that have been notified. This is where lockd
  * picks up locks that can be granted, or grant notifications that must
@@ -688,9 +866,12 @@ nlmsvc_retry_blocked(void)
 
 		dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
 			block, block->b_when);
-		kref_get(&block->b_count);
-		nlmsvc_grant_blocked(block);
-		nlmsvc_release_block(block);
+		if (block->b_flags & B_QUEUED) {
+			dprintk("nlmsvc_retry_blocked delete block (%p, granted=%d, flags=%d)\n",
+				block, block->b_granted, block->b_flags);
+			retry_deferred_block(block);
+		} else
+			nlmsvc_grant_blocked(block);
 	}
 
 	return timeout;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 31cb48425733..9cd5c8b37593 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -33,6 +33,7 @@ cast_to_nlm(__be32 status, u32 vers)
 		case nlm_lck_denied_nolocks:
 		case nlm_lck_blocked:
 		case nlm_lck_denied_grace_period:
+		case nlm_drop_reply:
 			break;
 		case nlm4_deadlock:
 			status = nlm_lck_denied;
@@ -127,7 +128,9 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
-	resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock));
+	resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: TEST          status %d vers %d\n",
 		ntohl(resp->status), rqstp->rq_vers);
@@ -172,6 +175,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now try to lock the file */
 	resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
 					       argp->block, &argp->cookie));
+	if (resp->status == nlm_drop_reply)
+		return rpc_drop_reply;
 
 	dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index c0df00c74ce3..84ebba33b98d 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -182,7 +182,7 @@ again:
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
-			if (posix_lock_file(file->f_file, &lock) < 0) {
+			if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) {
 				printk("lockd: unlock failure in %s:%d\n",
 						__FILE__, __LINE__);
 				return 1;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 34dae5d70738..5316e307a49d 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -510,17 +510,20 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	return 0;
 }
 
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+
 /*
  * Buffer requirements for NLM
  */
 #define NLM_void_sz		0
 #define NLM_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz		1+XDR_QUADLEN(sizeof(utsname()->nodename))
-#define NLM_netobj_sz		1+XDR_QUADLEN(XDR_MAX_NETOBJ)
-/* #define NLM_owner_sz		1+XDR_QUADLEN(NLM_MAXOWNER) */
+#define NLM_caller_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
+#define NLM_owner_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
 #define NLM_fhandle_sz		1+XDR_QUADLEN(NFS2_FHSIZE)
-#define NLM_lock_sz		3+NLM_caller_sz+NLM_netobj_sz+NLM_fhandle_sz
-#define NLM_holder_sz		4+NLM_netobj_sz
+#define NLM_lock_sz		3+NLM_caller_sz+NLM_owner_sz+NLM_fhandle_sz
+#define NLM_holder_sz		4+NLM_owner_sz
 
 #define NLM_testargs_sz		NLM_cookie_sz+1+NLM_lock_sz
 #define NLM_lockargs_sz		NLM_cookie_sz+4+NLM_lock_sz
@@ -531,10 +534,6 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 #define NLM_res_sz		NLM_cookie_sz+1
 #define NLM_norep_sz		0
 
-#ifndef MAX
-# define MAX(a, b)		(((a) > (b))? (a) : (b))
-#endif
-
 /*
  * For NLM, a void procedure really returns nothing
  */
@@ -545,7 +544,8 @@ nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	.p_proc      = NLMPROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nlmclt_encode_##argtype,		\
 	.p_decode    = (kxdrproc_t) nlmclt_decode_##restype,		\
-	.p_bufsiz    = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2,	\
+	.p_arglen    = NLM_##argtype##_sz,				\
+	.p_replen    = NLM_##restype##_sz,				\
 	.p_statidx   = NLMPROC_##proc,					\
 	.p_name      = #proc,						\
 	}
@@ -586,10 +586,6 @@ static struct rpc_version	nlm_version3 = {
 		.procs		= nlm_procedures,
 };
 
-#ifdef 	CONFIG_LOCKD_V4
-extern struct rpc_version nlm_version4;
-#endif
-
 static struct rpc_version *	nlm_versions[] = {
 	[1] = &nlm_version1,
 	[3] = &nlm_version3,
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index a78240551219..846fc1d639dd 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -123,7 +123,8 @@ static __be32 *
 nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 {
 	struct file_lock	*fl = &lock->fl;
-	__s64			len, start, end;
+	__u64			len, start;
+	__s64			end;
 
 	if (!(p = xdr_decode_string_inplace(p, &lock->caller,
 					    &lock->len, NLM_MAXSTRLEN))
@@ -417,7 +418,8 @@ nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	if (resp->status == nlm_lck_denied) {
 		struct file_lock	*fl = &resp->lock.fl;
 		u32			excl;
-		s64			start, end, len;
+		__u64			start, len;
+		__s64			end;
 
 		memset(&resp->lock, 0, sizeof(resp->lock));
 		locks_init_lock(fl);
@@ -516,17 +518,24 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	return 0;
 }
 
+#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
+#  error "NLM host name cannot be larger than XDR_MAX_NETOBJ!"
+#endif
+
+#if (NLMCLNT_OHSIZE > NLM_MAXSTRLEN)
+#  error "NLM host name cannot be larger than NLM's maximum string length!"
+#endif
+
 /*
  * Buffer requirements for NLM
  */
 #define NLM4_void_sz		0
 #define NLM4_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM4_caller_sz		1+XDR_QUADLEN(NLM_MAXSTRLEN)
-#define NLM4_netobj_sz		1+XDR_QUADLEN(XDR_MAX_NETOBJ)
-/* #define NLM4_owner_sz		1+XDR_QUADLEN(NLM4_MAXOWNER) */
+#define NLM4_caller_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
+#define NLM4_owner_sz		1+XDR_QUADLEN(NLMCLNT_OHSIZE)
 #define NLM4_fhandle_sz		1+XDR_QUADLEN(NFS3_FHSIZE)
-#define NLM4_lock_sz		5+NLM4_caller_sz+NLM4_netobj_sz+NLM4_fhandle_sz
-#define NLM4_holder_sz		6+NLM4_netobj_sz
+#define NLM4_lock_sz		5+NLM4_caller_sz+NLM4_owner_sz+NLM4_fhandle_sz
+#define NLM4_holder_sz		6+NLM4_owner_sz
 
 #define NLM4_testargs_sz	NLM4_cookie_sz+1+NLM4_lock_sz
 #define NLM4_lockargs_sz	NLM4_cookie_sz+4+NLM4_lock_sz
@@ -537,10 +546,6 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 #define NLM4_res_sz		NLM4_cookie_sz+1
 #define NLM4_norep_sz		0
 
-#ifndef MAX
-# define MAX(a,b)		(((a) > (b))? (a) : (b))
-#endif
-
 /*
  * For NLM, a void procedure really returns nothing
  */
@@ -551,7 +556,8 @@ nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 	.p_proc      = NLMPROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nlm4clt_encode_##argtype,		\
 	.p_decode    = (kxdrproc_t) nlm4clt_decode_##restype,		\
-	.p_bufsiz    = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2,	\
+	.p_arglen    = NLM4_##argtype##_sz,				\
+	.p_replen    = NLM4_##restype##_sz,				\
 	.p_statidx   = NLMPROC_##proc,					\
 	.p_name      = #proc,						\
 	}
diff --git a/fs/locks.c b/fs/locks.c
index 52a81005dab4..431a8b871fce 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -203,10 +203,6 @@ static void init_once(void *foo, struct kmem_cache *cache, unsigned long flags)
 {
 	struct file_lock *lock = (struct file_lock *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) !=
-					SLAB_CTOR_CONSTRUCTOR)
-		return;
-
 	locks_init_lock(lock);
 }
 
@@ -666,8 +662,7 @@ static int locks_block_on_timeout(struct file_lock *blocker, struct file_lock *w
 }
 
 int
-posix_test_lock(struct file *filp, struct file_lock *fl,
-		struct file_lock *conflock)
+posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
 
@@ -679,10 +674,11 @@ posix_test_lock(struct file *filp, struct file_lock *fl,
 			break;
 	}
 	if (cfl) {
-		__locks_copy_lock(conflock, cfl);
+		__locks_copy_lock(fl, cfl);
 		unlock_kernel();
 		return 1;
-	}
+	} else
+		fl->fl_type = F_UNLCK;
 	unlock_kernel();
 	return 0;
 }
@@ -801,7 +797,7 @@ out:
 	return error;
 }
 
-static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
 	struct file_lock *fl;
 	struct file_lock *new_fl = NULL;
@@ -1007,6 +1003,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
  * posix_lock_file - Apply a POSIX-style lock to a file
  * @filp: The file to apply the lock to
  * @fl: The lock to be applied
+ * @conflock: Place to return a copy of the conflicting lock, if found.
  *
  * Add a POSIX style lock to a file.
  * We merge adjacent & overlapping locks whenever possible.
@@ -1016,26 +1013,12 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
  * whether or not a lock was successfully freed by testing the return
  * value for -ENOENT.
  */
-int posix_lock_file(struct file *filp, struct file_lock *fl)
-{
-	return __posix_lock_file_conf(filp->f_path.dentry->d_inode, fl, NULL);
-}
-EXPORT_SYMBOL(posix_lock_file);
-
-/**
- * posix_lock_file_conf - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
- * @fl: The lock to be applied
- * @conflock: Place to return a copy of the conflicting lock, if found.
- *
- * Except for the conflock parameter, acts just like posix_lock_file.
- */
-int posix_lock_file_conf(struct file *filp, struct file_lock *fl,
+int posix_lock_file(struct file *filp, struct file_lock *fl,
 			struct file_lock *conflock)
 {
-	return __posix_lock_file_conf(filp->f_path.dentry->d_inode, fl, conflock);
+	return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock);
 }
-EXPORT_SYMBOL(posix_lock_file_conf);
+EXPORT_SYMBOL(posix_lock_file);
 
 /**
  * posix_lock_file_wait - Apply a POSIX-style lock to a file
@@ -1051,7 +1034,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = posix_lock_file(filp, fl);
+		error = posix_lock_file(filp, fl, NULL);
 		if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1123,7 +1106,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 	fl.fl_end = offset + count - 1;
 
 	for (;;) {
-		error = __posix_lock_file_conf(inode, &fl, NULL);
+		error = __posix_lock_file(inode, &fl, NULL);
 		if (error != -EAGAIN)
 			break;
 		if (!(fl.fl_flags & FL_SLEEP))
@@ -1611,12 +1594,63 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
 	return error;
 }
 
+/**
+ * vfs_test_lock - test file byte range lock
+ * @filp: The file to test lock for
+ * @fl: The lock to test
+ * @conf: Place to return a copy of the conflicting lock, if found
+ *
+ * Returns -ERRNO on failure.  Indicates presence of conflicting lock by
+ * setting conf->fl_type to something other than F_UNLCK.
+ */
+int vfs_test_lock(struct file *filp, struct file_lock *fl)
+{
+	if (filp->f_op && filp->f_op->lock)
+		return filp->f_op->lock(filp, F_GETLK, fl);
+	posix_test_lock(filp, fl);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfs_test_lock);
+
+static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
+{
+	flock->l_pid = fl->fl_pid;
+#if BITS_PER_LONG == 32
+	/*
+	 * Make sure we can represent the posix lock via
+	 * legacy 32bit flock.
+	 */
+	if (fl->fl_start > OFFT_OFFSET_MAX)
+		return -EOVERFLOW;
+	if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX)
+		return -EOVERFLOW;
+#endif
+	flock->l_start = fl->fl_start;
+	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
+		fl->fl_end - fl->fl_start + 1;
+	flock->l_whence = 0;
+	flock->l_type = fl->fl_type;
+	return 0;
+}
+
+#if BITS_PER_LONG == 32
+static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
+{
+	flock->l_pid = fl->fl_pid;
+	flock->l_start = fl->fl_start;
+	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
+		fl->fl_end - fl->fl_start + 1;
+	flock->l_whence = 0;
+	flock->l_type = fl->fl_type;
+}
+#endif
+
 /* Report the first existing lock that would conflict with l.
  * This implements the F_GETLK command of fcntl().
  */
 int fcntl_getlk(struct file *filp, struct flock __user *l)
 {
-	struct file_lock *fl, cfl, file_lock;
+	struct file_lock file_lock;
 	struct flock flock;
 	int error;
 
@@ -1631,38 +1665,15 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock) {
-		error = filp->f_op->lock(filp, F_GETLK, &file_lock);
-		if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private)
-			file_lock.fl_ops->fl_release_private(&file_lock);
-		if (error < 0)
-			goto out;
-		else
-		  fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
-	} else {
-		fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL);
-	}
+	error = vfs_test_lock(filp, &file_lock);
+	if (error)
+		goto out;
  
-	flock.l_type = F_UNLCK;
-	if (fl != NULL) {
-		flock.l_pid = fl->fl_pid;
-#if BITS_PER_LONG == 32
-		/*
-		 * Make sure we can represent the posix lock via
-		 * legacy 32bit flock.
-		 */
-		error = -EOVERFLOW;
-		if (fl->fl_start > OFFT_OFFSET_MAX)
+	flock.l_type = file_lock.fl_type;
+	if (file_lock.fl_type != F_UNLCK) {
+		error = posix_lock_to_flock(&flock, &file_lock);
+		if (error)
 			goto out;
-		if ((fl->fl_end != OFFSET_MAX)
-		    && (fl->fl_end > OFFT_OFFSET_MAX))
-			goto out;
-#endif
-		flock.l_start = fl->fl_start;
-		flock.l_len = fl->fl_end == OFFSET_MAX ? 0 :
-			fl->fl_end - fl->fl_start + 1;
-		flock.l_whence = 0;
-		flock.l_type = fl->fl_type;
 	}
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
@@ -1671,6 +1682,48 @@ out:
 	return error;
 }
 
+/**
+ * vfs_lock_file - file byte range lock
+ * @filp: The file to apply the lock to
+ * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
+ * @fl: The lock to be applied
+ * @conf: Place to return a copy of the conflicting lock, if found.
+ *
+ * A caller that doesn't care about the conflicting lock may pass NULL
+ * as the final argument.
+ *
+ * If the filesystem defines a private ->lock() method, then @conf will
+ * be left unchanged; so a caller that cares should initialize it to
+ * some acceptable default.
+ *
+ * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
+ * locks, the ->lock() interface may return asynchronously, before the lock has
+ * been granted or denied by the underlying filesystem, if (and only if)
+ * fl_grant is set. Callers expecting ->lock() to return asynchronously
+ * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
+ * the request is for a blocking lock. When ->lock() does return asynchronously,
+ * it must return -EINPROGRESS, and call ->fl_grant() when the lock
+ * request completes.
+ * If the request is for non-blocking lock the file system should return
+ * -EINPROGRESS then try to get the lock and call the callback routine with
+ * the result. If the request timed out the callback routine will return a
+ * nonzero return code and the file system should release the lock. The file
+ * system is also responsible to keep a corresponding posix lock when it
+ * grants a lock so the VFS can find out which locks are locally held and do
+ * the correct lock cleanup when required.
+ * The underlying filesystem must not drop the kernel lock or call
+ * ->fl_grant() before returning to the caller with a -EINPROGRESS
+ * return code.
+ */
+int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
+{
+	if (filp->f_op && filp->f_op->lock)
+		return filp->f_op->lock(filp, cmd, fl);
+	else
+		return posix_lock_file(filp, fl, conf);
+}
+EXPORT_SYMBOL_GPL(vfs_lock_file);
+
 /* Apply the lock described by l to an open file descriptor.
  * This implements both the F_SETLK and F_SETLKW commands of fcntl().
  */
@@ -1733,21 +1786,17 @@ again:
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock != NULL)
-		error = filp->f_op->lock(filp, cmd, file_lock);
-	else {
-		for (;;) {
-			error = posix_lock_file(filp, file_lock);
-			if ((error != -EAGAIN) || (cmd == F_SETLK))
-				break;
-			error = wait_event_interruptible(file_lock->fl_wait,
-					!file_lock->fl_next);
-			if (!error)
-				continue;
-
-			locks_delete_block(file_lock);
+	for (;;) {
+		error = vfs_lock_file(filp, cmd, file_lock, NULL);
+		if (error != -EAGAIN || cmd == F_SETLK)
 			break;
-		}
+		error = wait_event_interruptible(file_lock->fl_wait,
+				!file_lock->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(file_lock);
+		break;
 	}
 
 	/*
@@ -1770,7 +1819,7 @@ out:
  */
 int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 {
-	struct file_lock *fl, cfl, file_lock;
+	struct file_lock file_lock;
 	struct flock64 flock;
 	int error;
 
@@ -1785,27 +1834,14 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock) {
-		error = filp->f_op->lock(filp, F_GETLK, &file_lock);
-		if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private)
-			file_lock.fl_ops->fl_release_private(&file_lock);
-		if (error < 0)
-			goto out;
-		else
-		  fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
-	} else {
-		fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL);
-	}
- 
-	flock.l_type = F_UNLCK;
-	if (fl != NULL) {
-		flock.l_pid = fl->fl_pid;
-		flock.l_start = fl->fl_start;
-		flock.l_len = fl->fl_end == OFFSET_MAX ? 0 :
-			fl->fl_end - fl->fl_start + 1;
-		flock.l_whence = 0;
-		flock.l_type = fl->fl_type;
-	}
+	error = vfs_test_lock(filp, &file_lock);
+	if (error)
+		goto out;
+
+	flock.l_type = file_lock.fl_type;
+	if (file_lock.fl_type != F_UNLCK)
+		posix_lock_to_flock64(&flock, &file_lock);
+
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
 		error = 0;
@@ -1876,21 +1912,17 @@ again:
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock != NULL)
-		error = filp->f_op->lock(filp, cmd, file_lock);
-	else {
-		for (;;) {
-			error = posix_lock_file(filp, file_lock);
-			if ((error != -EAGAIN) || (cmd == F_SETLK64))
-				break;
-			error = wait_event_interruptible(file_lock->fl_wait,
-					!file_lock->fl_next);
-			if (!error)
-				continue;
-
-			locks_delete_block(file_lock);
+	for (;;) {
+		error = vfs_lock_file(filp, cmd, file_lock, NULL);
+		if (error != -EAGAIN || cmd == F_SETLK64)
 			break;
-		}
+		error = wait_event_interruptible(file_lock->fl_wait,
+				!file_lock->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(file_lock);
+		break;
 	}
 
 	/*
@@ -1935,10 +1967,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	lock.fl_ops = NULL;
 	lock.fl_lmops = NULL;
 
-	if (filp->f_op && filp->f_op->lock != NULL)
-		filp->f_op->lock(filp, F_SETLK, &lock);
-	else
-		posix_lock_file(filp, &lock);
+	vfs_lock_file(filp, F_SETLK, &lock, NULL);
 
 	if (lock.fl_ops && lock.fl_ops->fl_release_private)
 		lock.fl_ops->fl_release_private(&lock);
@@ -2015,6 +2044,22 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 
 EXPORT_SYMBOL(posix_unblock_lock);
 
+/**
+ * vfs_cancel_lock - file byte range unblock lock
+ * @filp: The file to apply the unblock to
+ * @fl: The lock to be unblocked
+ *
+ * Used by lock managers to cancel blocked requests
+ */
+int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
+{
+	if (filp->f_op && filp->f_op->lock)
+		return filp->f_op->lock(filp, F_CANCELLK, fl);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(vfs_cancel_lock);
+
 static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx)
 {
 	struct inode *inode = NULL;
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index c4a554df7b7e..99a12f127769 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -15,6 +15,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/sched.h>
 
 static int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
 
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index cb4cb571fddf..e207cbe70951 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -65,7 +65,6 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
 		if (!PageUptodate(page))
 			goto fail;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 92e383af3709..be4044614ac8 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -73,9 +73,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/mpage.c b/fs/mpage.c
index 692a3e578fc8..c1698f2291aa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -284,11 +284,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	}
 
 	if (first_hole != blocks_per_page) {
-		char *kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + (first_hole << blkbits), 0,
-				PAGE_CACHE_SIZE - (first_hole << blkbits));
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, first_hole << blkbits,
+				PAGE_CACHE_SIZE - (first_hole << blkbits),
+				KM_USER0);
 		if (first_hole == 0) {
 			SetPageUptodate(page);
 			unlock_page(page);
@@ -456,11 +454,18 @@ EXPORT_SYMBOL(mpage_readpage);
  * written, so it can intelligently allocate a suitably-sized BIO.  For now,
  * just allocate full-size (16-page) BIOs.
  */
-static struct bio *
-__mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
-	sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc,
-	writepage_t writepage_fn)
+struct mpage_data {
+	struct bio *bio;
+	sector_t last_block_in_bio;
+	get_block_t *get_block;
+	unsigned use_writepage;
+};
+
+static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+			     void *data)
 {
+	struct mpage_data *mpd = data;
+	struct bio *bio = mpd->bio;
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
@@ -478,6 +483,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
 	int length;
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
+	int ret = 0;
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
@@ -540,7 +546,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
 
 		map_bh.b_state = 0;
 		map_bh.b_size = 1 << blkbits;
-		if (get_block(inode, block_in_file, &map_bh, 1))
+		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
 			goto confused;
 		if (buffer_new(&map_bh))
 			unmap_underlying_metadata(map_bh.b_bdev,
@@ -576,20 +582,17 @@ page_is_mapped:
 		 * written out to the file."
 		 */
 		unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
-		char *kaddr;
 
 		if (page->index > end_index || !offset)
 			goto confused;
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
+				KM_USER0);
 	}
 
 	/*
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
-	if (bio && *last_block_in_bio != blocks[0] - 1)
+	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
 		bio = mpage_bio_submit(WRITE, bio);
 
 alloc_new:
@@ -646,7 +649,7 @@ alloc_new:
 					boundary_block, 1 << blkbits);
 		}
 	} else {
-		*last_block_in_bio = blocks[blocks_per_page - 1];
+		mpd->last_block_in_bio = blocks[blocks_per_page - 1];
 	}
 	goto out;
 
@@ -654,23 +657,19 @@ confused:
 	if (bio)
 		bio = mpage_bio_submit(WRITE, bio);
 
-	if (writepage_fn) {
-		*ret = (*writepage_fn)(page, wbc);
+	if (mpd->use_writepage) {
+		ret = mapping->a_ops->writepage(page, wbc);
 	} else {
-		*ret = -EAGAIN;
+		ret = -EAGAIN;
 		goto out;
 	}
 	/*
 	 * The caller has a ref on the inode, so *mapping is stable
 	 */
-	if (*ret) {
-		if (*ret == -ENOSPC)
-			set_bit(AS_ENOSPC, &mapping->flags);
-		else
-			set_bit(AS_EIO, &mapping->flags);
-	}
+	mapping_set_error(mapping, ret);
 out:
-	return bio;
+	mpd->bio = bio;
+	return ret;
 }
 
 /**
@@ -693,127 +692,27 @@ out:
  * the call was made get new I/O started against them.  If wbc->sync_mode is
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
- *
- * If you fix this you should check generic_writepages() also!
  */
 int
 mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	struct bio *bio = NULL;
-	sector_t last_block_in_bio = 0;
-	int ret = 0;
-	int done = 0;
-	int (*writepage)(struct page *page, struct writeback_control *wbc);
-	struct pagevec pvec;
-	int nr_pages;
-	pgoff_t index;
-	pgoff_t end;		/* Inclusive */
-	int scanned = 0;
-	int range_whole = 0;
-
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
-	writepage = NULL;
-	if (get_block == NULL)
-		writepage = mapping->a_ops->writepage;
-
-	pagevec_init(&pvec, 0);
-	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
-		end = -1;
-	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-		scanned = 1;
-	}
-retry:
-	while (!done && (index <= end) &&
-			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_DIRTY,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		unsigned i;
-
-		scanned = 1;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/*
-			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
-			 * mapping
-			 */
-
-			lock_page(page);
-
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				continue;
-			}
-
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
-			}
-
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
-
-			if (PageWriteback(page) ||
-					!clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
-			}
-
-			if (writepage) {
-				ret = (*writepage)(page, wbc);
-				if (ret) {
-					if (ret == -ENOSPC)
-						set_bit(AS_ENOSPC,
-							&mapping->flags);
-					else
-						set_bit(AS_EIO,
-							&mapping->flags);
-				}
-			} else {
-				bio = __mpage_writepage(bio, page, get_block,
-						&last_block_in_bio, &ret, wbc,
-						page->mapping->a_ops->writepage);
-			}
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
-				unlock_page(page);
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
-		}
-		pagevec_release(&pvec);
-		cond_resched();
+	int ret;
+
+	if (!get_block)
+		ret = generic_writepages(mapping, wbc);
+	else {
+		struct mpage_data mpd = {
+			.bio = NULL,
+			.last_block_in_bio = 0,
+			.get_block = get_block,
+			.use_writepage = 1,
+		};
+
+		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
+		if (mpd.bio)
+			mpage_bio_submit(WRITE, mpd.bio);
 	}
-	if (!scanned && !done) {
-		/*
-		 * We hit the last page and there is more work to be done: wrap
-		 * back to the start of the file
-		 */
-		scanned = 1;
-		index = 0;
-		goto retry;
-	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
-	if (bio)
-		mpage_bio_submit(WRITE, bio);
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepages);
@@ -821,15 +720,15 @@ EXPORT_SYMBOL(mpage_writepages);
 int mpage_writepage(struct page *page, get_block_t get_block,
 	struct writeback_control *wbc)
 {
-	int ret = 0;
-	struct bio *bio;
-	sector_t last_block_in_bio = 0;
-
-	bio = __mpage_writepage(NULL, page, get_block,
-			&last_block_in_bio, &ret, wbc, NULL);
-	if (bio)
-		mpage_bio_submit(WRITE, bio);
-
+	struct mpage_data mpd = {
+		.bio = NULL,
+		.last_block_in_bio = 0,
+		.get_block = get_block,
+		.use_writepage = 0,
+	};
+	int ret = __mpage_writepage(page, wbc, &mpd);
+	if (mpd.bio)
+		mpage_bio_submit(WRITE, mpd.bio);
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namei.c b/fs/namei.c
index 880052cadbcd..5e2d98d10c5d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -22,7 +22,6 @@
 #include <linux/quotaops.h>
 #include <linux/pagemap.h>
 #include <linux/fsnotify.h>
-#include <linux/smp_lock.h>
 #include <linux/personality.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -1153,14 +1152,12 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 
 		fput_light(file, fput_needed);
 	}
-	current->total_link_count = 0;
-	retval = link_path_walk(name, nd);
+
+	retval = path_walk(name, nd);
 out:
-	if (likely(retval == 0)) {
-		if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
+	if (unlikely(!retval && !audit_dummy_context() && nd->dentry &&
 				nd->dentry->d_inode))
 		audit_inode(name, nd->dentry->d_inode);
-	}
 out_fail:
 	return retval;
 
@@ -1350,17 +1347,6 @@ struct dentry *lookup_one_len_kern(const char *name, struct dentry *base, int le
 	return __lookup_hash_kern(&this, base, NULL);
 }
 
-/*
- *	namei()
- *
- * is used by most simple commands to get the inode of a specified name.
- * Open, link etc use their own routines, but this is enough for things
- * like 'chmod' etc.
- *
- * namei exists in two versions: namei/lnamei. The only difference is
- * that namei follows links, while lnamei does not.
- * SMP-safe
- */
 int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags,
 			    struct nameidata *nd)
 {
@@ -1733,7 +1719,7 @@ do_last:
 	 * It already exists.
 	 */
 	mutex_unlock(&dir->d_inode->i_mutex);
-	audit_inode_update(path.dentry->d_inode);
+	audit_inode(pathname, path.dentry->d_inode);
 
 	error = -EEXIST;
 	if (flag & O_EXCL)
@@ -2671,19 +2657,9 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 	struct address_space *mapping = dentry->d_inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto sync_fail;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page))
-		goto async_fail;
+		return (char*)page;
 	*ppage = page;
 	return kmap(page);
-
-async_fail:
-	page_cache_release(page);
-	return ERR_PTR(-EIO);
-
-sync_fail:
-	return (char*)page;
 }
 
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
diff --git a/fs/namespace.c b/fs/namespace.c
index fd999cab7b57..b696e3a0d18f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -377,6 +377,10 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 	seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
 	seq_putc(m, ' ');
 	mangle(m, mnt->mnt_sb->s_type->name);
+	if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) {
+		seq_putc(m, '.');
+		mangle(m, mnt->mnt_sb->s_subtype);
+	}
 	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
 	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
 		if (mnt->mnt_sb->s_flags & fs_infop->flag)
@@ -495,7 +499,7 @@ void release_mounts(struct list_head *head)
 {
 	struct vfsmount *mnt;
 	while (!list_empty(head)) {
-		mnt = list_entry(head->next, struct vfsmount, mnt_hash);
+		mnt = list_first_entry(head, struct vfsmount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
 		if (mnt->mnt_parent != mnt) {
 			struct dentry *dentry;
@@ -882,6 +886,9 @@ static int do_change_type(struct nameidata *nd, int flag)
 	int recurse = flag & MS_REC;
 	int type = flag & ~MS_REC;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	if (nd->dentry != nd->mnt->mnt_root)
 		return -EINVAL;
 
@@ -1173,7 +1180,7 @@ static void expire_mount_list(struct list_head *graveyard, struct list_head *mou
 
 	while (!list_empty(graveyard)) {
 		LIST_HEAD(umounts);
-		mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
+		mnt = list_first_entry(graveyard, struct vfsmount, mnt_expire);
 		list_del_init(&mnt->mnt_expire);
 
 		/* don't do anything if the namespace is dead - all the
@@ -1441,10 +1448,9 @@ dput_out:
  * Allocate a new namespace structure and populate it with contents
  * copied from the namespace of the passed in task structure.
  */
-struct mnt_namespace *dup_mnt_ns(struct task_struct *tsk,
+static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		struct fs_struct *fs)
 {
-	struct mnt_namespace *mnt_ns = tsk->nsproxy->mnt_ns;
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
 	struct vfsmount *p, *q;
@@ -1509,36 +1515,21 @@ struct mnt_namespace *dup_mnt_ns(struct task_struct *tsk,
 	return new_ns;
 }
 
-int copy_mnt_ns(int flags, struct task_struct *tsk)
+struct mnt_namespace *copy_mnt_ns(int flags, struct mnt_namespace *ns,
+		struct fs_struct *new_fs)
 {
-	struct mnt_namespace *ns = tsk->nsproxy->mnt_ns;
 	struct mnt_namespace *new_ns;
-	int err = 0;
-
-	if (!ns)
-		return 0;
 
+	BUG_ON(!ns);
 	get_mnt_ns(ns);
 
 	if (!(flags & CLONE_NEWNS))
-		return 0;
+		return ns;
 
-	if (!capable(CAP_SYS_ADMIN)) {
-		err = -EPERM;
-		goto out;
-	}
-
-	new_ns = dup_mnt_ns(tsk, tsk->fs);
-	if (!new_ns) {
-		err = -ENOMEM;
-		goto out;
-	}
+	new_ns = dup_mnt_ns(ns, new_fs);
 
-	tsk->nsproxy->mnt_ns = new_ns;
-
-out:
 	put_mnt_ns(ns);
-	return err;
+	return new_ns;
 }
 
 asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6b1f6d27099a..d3152f8d95c6 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,7 +17,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/smp_lock.h>
+#include <linux/sched.h>
 
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 7285c94956c4..cf06eb9f050e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -60,11 +60,8 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		mutex_init(&ei->open_mutex);
-		inode_init_once(&ei->vfs_inode);
-	}
+	mutex_init(&ei->open_mutex);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 8843a83d4ef0..c67b4bdcf719 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,6 +17,7 @@
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
+#include <linux/sched.h>
 
 #include <linux/ncp_fs.h>
 
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index db3d7919c601..c2bb14e053e1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -24,7 +24,7 @@ enum nfs4_callback_opnum {
 };
 
 struct cb_compound_hdr_arg {
-	int taglen;
+	unsigned int taglen;
 	const char *tag;
 	unsigned int callback_ident;
 	unsigned nops;
@@ -32,7 +32,7 @@ struct cb_compound_hdr_arg {
 
 struct cb_compound_hdr_res {
 	__be32 *status;
-	int taglen;
+	unsigned int taglen;
 	const char *tag;
 	__be32 *nops;
 };
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2190e6c2792e..881fa4900923 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -12,7 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-
+#include <linux/sched.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -27,7 +27,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
@@ -618,7 +617,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat
 	if (clp->cl_nfsversion == 3) {
 		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
 			server->namelen = NFS3_MAXNAMLEN;
-		server->caps |= NFS_CAP_READDIRPLUS;
+		if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
+			server->caps |= NFS_CAP_READDIRPLUS;
 	} else {
 		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
 			server->namelen = NFS2_MAXNAMLEN;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 841c99a9b11c..7f37d1bea83f 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -226,7 +226,7 @@ restart:
 	spin_unlock(&clp->cl_lock);
 }
 
-int nfs_do_expire_all_delegations(void *ptr)
+static int nfs_do_expire_all_delegations(void *ptr)
 {
 	struct nfs_client *clp = ptr;
 	struct nfs_delegation *delegation;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cd3469720cbf..c27258b5d3e1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,12 +33,12 @@
 #include <linux/pagevec.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/sched.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
 
-#define NFS_PARANOIA 1
 /* #define NFS_DEBUG_VERBOSE 1 */
 
 static int nfs_opendir(struct inode *, struct file *);
@@ -154,6 +154,8 @@ typedef struct {
 	decode_dirent_t	decode;
 	int		plus;
 	int		error;
+	unsigned long	timestamp;
+	int		timestamp_valid;
 } nfs_readdir_descriptor_t;
 
 /* Now we cache directories properly, by stuffing the dirent
@@ -195,6 +197,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		}
 		goto error;
 	}
+	desc->timestamp = timestamp;
+	desc->timestamp_valid = 1;
 	SetPageUptodate(page);
 	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
@@ -225,6 +229,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 	desc->ptr = p;
+	if (desc->timestamp_valid)
+		desc->entry->fattr->time_start = desc->timestamp;
+	else
+		desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
 	return 0;
 }
 
@@ -316,14 +324,16 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
 			__FUNCTION__, desc->page_index,
 			(long long) *desc->dir_cookie);
 
+	/* If we find the page in the page_cache, we cannot be sure
+	 * how fresh the data is, so we will ignore readdir_plus attributes.
+	 */
+	desc->timestamp_valid = 0;
 	page = read_cache_page(inode->i_mapping, desc->page_index,
 			       (filler_t *)nfs_readdir_filler, desc);
 	if (IS_ERR(page)) {
 		status = PTR_ERR(page);
 		goto out;
 	}
-	if (!PageUptodate(page))
-		goto read_error;
 
 	/* NOTE: Someone else may have changed the READDIRPLUS flag */
 	desc->page = page;
@@ -337,9 +347,6 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status);
 	return status;
- read_error:
-	page_cache_release(page);
-	return -EIO;
 }
 
 /*
@@ -468,6 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct rpc_cred	*cred = nfs_file_cred(file);
 	struct page	*page = NULL;
 	int		status;
+	unsigned long	timestamp;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -477,6 +485,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		status = -ENOMEM;
 		goto out;
 	}
+	timestamp = jiffies;
 	desc->error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie,
 						page,
 						NFS_SERVER(inode)->dtsize,
@@ -487,6 +496,8 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
+		desc->timestamp = timestamp;
+		desc->timestamp_valid = 1;
 		if ((status = dir_decode(desc)) == 0)
 			desc->entry->prev_cookie = *desc->dir_cookie;
 	} else
@@ -597,7 +608,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	return res;
 }
 
-loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
+static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
 {
 	mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
 	switch (origin) {
@@ -623,7 +634,7 @@ out:
  * All directory operations under NFS are synchronous, so fsync()
  * is a dummy operation.
  */
-int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
+static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
 {
 	dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -639,12 +650,15 @@ int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
  */
 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
+	unsigned long verf;
+
 	if (IS_ROOT(dentry))
 		return 1;
-	if ((NFS_I(dir)->cache_validity & NFS_INO_INVALID_ATTR) != 0
-			|| nfs_attribute_timeout(dir))
+	verf = (unsigned long)dentry->d_fsdata;
+	if (nfs_caches_unstable(dir)
+			|| verf != NFS_I(dir)->cache_change_attribute)
 		return 0;
-	return nfs_verify_change_attribute(dir, (unsigned long)dentry->d_fsdata);
+	return 1;
 }
 
 static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
@@ -654,8 +668,7 @@ static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
 
 static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf)
 {
-	if (time_after(verf, (unsigned long)dentry->d_fsdata))
-		nfs_set_verifier(dentry, verf);
+	nfs_set_verifier(dentry, verf);
 }
 
 /*
@@ -754,6 +767,10 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
 	inode = dentry->d_inode;
 
+	/* Revalidate parent directory attribute cache */
+	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+		goto out_zap_parent;
+
 	if (!inode) {
 		if (nfs_neg_need_reval(dir, dentry, nd))
 			goto out_bad;
@@ -767,10 +784,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 		goto out_bad;
 	}
 
-	/* Revalidate parent directory attribute cache */
-	if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
-		goto out_zap_parent;
-
 	/* Force a full look up iff the parent directory has changed */
 	if (nfs_check_verifier(dir, dentry)) {
 		if (nfs_lookup_verify_inode(inode, nd))
@@ -849,6 +862,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
 	nfs_inode_return_delegation(inode);
+	if (S_ISDIR(inode->i_mode))
+		/* drop any readdir cache as it could easily be old */
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		lock_kernel();
 		drop_nlink(inode);
@@ -1345,11 +1362,6 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 		atomic_read(&dentry->d_count));
 	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
 
-#ifdef NFS_PARANOIA
-if (!dentry->d_inode)
-printk("NFS: silly-renaming %s/%s, negative dentry??\n",
-dentry->d_parent->d_name.name, dentry->d_name.name);
-#endif
 	/*
 	 * We don't allow a dentry to be silly-renamed twice.
 	 */
@@ -1666,16 +1678,9 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			new_inode = NULL;
 			/* instantiate the replacement target */
 			d_instantiate(new_dentry, NULL);
-		} else if (atomic_read(&new_dentry->d_count) > 1) {
-		/* dentry still busy? */
-#ifdef NFS_PARANOIA
-			printk("nfs_rename: target %s/%s busy, d_count=%d\n",
-			       new_dentry->d_parent->d_name.name,
-			       new_dentry->d_name.name,
-			       atomic_read(&new_dentry->d_count));
-#endif
+		} else if (atomic_read(&new_dentry->d_count) > 1)
+			/* dentry still busy? */
 			goto out;
-		}
 	} else
 		drop_nlink(new_inode);
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2877744cb606..00eee87510fe 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -41,7 +41,6 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
-#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/kref.h>
@@ -54,6 +53,7 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 
+#include "internal.h"
 #include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
@@ -122,19 +122,25 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_direct_dirty_pages(struct page **pages, int npages)
+static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count)
 {
-	int i;
+	unsigned int npages;
+	unsigned int i;
+
+	if (count == 0)
+		return;
+	pages += (pgbase >> PAGE_SHIFT);
+	npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	for (i = 0; i < npages; i++) {
 		struct page *page = pages[i];
 		if (!PageCompound(page))
-			set_page_dirty_lock(page);
+			set_page_dirty(page);
 	}
 }
 
-static void nfs_direct_release_pages(struct page **pages, int npages)
+static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 {
-	int i;
+	unsigned int i;
 	for (i = 0; i < npages; i++)
 		page_cache_release(pages[i]);
 }
@@ -162,7 +168,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	return dreq;
 }
 
-static void nfs_direct_req_release(struct kref *kref)
+static void nfs_direct_req_free(struct kref *kref)
 {
 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
@@ -171,6 +177,11 @@ static void nfs_direct_req_release(struct kref *kref)
 	kmem_cache_free(nfs_direct_cachep, dreq);
 }
 
+static void nfs_direct_req_release(struct nfs_direct_req *dreq)
+{
+	kref_put(&dreq->kref, nfs_direct_req_free);
+}
+
 /*
  * Collects and returns the final error value/byte-count.
  */
@@ -190,7 +201,6 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
 		result = dreq->count;
 
 out:
-	kref_put(&dreq->kref, nfs_direct_req_release);
 	return (ssize_t) result;
 }
 
@@ -208,7 +218,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 	}
 	complete_all(&dreq->completion);
 
-	kref_put(&dreq->kref, nfs_direct_req_release);
+	nfs_direct_req_release(dreq);
 }
 
 /*
@@ -224,17 +234,18 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
-	nfs_direct_dirty_pages(data->pagevec, data->npages);
-	nfs_direct_release_pages(data->pagevec, data->npages);
-
 	spin_lock(&dreq->lock);
-
-	if (likely(task->tk_status >= 0))
-		dreq->count += data->res.count;
-	else
+	if (unlikely(task->tk_status < 0)) {
 		dreq->error = task->tk_status;
-
-	spin_unlock(&dreq->lock);
+		spin_unlock(&dreq->lock);
+	} else {
+		dreq->count += data->res.count;
+		spin_unlock(&dreq->lock);
+		nfs_direct_dirty_pages(data->pagevec,
+				data->args.pgbase,
+				data->res.count);
+	}
+	nfs_direct_release_pages(data->pagevec, data->npages);
 
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
@@ -271,7 +282,7 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 		bytes = min(rsize,count);
 
 		result = -ENOMEM;
-		data = nfs_readdata_alloc(pgbase + bytes);
+		data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes));
 		if (unlikely(!data))
 			break;
 
@@ -279,9 +290,12 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 1, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
+		if (result < 0) {
+			nfs_readdata_release(data);
+			break;
+		}
+		if ((unsigned)result < data->npages) {
+			nfs_direct_release_pages(data->pagevec, result);
 			nfs_readdata_release(data);
 			break;
 		}
@@ -359,6 +373,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!result)
 		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
+	nfs_direct_req_release(dreq);
 
 	return result;
 }
@@ -602,7 +617,7 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 		bytes = min(wsize,count);
 
 		result = -ENOMEM;
-		data = nfs_writedata_alloc(pgbase + bytes);
+		data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes));
 		if (unlikely(!data))
 			break;
 
@@ -610,9 +625,12 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 0, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
+		if (result < 0) {
+			nfs_writedata_release(data);
+			break;
+		}
+		if ((unsigned)result < data->npages) {
+			nfs_direct_release_pages(data->pagevec, result);
 			nfs_writedata_release(data);
 			break;
 		}
@@ -703,6 +721,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (!result)
 		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
+	nfs_direct_req_release(dreq);
 
 	return result;
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8e66b5a2d490..9eb8eb4e4a08 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -391,17 +392,12 @@ out_swapfile:
 
 static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct file_lock cfl;
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
 
 	lock_kernel();
 	/* Try local locking first */
-	if (posix_test_lock(filp, fl, &cfl)) {
-		fl->fl_start = cfl.fl_start;
-		fl->fl_end = cfl.fl_end;
-		fl->fl_type = cfl.fl_type;
-		fl->fl_pid = cfl.fl_pid;
+	if (posix_test_lock(filp, fl)) {
 		goto out;
 	}
 
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 6ef268f7c300..d1cbf0a0fbb2 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -25,7 +25,6 @@
 #include <linux/nfs_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
@@ -42,7 +41,6 @@
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
-#define NFS_PARANOIA 1
 
 /*
  * get an NFS2/NFS3 root dentry from the root filehandle
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9d4a6b2d1996..d11eb055265c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -272,7 +272,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	mutex_unlock(&idmap->idmap_im_lock);
 	schedule();
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&idmap->idmap_wq, &wq);
 	mutex_lock(&idmap->idmap_im_lock);
 
@@ -333,7 +333,7 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	mutex_unlock(&idmap->idmap_im_lock);
 	schedule();
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&idmap->idmap_wq, &wq);
 	mutex_lock(&idmap->idmap_im_lock);
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 44aa9b726573..bd9f5a836592 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -15,7 +15,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-
+#include <linux/sched.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -48,7 +48,6 @@
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
-#define NFS_PARANOIA 1
 
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -1075,10 +1074,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/*
 	 * Big trouble! The inode has become a different object.
 	 */
-#ifdef NFS_PARANOIA
 	printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n",
 			__FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode);
-#endif
  out_err:
 	/*
 	 * No need to worry about unhashing the dentry, as the
@@ -1167,22 +1164,19 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		inode_init_once(&nfsi->vfs_inode);
-		spin_lock_init(&nfsi->req_lock);
-		INIT_LIST_HEAD(&nfsi->dirty);
-		INIT_LIST_HEAD(&nfsi->commit);
-		INIT_LIST_HEAD(&nfsi->open_files);
-		INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
-		INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
-		INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-		atomic_set(&nfsi->data_updates, 0);
-		nfsi->ndirty = 0;
-		nfsi->ncommit = 0;
-		nfsi->npages = 0;
-		nfs4_init_once(nfsi);
-	}
+	inode_init_once(&nfsi->vfs_inode);
+	spin_lock_init(&nfsi->req_lock);
+	INIT_LIST_HEAD(&nfsi->dirty);
+	INIT_LIST_HEAD(&nfsi->commit);
+	INIT_LIST_HEAD(&nfsi->open_files);
+	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
+	atomic_set(&nfsi->data_updates, 0);
+	nfsi->ndirty = 0;
+	nfsi->ncommit = 0;
+	nfsi->npages = 0;
+	nfs4_init_once(nfsi);
 }
  
 static int __init nfs_init_inodecache(void)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6610f2b02077..ad2b40db1e65 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -231,3 +231,15 @@ unsigned int nfs_page_length(struct page *page)
 	}
 	return 0;
 }
+
+/*
+ * Determine the number of pages in an array of length 'len' and
+ * with a base offset of 'base'
+ */
+static inline
+unsigned int nfs_page_array_len(unsigned int base, size_t len)
+{
+	return ((unsigned long)len + (unsigned long)base +
+		PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index f75fe72b4160..ca5a266a3140 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -133,13 +133,15 @@ xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
 
 #define MNT_dirpath_sz		(1 + 256)
 #define MNT_fhstatus_sz		(1 + 8)
+#define MNT_fhstatus3_sz	(1 + 16)
 
 static struct rpc_procinfo	mnt_procedures[] = {
 [MNTPROC_MNT] = {
 	  .p_proc		= MNTPROC_MNT,
 	  .p_encode		= (kxdrproc_t) xdr_encode_dirpath,	
 	  .p_decode		= (kxdrproc_t) xdr_decode_fhstatus,
-	  .p_bufsiz		= MNT_dirpath_sz << 2,
+	  .p_arglen		= MNT_dirpath_sz,
+	  .p_replen		= MNT_fhstatus_sz,
 	  .p_statidx		= MNTPROC_MNT,
 	  .p_name		= "MOUNT",
 	},
@@ -150,7 +152,8 @@ static struct rpc_procinfo mnt3_procedures[] = {
 	  .p_proc		= MOUNTPROC3_MNT,
 	  .p_encode		= (kxdrproc_t) xdr_encode_dirpath,
 	  .p_decode		= (kxdrproc_t) xdr_decode_fhstatus3,
-	  .p_bufsiz		= MNT_dirpath_sz << 2,
+	  .p_arglen		= MNT_dirpath_sz,
+	  .p_replen		= MNT_fhstatus3_sz,
 	  .p_statidx		= MOUNTPROC3_MNT,
 	  .p_name		= "MOUNT",
 	},
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 3be4e72a0227..cd3ca7b5d3db 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -26,7 +26,6 @@
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
-/* #define NFS_PARANOIA 1 */
 
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO		EIO
@@ -687,16 +686,13 @@ nfs_stat_to_errno(int stat)
 	return nfs_errtbl[i].errno;
 }
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, argtype, restype, timer)				\
 [NFSPROC_##proc] = {							\
 	.p_proc	    =  NFSPROC_##proc,					\
 	.p_encode   =  (kxdrproc_t) nfs_xdr_##argtype,			\
 	.p_decode   =  (kxdrproc_t) nfs_xdr_##restype,			\
-	.p_bufsiz   =  MAX(NFS_##argtype##_sz,NFS_##restype##_sz) << 2,	\
+	.p_arglen   =  NFS_##argtype##_sz,				\
+	.p_replen   =  NFS_##restype##_sz,				\
 	.p_timer    =  timer,						\
 	.p_statidx  =  NFSPROC_##proc,					\
 	.p_name     =  #proc,						\
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7d0371e2bad5..45268d6def2e 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -16,7 +16,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include <linux/nfs_mount.h>
 
 #include "iostat.h"
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0ace092d126f..b51df8eb9f01 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1102,16 +1102,13 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 }
 #endif  /* CONFIG_NFS_V3_ACL */
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, argtype, restype, timer)				\
 [NFS3PROC_##proc] = {							\
 	.p_proc      = NFS3PROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nfs3_xdr_##argtype,			\
 	.p_decode    = (kxdrproc_t) nfs3_xdr_##restype,			\
-	.p_bufsiz    = MAX(NFS3_##argtype##_sz,NFS3_##restype##_sz) << 2,	\
+	.p_arglen    = NFS3_##argtype##_sz,				\
+	.p_replen    = NFS3_##restype##_sz,				\
 	.p_timer     = timer,						\
 	.p_statidx   = NFS3PROC_##proc,					\
 	.p_name      = #proc,						\
@@ -1153,7 +1150,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
 		.p_proc = ACLPROC3_GETACL,
 		.p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
 		.p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
-		.p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2,
+		.p_arglen = ACL3_getaclargs_sz,
+		.p_replen = ACL3_getaclres_sz,
 		.p_timer = 1,
 		.p_name = "GETACL",
 	},
@@ -1161,7 +1159,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
 		.p_proc = ACLPROC3_SETACL,
 		.p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
 		.p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
-		.p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2,
+		.p_arglen = ACL3_setaclargs_sz,
+		.p_replen = ACL3_setaclres_sz,
 		.p_timer = 0,
 		.p_name = "SETACL",
 	},
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f52cf5c33c6c..648e0ac0f90e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -790,7 +790,7 @@ out:
 	return -EACCES;
 }
 
-int nfs4_recover_expired_lease(struct nfs_server *server)
+static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
 	int ret;
@@ -2647,8 +2647,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 	nfs_inode_return_delegation(inode);
 	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-	if (ret == 0)
-		nfs4_write_cached_acl(inode, buf, buflen);
+	nfs_zap_caches(inode);
 	return ret;
 }
 
@@ -2749,7 +2748,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 /* This is the error handling routine for processes that are allowed
  * to sleep.
  */
-int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
 {
 	struct nfs_client *clp = server->nfs_client;
 	int ret = errorcode;
@@ -3018,6 +3017,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 		case -NFS4ERR_DENIED:
 			status = 0;
 	}
+	request->fl_ops->fl_release_private(request);
 out:
 	up_read(&clp->cl_sem);
 	return status;
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index f5f4430fb2a4..0505ca124034 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -43,7 +43,6 @@
  * child task framework of the RPC layer?
  */
 
-#include <linux/smp_lock.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/sunrpc/sched.h>
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5fffbdfa971f..8ed79d5c54f9 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -104,7 +104,7 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 	return cred;
 }
 
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
+static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f02d522fd788..8003c91ccb9a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -224,7 +224,8 @@ static int nfs4_stat_to_errno(int);
                                 encode_getattr_maxsz)
 #define NFS4_dec_setattr_sz     (compound_decode_hdr_maxsz + \
                                 decode_putfh_maxsz + \
-                                op_decode_hdr_maxsz + 3)
+                                op_decode_hdr_maxsz + 3 + \
+                                nfs4_fattr_maxsz)
 #define NFS4_enc_fsinfo_sz	(compound_encode_hdr_maxsz + \
 				encode_putfh_maxsz + \
 				encode_fsinfo_maxsz)
@@ -645,10 +646,10 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8+sizeof(arg->stateid->data));
+	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
 	WRITE32(OP_CLOSE);
 	WRITE32(arg->seqid->sequence->counter);
-	WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
+	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	
 	return 0;
 }
@@ -792,17 +793,17 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
 	WRITE64(nfs4_lock_length(args->fl));
 	WRITE32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		RESERVE_SPACE(40);
+		RESERVE_SPACE(4+NFS4_STATEID_SIZE+20);
 		WRITE32(args->open_seqid->sequence->counter);
-		WRITEMEM(args->open_stateid->data, sizeof(args->open_stateid->data));
+		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
 		WRITE32(args->lock_seqid->sequence->counter);
 		WRITE64(args->lock_owner.clientid);
 		WRITE32(4);
 		WRITE32(args->lock_owner.id);
 	}
 	else {
-		RESERVE_SPACE(20);
-		WRITEMEM(args->lock_stateid->data, sizeof(args->lock_stateid->data));
+		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
+		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
 		WRITE32(args->lock_seqid->sequence->counter);
 	}
 
@@ -829,11 +830,11 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(44);
+	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
 	WRITE32(OP_LOCKU);
 	WRITE32(nfs4_lock_type(args->fl, 0));
 	WRITE32(args->seqid->sequence->counter);
-	WRITEMEM(args->stateid->data, sizeof(args->stateid->data));
+	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
 
@@ -965,9 +966,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+sizeof(stateid->data));
+	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	WRITEMEM(stateid->data, sizeof(stateid->data));
+	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -995,9 +996,9 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8+sizeof(arg->stateid->data));
+	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	WRITE32(OP_OPEN_CONFIRM);
-	WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
+	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	WRITE32(arg->seqid->sequence->counter);
 
 	return 0;
@@ -1007,9 +1008,9 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8+sizeof(arg->stateid->data));
+	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	WRITE32(OP_OPEN_DOWNGRADE);
-	WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
+	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	WRITE32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->open_flags);
 	return 0;
@@ -1044,12 +1045,12 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	nfs4_stateid stateid;
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	RESERVE_SPACE(NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		WRITEMEM(stateid.data, sizeof(stateid.data));
+		WRITEMEM(stateid.data, NFS4_STATEID_SIZE);
 	} else
-		WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
+		WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
@@ -1078,10 +1079,10 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	int replen;
 	__be32 *p;
 
-	RESERVE_SPACE(32+sizeof(nfs4_verifier));
+	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	WRITE32(OP_READDIR);
 	WRITE64(readdir->cookie);
-	WRITEMEM(readdir->verifier.data, sizeof(readdir->verifier.data));
+	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	WRITE32(readdir->count >> 1);  /* We're not doing readdirplus */
 	WRITE32(readdir->count);
 	WRITE32(2);
@@ -1189,9 +1190,9 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+sizeof(zero_stateid.data));
+	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	WRITE32(OP_SETATTR);
-	WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
+	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
 	WRITE32(1);
 	WRITE32(FATTR4_WORD0_ACL);
@@ -1219,9 +1220,9 @@ static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *
 	int status;
 	__be32 *p;
 	
-        RESERVE_SPACE(4+sizeof(arg->stateid.data));
+        RESERVE_SPACE(4+NFS4_STATEID_SIZE);
         WRITE32(OP_SETATTR);
-	WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
 
         if ((status = encode_attrs(xdr, arg->iap, server)))
 		return status;
@@ -1233,9 +1234,9 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
+	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
 	WRITE32(OP_SETCLIENTID);
-	WRITEMEM(setclientid->sc_verifier->data, sizeof(setclientid->sc_verifier->data));
+	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
@@ -1252,10 +1253,10 @@ static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_c
 {
         __be32 *p;
 
-        RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data));
+        RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
         WRITE32(OP_SETCLIENTID_CONFIRM);
         WRITE64(client_state->cl_clientid);
-        WRITEMEM(client_state->cl_confirm.data, sizeof(client_state->cl_confirm.data));
+        WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 
         return 0;
 }
@@ -1283,10 +1284,10 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
 {
 	__be32 *p;
 
-	RESERVE_SPACE(20);
+	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
 	WRITE32(OP_DELEGRETURN);
-	WRITEMEM(stateid->data, sizeof(stateid->data));
+	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	return 0;
 
 }
@@ -2079,9 +2080,11 @@ out:
 
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
-	if (!p) { \
-		printk(KERN_WARNING "%s: reply buffer overflowed in line %d.", \
-			       	__FUNCTION__, __LINE__); \
+	if (unlikely(!p)) { \
+		printk(KERN_INFO "%s: prematurely hit end of receive" \
+				" buffer\n", __FUNCTION__); \
+		printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
+				__FUNCTION__, xdr->p, nbytes, xdr->end); \
 		return -EIO; \
 	} \
 } while (0)
@@ -2491,7 +2494,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 				int i;
 				dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
 				for (i = loc->nservers; i < m; i++) {
-					int len;
+					unsigned int len;
 					char *data;
 					status = decode_opaque_inline(xdr, &len, &data);
 					if (unlikely(status != 0))
@@ -2639,7 +2642,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	return 0;
 }
 
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
 {
 	uint32_t len;
 	__be32 *p;
@@ -2664,7 +2667,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	return 0;
 }
 
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
 {
 	uint32_t len;
 	__be32 *p;
@@ -2894,8 +2897,8 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 	status = decode_op_hdr(xdr, OP_CLOSE);
 	if (status)
 		return status;
-	READ_BUF(sizeof(res->stateid.data));
-	COPYMEM(res->stateid.data, sizeof(res->stateid.data));
+	READ_BUF(NFS4_STATEID_SIZE);
+	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 	return 0;
 }
 
@@ -3183,8 +3186,8 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 
 	status = decode_op_hdr(xdr, OP_LOCK);
 	if (status == 0) {
-		READ_BUF(sizeof(res->stateid.data));
-		COPYMEM(res->stateid.data, sizeof(res->stateid.data));
+		READ_BUF(NFS4_STATEID_SIZE);
+		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 	} else if (status == -NFS4ERR_DENIED)
 		return decode_lock_denied(xdr, NULL);
 	return status;
@@ -3206,8 +3209,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
 	if (status == 0) {
-		READ_BUF(sizeof(res->stateid.data));
-		COPYMEM(res->stateid.data, sizeof(res->stateid.data));
+		READ_BUF(NFS4_STATEID_SIZE);
+		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 	}
 	return status;
 }
@@ -3248,8 +3251,8 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 		res->delegation_type = 0;
 		return 0;
 	}
-	READ_BUF(20);
-	COPYMEM(res->delegation.data, sizeof(res->delegation.data));
+	READ_BUF(NFS4_STATEID_SIZE+4);
+	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
 	READ32(res->do_recall);
 	switch (delegation_type) {
 		case NFS4_OPEN_DELEGATE_READ:
@@ -3272,8 +3275,8 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
         status = decode_op_hdr(xdr, OP_OPEN);
         if (status)
                 return status;
-        READ_BUF(sizeof(res->stateid.data));
-        COPYMEM(res->stateid.data, sizeof(res->stateid.data));
+        READ_BUF(NFS4_STATEID_SIZE);
+        COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 
         decode_change_info(xdr, &res->cinfo);
 
@@ -3299,8 +3302,8 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
         status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
         if (status)
                 return status;
-        READ_BUF(sizeof(res->stateid.data));
-        COPYMEM(res->stateid.data, sizeof(res->stateid.data));
+        READ_BUF(NFS4_STATEID_SIZE);
+        COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
         return 0;
 }
 
@@ -3312,8 +3315,8 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
 	if (status)
 		return status;
-	READ_BUF(sizeof(res->stateid.data));
-	COPYMEM(res->stateid.data, sizeof(res->stateid.data));
+	READ_BUF(NFS4_STATEID_SIZE);
+	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 	return 0;
 }
 
@@ -3587,9 +3590,9 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	}
 	READ32(nfserr);
 	if (nfserr == NFS_OK) {
-		READ_BUF(8 + sizeof(clp->cl_confirm.data));
+		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		READ64(clp->cl_clientid);
-		COPYMEM(clp->cl_confirm.data, sizeof(clp->cl_confirm.data));
+		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4546,16 +4549,13 @@ nfs4_stat_to_errno(int stat)
 	return stat;
 }
 
-#ifndef MAX
-# define MAX(a, b)	(((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, argtype, restype)				\
 [NFSPROC4_CLNT_##proc] = {					\
 	.p_proc   = NFSPROC4_COMPOUND,				\
 	.p_encode = (kxdrproc_t) nfs4_xdr_##argtype,		\
 	.p_decode = (kxdrproc_t) nfs4_xdr_##restype,		\
-	.p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,	\
+	.p_arglen = NFS4_##argtype##_sz,			\
+	.p_replen = NFS4_##restype##_sz,			\
 	.p_statidx = NFSPROC4_CLNT_##proc,			\
 	.p_name   = #proc,					\
     }
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 75f819dc0255..49d1008ce1d7 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -428,7 +428,7 @@ static int __init root_nfs_getport(int program, int version, int proto)
 	printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n",
 		program, version, NIPQUAD(servaddr));
 	set_sockaddr(&sin, servaddr, 0);
-	return rpc_getport_external(&sin, program, version, proto);
+	return rpcb_getport_external(&sin, program, version, proto);
 }
 
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ca4b1d4ff42b..c5bb51a29e80 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -11,15 +11,15 @@
 
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/sched.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs3.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_page.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
-#include <linux/writeback.h>
 
-#define NFS_PARANOIA 1
+#include "internal.h"
 
 static struct kmem_cache *nfs_page_cachep;
 
@@ -50,9 +50,7 @@ nfs_page_free(struct nfs_page *p)
  * @count: number of bytes to read/write
  *
  * The page must be locked by the caller. This makes sure we never
- * create two different requests for the same page, and avoids
- * a possible deadlock when we reach the hard limit on the number
- * of dirty pages.
+ * create two different requests for the same page.
  * User should ensure it is safe to sleep in this function.
  */
 struct nfs_page *
@@ -63,16 +61,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_page		*req;
 
-	/* Deal with hard limits.  */
 	for (;;) {
 		/* try to allocate the request struct */
 		req = nfs_page_alloc();
 		if (req != NULL)
 			break;
 
-		/* Try to free up at least one request in order to stay
-		 * below the hard limit
-		 */
 		if (signalled() && (server->flags & NFS_MOUNT_INTR))
 			return ERR_PTR(-ERESTARTSYS);
 		yield();
@@ -172,11 +166,6 @@ nfs_release_request(struct nfs_page *req)
 	if (!atomic_dec_and_test(&req->wb_count))
 		return;
 
-#ifdef NFS_PARANOIA
-	BUG_ON (!list_empty(&req->wb_list));
-	BUG_ON (NFS_WBACK_BUSY(req));
-#endif
-
 	/* Release struct file or cached credential */
 	nfs_clear_request(req);
 	put_nfs_open_context(req->wb_context);
@@ -223,123 +212,170 @@ out:
 }
 
 /**
- * nfs_coalesce_requests - Split coalesced requests out from a list.
- * @head: source list
- * @dst: destination list
- * @nmax: maximum number of requests to coalesce
+ * nfs_pageio_init - initialise a page io descriptor
+ * @desc: pointer to descriptor
+ * @inode: pointer to inode
+ * @doio: pointer to io function
+ * @bsize: io block size
+ * @io_flags: extra parameters for the io function
+ */
+void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
+		     struct inode *inode,
+		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+		     size_t bsize,
+		     int io_flags)
+{
+	INIT_LIST_HEAD(&desc->pg_list);
+	desc->pg_bytes_written = 0;
+	desc->pg_count = 0;
+	desc->pg_bsize = bsize;
+	desc->pg_base = 0;
+	desc->pg_inode = inode;
+	desc->pg_doio = doio;
+	desc->pg_ioflags = io_flags;
+	desc->pg_error = 0;
+}
+
+/**
+ * nfs_can_coalesce_requests - test two requests for compatibility
+ * @prev: pointer to nfs_page
+ * @req: pointer to nfs_page
  *
- * Moves a maximum of 'nmax' elements from one list to another.
- * The elements are checked to ensure that they form a contiguous set
- * of pages, and that the RPC credentials are the same.
+ * The nfs_page structures 'prev' and 'req' are compared to ensure that the
+ * page data area they describe is contiguous, and that their RPC
+ * credentials, NFSv4 open state, and lockowners are the same.
+ *
+ * Return 'true' if this is the case, else return 'false'.
  */
-int
-nfs_coalesce_requests(struct list_head *head, struct list_head *dst,
-		      unsigned int nmax)
+static int nfs_can_coalesce_requests(struct nfs_page *prev,
+				     struct nfs_page *req)
 {
-	struct nfs_page		*req = NULL;
-	unsigned int		npages = 0;
-
-	while (!list_empty(head)) {
-		struct nfs_page	*prev = req;
-
-		req = nfs_list_entry(head->next);
-		if (prev) {
-			if (req->wb_context->cred != prev->wb_context->cred)
-				break;
-			if (req->wb_context->lockowner != prev->wb_context->lockowner)
-				break;
-			if (req->wb_context->state != prev->wb_context->state)
-				break;
-			if (req->wb_index != (prev->wb_index + 1))
-				break;
-
-			if (req->wb_pgbase != 0)
-				break;
-		}
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, dst);
-		npages++;
-		if (req->wb_pgbase + req->wb_bytes != PAGE_CACHE_SIZE)
-			break;
-		if (npages >= nmax)
-			break;
-	}
-	return npages;
+	if (req->wb_context->cred != prev->wb_context->cred)
+		return 0;
+	if (req->wb_context->lockowner != prev->wb_context->lockowner)
+		return 0;
+	if (req->wb_context->state != prev->wb_context->state)
+		return 0;
+	if (req->wb_index != (prev->wb_index + 1))
+		return 0;
+	if (req->wb_pgbase != 0)
+		return 0;
+	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+		return 0;
+	return 1;
 }
 
-#define NFS_SCAN_MAXENTRIES 16
 /**
- * nfs_scan_dirty - Scan the radix tree for dirty requests
- * @mapping: pointer to address space
- * @wbc: writeback_control structure
- * @dst: Destination list
+ * nfs_pageio_do_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
  *
- * Moves elements from one of the inode request lists.
- * If the number of requests is set to 0, the entire address_space
- * starting at index idx_start, is scanned.
- * The requests are *not* checked to ensure that they form a contiguous set.
- * You must be holding the inode's req_lock when calling this function
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
  */
-long nfs_scan_dirty(struct address_space *mapping,
-			struct writeback_control *wbc,
-			struct list_head *dst)
+static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+				     struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(mapping->host);
-	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
-	struct nfs_page *req;
-	pgoff_t idx_start, idx_end;
-	long res = 0;
-	int found, i;
+	size_t newlen = req->wb_bytes;
 
-	if (nfsi->ndirty == 0)
-		return 0;
-	if (wbc->range_cyclic) {
-		idx_start = 0;
-		idx_end = ULONG_MAX;
-	} else if (wbc->range_end == 0) {
-		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		idx_end = ULONG_MAX;
-	} else {
-		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
-	}
+	if (desc->pg_count != 0) {
+		struct nfs_page *prev;
 
-	for (;;) {
-		unsigned int toscan = NFS_SCAN_MAXENTRIES;
+		/*
+		 * FIXME: ideally we should be able to coalesce all requests
+		 * that are not block boundary aligned, but currently this
+		 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+		 * since nfs_flush_multi and nfs_pagein_multi assume you
+		 * can have only one struct nfs_page.
+		 */
+		if (desc->pg_bsize < PAGE_SIZE)
+			return 0;
+		newlen += desc->pg_count;
+		if (newlen > desc->pg_bsize)
+			return 0;
+		prev = nfs_list_entry(desc->pg_list.prev);
+		if (!nfs_can_coalesce_requests(prev, req))
+			return 0;
+	} else
+		desc->pg_base = req->wb_pgbase;
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &desc->pg_list);
+	desc->pg_count = newlen;
+	return 1;
+}
 
-		found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
-				(void **)&pgvec[0], idx_start, toscan,
-				NFS_PAGE_TAG_DIRTY);
+/*
+ * Helper for nfs_pageio_add_request and nfs_pageio_complete
+ */
+static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
+{
+	if (!list_empty(&desc->pg_list)) {
+		int error = desc->pg_doio(desc->pg_inode,
+					  &desc->pg_list,
+					  nfs_page_array_len(desc->pg_base,
+							     desc->pg_count),
+					  desc->pg_count,
+					  desc->pg_ioflags);
+		if (error < 0)
+			desc->pg_error = error;
+		else
+			desc->pg_bytes_written += desc->pg_count;
+	}
+	if (list_empty(&desc->pg_list)) {
+		desc->pg_count = 0;
+		desc->pg_base = 0;
+	}
+}
 
-		/* Did we make progress? */
-		if (found <= 0)
-			break;
+/**
+ * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
+ * @desc: destination io descriptor
+ * @req: request
+ *
+ * Returns true if the request 'req' was successfully coalesced into the
+ * existing list of pages 'desc'.
+ */
+int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+			   struct nfs_page *req)
+{
+	while (!nfs_pageio_do_add_request(desc, req)) {
+		nfs_pageio_doio(desc);
+		if (desc->pg_error < 0)
+			return 0;
+	}
+	return 1;
+}
 
-		for (i = 0; i < found; i++) {
-			req = pgvec[i];
-			if (!wbc->range_cyclic && req->wb_index > idx_end)
-				goto out;
+/**
+ * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ */
+void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
+{
+	nfs_pageio_doio(desc);
+}
 
-			/* Try to lock request and mark it for writeback */
-			if (!nfs_set_page_writeback_locked(req))
-				goto next;
-			radix_tree_tag_clear(&nfsi->nfs_page_tree,
-					req->wb_index, NFS_PAGE_TAG_DIRTY);
-			nfsi->ndirty--;
-			nfs_list_remove_request(req);
-			nfs_list_add_request(req, dst);
-			res++;
-			if (res == LONG_MAX)
-				goto out;
-next:
-			idx_start = req->wb_index + 1;
-		}
+/**
+ * nfs_pageio_cond_complete - Conditional I/O completion
+ * @desc: pointer to io descriptor
+ * @index: page index
+ *
+ * It is important to ensure that processes don't try to take locks
+ * on non-contiguous ranges of pages as that might deadlock. This
+ * function should be called before attempting to wait on a locked
+ * nfs_page. It will complete the I/O if the page index 'index'
+ * is not contiguous with the existing list of pages in 'desc'.
+ */
+void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
+{
+	if (!list_empty(&desc->pg_list)) {
+		struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
+		if (index != prev->wb_index + 1)
+			nfs_pageio_doio(desc);
 	}
-out:
-	WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
-	return res;
 }
 
+#define NFS_SCAN_MAXENTRIES 16
 /**
  * nfs_scan_list - Scan a list for matching requests
  * @nfsi: NFS inode
@@ -355,12 +391,12 @@ out:
  * You must be holding the inode's req_lock when calling this function
  */
 int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
-		struct list_head *dst, unsigned long idx_start,
+		struct list_head *dst, pgoff_t idx_start,
 		unsigned int npages)
 {
 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
 	struct nfs_page *req;
-	unsigned long idx_end;
+	pgoff_t idx_end;
 	int found, i;
 	int res;
 
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 1dcf56de9482..7be0ee2782cb 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -43,7 +43,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
-#include <linux/smp_lock.h>
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6ab4d5a9edf2..7bd7cb95c034 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -27,7 +27,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static int nfs_pagein_one(struct list_head *, struct inode *);
+static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
+static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
@@ -36,9 +37,8 @@ static mempool_t *nfs_rdata_mempool;
 
 #define MIN_POOL_READ	(32)
 
-struct nfs_read_data *nfs_readdata_alloc(size_t len)
+struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
 
 	if (p) {
@@ -79,7 +79,7 @@ void nfs_readdata_release(void *data)
 static
 int nfs_return_empty_page(struct page *page)
 {
-	memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE);
+	zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
 	SetPageUptodate(page);
 	unlock_page(page);
 	return 0;
@@ -103,10 +103,10 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 	pglen = PAGE_CACHE_SIZE - base;
 	for (;;) {
 		if (remainder <= pglen) {
-			memclear_highpage_flush(*pages, base, remainder);
+			zero_user_page(*pages, base, remainder, KM_USER0);
 			break;
 		}
-		memclear_highpage_flush(*pages, base, pglen);
+		zero_user_page(*pages, base, pglen, KM_USER0);
 		pages++;
 		remainder -= pglen;
 		pglen = PAGE_CACHE_SIZE;
@@ -130,10 +130,13 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		return PTR_ERR(new);
 	}
 	if (len < PAGE_CACHE_SIZE)
-		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
+		zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0);
 
 	nfs_list_add_request(new, &one_request);
-	nfs_pagein_one(&one_request, inode);
+	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
+		nfs_pagein_multi(inode, &one_request, 1, len, 0);
+	else
+		nfs_pagein_one(inode, &one_request, 1, len, 0);
 	return 0;
 }
 
@@ -230,7 +233,7 @@ static void nfs_execute_read(struct nfs_read_data *data)
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  */
-static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
+static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
 {
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
@@ -242,11 +245,11 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 
 	nfs_list_remove_request(req);
 
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		size_t len = min(nbytes,rsize);
 
-		data = nfs_readdata_alloc(len);
+		data = nfs_readdata_alloc(1);
 		if (!data)
 			goto out_bad;
 		INIT_LIST_HEAD(&data->pages);
@@ -258,23 +261,19 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 
 	ClearPageError(page);
 	offset = 0;
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		data = list_entry(list.next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
 		data->pagevec[0] = page;
 
-		if (nbytes > rsize) {
-			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
-					rsize, offset);
-			offset += rsize;
-			nbytes -= rsize;
-		} else {
-			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
-					nbytes, offset);
-			nbytes = 0;
-		}
+		if (nbytes < rsize)
+			rsize = nbytes;
+		nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
+				  rsize, offset);
+		offset += rsize;
+		nbytes -= rsize;
 		nfs_execute_read(data);
 	} while (nbytes != 0);
 
@@ -291,30 +290,24 @@ out_bad:
 	return -ENOMEM;
 }
 
-static int nfs_pagein_one(struct list_head *head, struct inode *inode)
+static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_read_data	*data;
-	unsigned int		count;
 
-	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
-		return nfs_pagein_multi(head, inode);
-
-	data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
+	data = nfs_readdata_alloc(npages);
 	if (!data)
 		goto out_bad;
 
 	INIT_LIST_HEAD(&data->pages);
 	pages = data->pagevec;
-	count = 0;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
 		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
-		count += req->wb_bytes;
 	}
 	req = nfs_list_entry(data->pages.next);
 
@@ -327,28 +320,6 @@ out_bad:
 	return -ENOMEM;
 }
 
-static int
-nfs_pagein_list(struct list_head *head, int rpages)
-{
-	LIST_HEAD(one_request);
-	struct nfs_page		*req;
-	int			error = 0;
-	unsigned int		pages = 0;
-
-	while (!list_empty(head)) {
-		pages += nfs_coalesce_requests(head, &one_request, rpages);
-		req = nfs_list_entry(one_request.next);
-		error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
-		if (error < 0)
-			break;
-	}
-	if (error >= 0)
-		return pages;
-
-	nfs_async_read_error(head);
-	return error;
-}
-
 /*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
@@ -538,7 +509,7 @@ out_error:
 }
 
 struct nfs_readdesc {
-	struct list_head *head;
+	struct nfs_pageio_descriptor *pgio;
 	struct nfs_open_context *ctx;
 };
 
@@ -561,20 +532,22 @@ readpage_async_filler(void *data, struct page *page)
 			return PTR_ERR(new);
 	}
 	if (len < PAGE_CACHE_SIZE)
-		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
-	nfs_list_add_request(new, desc->head);
+		zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0);
+	nfs_pageio_add_request(desc->pgio, new);
 	return 0;
 }
 
 int nfs_readpages(struct file *filp, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	LIST_HEAD(head);
+	struct nfs_pageio_descriptor pgio;
 	struct nfs_readdesc desc = {
-		.head		= &head,
+		.pgio = &pgio,
 	};
 	struct inode *inode = mapping->host;
 	struct nfs_server *server = NFS_SERVER(inode);
+	size_t rsize = server->rsize;
+	unsigned long npages;
 	int ret = -ESTALE;
 
 	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
@@ -593,13 +566,16 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	} else
 		desc.ctx = get_nfs_open_context((struct nfs_open_context *)
 				filp->private_data);
+	if (rsize < PAGE_CACHE_SIZE)
+		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
+	else
+		nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
+
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-	if (!list_empty(&head)) {
-		int err = nfs_pagein_list(&head, server->rpages);
-		if (!ret)
-			nfs_add_stats(inode, NFSIOS_READPAGES, err);
-			ret = err;
-	}
+
+	nfs_pageio_complete(&pgio);
+	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 	put_nfs_open_context(desc.ctx);
 out:
 	return ret;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1eae44b9a1a..ca20d3cc2609 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -204,9 +204,9 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	lock_kernel();
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
-	buf->f_type = NFS_SUPER_MAGIC;
 	if (error < 0)
 		goto out_err;
+	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
 	 * Current versions of glibc do not correctly handle the
@@ -233,15 +233,14 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_ffree = res.afiles;
 
 	buf->f_namelen = server->namelen;
- out:
+
 	unlock_kernel();
 	return 0;
 
  out_err:
 	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
-	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
-	goto out;
-
+	unlock_kernel();
+	return error;
 }
 
 /*
@@ -291,6 +290,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
 		{ NFS_MOUNT_NOACL, ",noacl", "" },
+		{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
 		{ 0, NULL, NULL }
 	};
 	const struct proc_nfs_info *nfs_infop;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index f4a0548b9ce8..83e865a16ad1 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/namei.h>
 
 /* Symlink caching in the page cache is even more simplistic
@@ -61,15 +60,9 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		err = page;
 		goto read_failed;
 	}
-	if (!PageUptodate(page)) {
-		err = ERR_PTR(-EIO);
-		goto getlink_read_error;
-	}
 	nd_set_link(nd, kmap(page));
 	return page;
 
-getlink_read_error:
-	page_cache_release(page);
 read_failed:
 	nd_set_link(nd, err);
 	return NULL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 797558941745..af344a158e01 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -21,7 +21,6 @@
 #include <linux/backing-dev.h>
 
 #include <asm/uaccess.h>
-#include <linux/smp_lock.h>
 
 #include "delegation.h"
 #include "internal.h"
@@ -38,7 +37,8 @@
 static struct nfs_page * nfs_update_request(struct nfs_open_context*,
 					    struct page *,
 					    unsigned int, unsigned int);
-static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
+static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
+				  struct inode *inode, int ioflags);
 static const struct rpc_call_ops nfs_write_partial_ops;
 static const struct rpc_call_ops nfs_write_full_ops;
 static const struct rpc_call_ops nfs_commit_ops;
@@ -58,7 +58,7 @@ struct nfs_write_data *nfs_commit_alloc(void)
 	return p;
 }
 
-void nfs_commit_rcu_free(struct rcu_head *head)
+static void nfs_commit_rcu_free(struct rcu_head *head)
 {
 	struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu);
 	if (p && (p->pagevec != &p->page_array[0]))
@@ -71,9 +71,8 @@ void nfs_commit_free(struct nfs_write_data *wdata)
 	call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free);
 }
 
-struct nfs_write_data *nfs_writedata_alloc(size_t len)
+struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
-	unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
 
 	if (p) {
@@ -139,7 +138,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
 {
 	struct inode *inode = page->mapping->host;
 	loff_t end, i_size = i_size_read(inode);
-	unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
 
 	if (i_size > 0 && page->index < end_index)
 		return;
@@ -169,7 +168,7 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
 	if (count != nfs_page_length(page))
 		return;
 	if (count != PAGE_CACHE_SIZE)
-		memclear_highpage_flush(page, count, PAGE_CACHE_SIZE - count);
+		zero_user_page(page, count, PAGE_CACHE_SIZE - count, KM_USER0);
 	SetPageUptodate(page);
 }
 
@@ -201,7 +200,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 static int wb_priority(struct writeback_control *wbc)
 {
 	if (wbc->for_reclaim)
-		return FLUSH_HIGHPRI;
+		return FLUSH_HIGHPRI | FLUSH_STABLE;
 	if (wbc->for_kupdate)
 		return FLUSH_LOWPRI;
 	return 0;
@@ -225,7 +224,7 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
-		if (atomic_inc_return(&nfss->writeback) >
+		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH)
 			set_bdi_congested(&nfss->backing_dev_info, WRITE);
 	}
@@ -238,7 +237,7 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
+	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
 		clear_bdi_congested(&nfss->backing_dev_info, WRITE);
 		congestion_end(WRITE);
 	}
@@ -251,7 +250,8 @@ static void nfs_end_page_writeback(struct page *page)
  * was not tagged.
  * May also return an error if the user signalled nfs_wait_on_request().
  */
-static int nfs_page_mark_flush(struct page *page)
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+				struct page *page)
 {
 	struct nfs_page *req;
 	struct nfs_inode *nfsi = NFS_I(page->mapping->host);
@@ -283,21 +283,18 @@ static int nfs_page_mark_flush(struct page *page)
 		/* This request is marked for commit */
 		spin_unlock(req_lock);
 		nfs_unlock_request(req);
+		nfs_pageio_complete(pgio);
 		return 1;
 	}
-	if (nfs_set_page_writeback(page) == 0) {
-		nfs_list_remove_request(req);
-		/* add the request to the inode's dirty list. */
-		radix_tree_tag_set(&nfsi->nfs_page_tree,
-				req->wb_index, NFS_PAGE_TAG_DIRTY);
-		nfs_list_add_request(req, &nfsi->dirty);
-		nfsi->ndirty++;
-		spin_unlock(req_lock);
-		__mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES);
-	} else
+	if (nfs_set_page_writeback(page) != 0) {
 		spin_unlock(req_lock);
+		BUG();
+	}
+	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
+			NFS_PAGE_TAG_WRITEBACK);
 	ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
-	nfs_unlock_request(req);
+	spin_unlock(req_lock);
+	nfs_pageio_add_request(pgio, req);
 	return ret;
 }
 
@@ -306,6 +303,7 @@ static int nfs_page_mark_flush(struct page *page)
  */
 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
 {
+	struct nfs_pageio_descriptor mypgio, *pgio;
 	struct nfs_open_context *ctx;
 	struct inode *inode = page->mapping->host;
 	unsigned offset;
@@ -314,7 +312,16 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-	err = nfs_page_mark_flush(page);
+	if (wbc->for_writepages)
+		pgio = wbc->fs_private;
+	else {
+		nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
+		pgio = &mypgio;
+	}
+
+	nfs_pageio_cond_complete(pgio, page->index);
+
+	err = nfs_page_async_flush(pgio, page);
 	if (err <= 0)
 		goto out;
 	err = 0;
@@ -322,6 +329,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	if (!offset)
 		goto out;
 
+	nfs_pageio_cond_complete(pgio, page->index);
+
 	ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE);
 	if (ctx == NULL) {
 		err = -EBADF;
@@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	put_nfs_open_context(ctx);
 	if (err != 0)
 		goto out;
-	err = nfs_page_mark_flush(page);
+	err = nfs_page_async_flush(pgio, page);
 	if (err > 0)
 		err = 0;
 out:
 	if (!wbc->for_writepages)
-		nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc));
+		nfs_pageio_complete(pgio);
 	return err;
 }
 
@@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
+	struct nfs_pageio_descriptor pgio;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
+	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
+	wbc->fs_private = &pgio;
 	err = generic_writepages(mapping, wbc);
+	nfs_pageio_complete(&pgio);
 	if (err)
 		return err;
-	err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
-	if (err < 0)
-		goto out;
-	nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
-	err = 0;
-out:
-	return err;
+	if (pgio.pg_error)
+		return pgio.pg_error;
+	return 0;
 }
 
 /*
@@ -503,11 +512,11 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
  *
  * Interruptible by signals only if mounted with intr flag.
  */
-static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_start, unsigned int npages)
+static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
-	unsigned long		idx_end, next;
+	pgoff_t idx_end, next;
 	unsigned int		res = 0;
 	int			error;
 
@@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
 	return res;
 }
 
-static void nfs_cancel_dirty_list(struct list_head *head)
-{
-	struct nfs_page *req;
-	while(!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_end_page_writeback(req->wb_page);
-		nfs_inode_remove_request(req);
-		nfs_clear_page_writeback(req);
-	}
-}
-
 static void nfs_cancel_commit_list(struct list_head *head)
 {
 	struct nfs_page *req;
@@ -574,7 +571,7 @@ static void nfs_cancel_commit_list(struct list_head *head)
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int res = 0;
@@ -588,40 +585,12 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	return res;
 }
 #else
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	return 0;
 }
 #endif
 
-static int nfs_wait_on_write_congestion(struct address_space *mapping)
-{
-	struct inode *inode = mapping->host;
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	int ret = 0;
-
-	might_sleep();
-
-	if (!bdi_write_congested(bdi))
-		return 0;
-
-	nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
-
-	do {
-		struct rpc_clnt *clnt = NFS_CLIENT(inode);
-		sigset_t oldset;
-
-		rpc_clnt_sigmask(clnt, &oldset);
-		ret = congestion_wait_interruptible(WRITE, HZ/10);
-		rpc_clnt_sigunmask(clnt, &oldset);
-		if (ret == -ERESTARTSYS)
-			break;
-		ret = 0;
-	} while (bdi_write_congested(bdi));
-
-	return ret;
-}
-
 /*
  * Try to update any existing write request, or create one if there is none.
  * In order to match, the request's credentials must match those of
@@ -636,12 +605,10 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page		*req, *new = NULL;
-	unsigned long		rqend, end;
+	pgoff_t		rqend, end;
 
 	end = offset + bytes;
 
-	if (nfs_wait_on_write_congestion(mapping))
-		return ERR_PTR(-ERESTARTSYS);
 	for (;;) {
 		/* Loop over all inode entries and see if we find
 		 * A request for the page we wish to update
@@ -865,7 +832,7 @@ static void nfs_execute_write(struct nfs_write_data *data)
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  */
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
+static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
 {
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
@@ -877,11 +844,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
 
 	nfs_list_remove_request(req);
 
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		size_t len = min(nbytes, wsize);
 
-		data = nfs_writedata_alloc(len);
+		data = nfs_writedata_alloc(1);
 		if (!data)
 			goto out_bad;
 		list_add(&data->pages, &list);
@@ -892,23 +859,19 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
 
 	ClearPageError(page);
 	offset = 0;
-	nbytes = req->wb_bytes;
+	nbytes = count;
 	do {
 		data = list_entry(list.next, struct nfs_write_data, pages);
 		list_del_init(&data->pages);
 
 		data->pagevec[0] = page;
 
-		if (nbytes > wsize) {
-			nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-					wsize, offset, how);
-			offset += wsize;
-			nbytes -= wsize;
-		} else {
-			nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-					nbytes, offset, how);
-			nbytes = 0;
-		}
+		if (nbytes < wsize)
+			wsize = nbytes;
+		nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
+				   wsize, offset, how);
+		offset += wsize;
+		nbytes -= wsize;
 		nfs_execute_write(data);
 	} while (nbytes != 0);
 
@@ -934,26 +897,23 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  */
-static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
+static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_write_data	*data;
-	unsigned int		count;
 
-	data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize);
+	data = nfs_writedata_alloc(npages);
 	if (!data)
 		goto out_bad;
 
 	pages = data->pagevec;
-	count = 0;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
 		ClearPageError(req->wb_page);
 		*pages++ = req->wb_page;
-		count += req->wb_bytes;
 	}
 	req = nfs_list_entry(data->pages.next);
 
@@ -964,7 +924,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
 	return 0;
  out_bad:
 	while (!list_empty(head)) {
-		struct nfs_page *req = nfs_list_entry(head->next);
+		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_redirty_request(req);
 		nfs_end_page_writeback(req->wb_page);
@@ -973,40 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
 	return -ENOMEM;
 }
 
-static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how)
+static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+				  struct inode *inode, int ioflags)
 {
-	LIST_HEAD(one_request);
-	int (*flush_one)(struct inode *, struct list_head *, int);
-	struct nfs_page	*req;
-	int wpages = NFS_SERVER(inode)->wpages;
 	int wsize = NFS_SERVER(inode)->wsize;
-	int error;
 
-	flush_one = nfs_flush_one;
 	if (wsize < PAGE_CACHE_SIZE)
-		flush_one = nfs_flush_multi;
-	/* For single writes, FLUSH_STABLE is more efficient */
-	if (npages <= wpages && npages == NFS_I(inode)->npages
-			&& nfs_list_entry(head->next)->wb_bytes <= wsize)
-		how |= FLUSH_STABLE;
-
-	do {
-		nfs_coalesce_requests(head, &one_request, wpages);
-		req = nfs_list_entry(one_request.next);
-		error = flush_one(inode, &one_request, how);
-		if (error < 0)
-			goto out_err;
-	} while (!list_empty(head));
-	return 0;
-out_err:
-	while (!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_redirty_request(req);
-		nfs_end_page_writeback(req->wb_page);
-		nfs_clear_page_writeback(req);
-	}
-	return error;
+		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
+	else
+		nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
 }
 
 /*
@@ -1330,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
-#else
-static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-{
-	return 0;
-}
-#endif
-
-static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
-{
-	struct nfs_inode *nfsi = NFS_I(mapping->host);
-	LIST_HEAD(head);
-	long res;
 
-	spin_lock(&nfsi->req_lock);
-	res = nfs_scan_dirty(mapping, wbc, &head);
-	spin_unlock(&nfsi->req_lock);
-	if (res) {
-		int error = nfs_flush_list(mapping->host, &head, res, how);
-		if (error < 0)
-			return error;
-	}
-	return res;
-}
-
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 int nfs_commit_inode(struct inode *inode, int how)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -1371,13 +1282,18 @@ int nfs_commit_inode(struct inode *inode, int how)
 	}
 	return res;
 }
+#else
+static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+{
+	return 0;
+}
 #endif
 
 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
 	struct inode *inode = mapping->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	unsigned long idx_start, idx_end;
+	pgoff_t idx_start, idx_end;
 	unsigned int npages = 0;
 	LIST_HEAD(head);
 	int nocommit = how & FLUSH_NOCOMMIT;
@@ -1390,41 +1306,24 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
 		idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		if (idx_end > idx_start) {
-			unsigned long l_npages = 1 + idx_end - idx_start;
+			pgoff_t l_npages = 1 + idx_end - idx_start;
 			npages = l_npages;
 			if (sizeof(npages) != sizeof(l_npages) &&
-					(unsigned long)npages != l_npages)
+					(pgoff_t)npages != l_npages)
 				npages = 0;
 		}
 	}
 	how &= ~FLUSH_NOCOMMIT;
 	spin_lock(&nfsi->req_lock);
 	do {
-		wbc->pages_skipped = 0;
 		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
 		if (ret != 0)
 			continue;
-		pages = nfs_scan_dirty(mapping, wbc, &head);
-		if (pages != 0) {
-			spin_unlock(&nfsi->req_lock);
-			if (how & FLUSH_INVALIDATE) {
-				nfs_cancel_dirty_list(&head);
-				ret = pages;
-			} else
-				ret = nfs_flush_list(inode, &head, pages, how);
-			spin_lock(&nfsi->req_lock);
-			continue;
-		}
-		if (wbc->pages_skipped != 0)
-			continue;
 		if (nocommit)
 			break;
 		pages = nfs_scan_commit(inode, &head, idx_start, npages);
-		if (pages == 0) {
-			if (wbc->pages_skipped != 0)
-				continue;
+		if (pages == 0)
 			break;
-		}
 		if (how & FLUSH_INVALIDATE) {
 			spin_unlock(&nfsi->req_lock);
 			nfs_cancel_commit_list(&head);
@@ -1456,7 +1355,7 @@ int nfs_wb_all(struct inode *inode)
 	};
 	int ret;
 
-	ret = generic_writepages(mapping, &wbc);
+	ret = nfs_writepages(mapping, &wbc);
 	if (ret < 0)
 		goto out;
 	ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
@@ -1479,11 +1378,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
 	};
 	int ret;
 
-	if (!(how & FLUSH_NOWRITEPAGE)) {
-		ret = generic_writepages(mapping, &wbc);
-		if (ret < 0)
-			goto out;
-	}
+	ret = nfs_writepages(mapping, &wbc);
+	if (ret < 0)
+		goto out;
 	ret = nfs_sync_mapping_wait(mapping, &wbc, how);
 	if (ret >= 0)
 		return 0;
@@ -1506,7 +1403,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
 	int ret;
 
 	BUG_ON(!PageLocked(page));
-	if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		ret = nfs_writepage_locked(page, &wbc);
 		if (ret < 0)
 			goto out;
@@ -1531,10 +1428,18 @@ int nfs_wb_page(struct inode *inode, struct page* page)
 
 int nfs_set_page_dirty(struct page *page)
 {
-	spinlock_t *req_lock = &NFS_I(page->mapping->host)->req_lock;
+	struct address_space *mapping = page->mapping;
+	struct inode *inode;
+	spinlock_t *req_lock;
 	struct nfs_page *req;
 	int ret;
 
+	if (!mapping)
+		goto out_raced;
+	inode = mapping->host;
+	if (!inode)
+		goto out_raced;
+	req_lock = &NFS_I(inode)->req_lock;
 	spin_lock(req_lock);
 	req = nfs_page_find_request_locked(page);
 	if (req != NULL) {
@@ -1547,6 +1452,8 @@ int nfs_set_page_dirty(struct page *page)
 	ret = __set_page_dirty_nobuffers(page);
 	spin_unlock(req_lock);
 	return ret;
+out_raced:
+	return !TestSetPageDirty(page);
 }
 
 
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index ce341dc76d5e..9b118ee20193 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -11,4 +11,3 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
-nfsd-objs		:= $(nfsd-y)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 6f24768272a1..79bd03b8bbf8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -469,6 +469,13 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	nd.dentry = NULL;
 	exp.ex_path = NULL;
 
+	/* fs locations */
+	exp.ex_fslocs.locations = NULL;
+	exp.ex_fslocs.locations_count = 0;
+	exp.ex_fslocs.migrated = 0;
+
+	exp.ex_uuid = NULL;
+
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
 	mesg[mlen-1] = 0;
@@ -509,13 +516,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	if (exp.h.expiry_time == 0)
 		goto out;
 
-	/* fs locations */
-	exp.ex_fslocs.locations = NULL;
-	exp.ex_fslocs.locations_count = 0;
-	exp.ex_fslocs.migrated = 0;
-
-	exp.ex_uuid = NULL;
-
 	/* flags */
 	err = get_int(&mesg, &an_int);
 	if (err == -ENOENT)
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7f5bad0393b1..eac82830bfd7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -177,7 +177,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	if (max_blocksize < resp->count)
 		resp->count = max_blocksize;
 
-	svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+	svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_read(rqstp, &resp->fh, NULL,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7e4bb0af24d7..10f6e7dcf633 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -239,7 +239,7 @@ static __be32 *
 encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
-	if (dentry && dentry->d_inode != NULL) {
+	if (dentry && dentry->d_inode) {
 	        int err;
 		struct kstat stat;
 
@@ -300,9 +300,9 @@ int
 nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_sattrargs *args)
 {
-	if (!(p = decode_fh(p, &args->fh))
-	 || !(p = decode_sattr3(p, &args->attrs)))
+	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
+	p = decode_sattr3(p, &args->attrs);
 
 	if ((args->check_guard = ntohl(*p++)) != 0) { 
 		struct timespec time; 
@@ -343,9 +343,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 	int v,pn;
 	u32 max_blocksize = svc_max_payload(rqstp);
 
-	if (!(p = decode_fh(p, &args->fh))
-	 || !(p = xdr_decode_hyper(p, &args->offset)))
+	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
 
 	len = args->count = ntohl(*p++);
 
@@ -369,28 +369,44 @@ int
 nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeargs *args)
 {
-	unsigned int len, v, hdr;
+	unsigned int len, v, hdr, dlen;
 	u32 max_blocksize = svc_max_payload(rqstp);
 
-	if (!(p = decode_fh(p, &args->fh))
-	 || !(p = xdr_decode_hyper(p, &args->offset)))
+	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
+	p = xdr_decode_hyper(p, &args->offset);
 
 	args->count = ntohl(*p++);
 	args->stable = ntohl(*p++);
 	len = args->len = ntohl(*p++);
+	/*
+	 * The count must equal the amount of data passed.
+	 */
+	if (args->count != args->len)
+		return 0;
 
+	/*
+	 * Check to make sure that we got the right number of
+	 * bytes.
+	 */
 	hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
-	if (rqstp->rq_arg.len < hdr ||
-	    rqstp->rq_arg.len - hdr < len)
+	dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
+		- hdr;
+	/*
+	 * Round the length of the data which was specified up to
+	 * the next multiple of XDR units and then compare that
+	 * against the length which was actually received.
+	 */
+	if (dlen != XDR_QUADLEN(len)*4)
 		return 0;
 
+	if (args->count > max_blocksize) {
+		args->count = max_blocksize;
+		len = args->len = max_blocksize;
+	}
 	rqstp->rq_vec[0].iov_base = (void*)p;
 	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
-
-	if (len > max_blocksize)
-		len = max_blocksize;
-	v=  0;
+	v = 0;
 	while (len > rqstp->rq_vec[v].iov_len) {
 		len -= rqstp->rq_vec[v].iov_len;
 		v++;
@@ -398,9 +414,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
 	rqstp->rq_vec[v].iov_len = len;
-	args->vlen = v+1;
-
-	return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
+	args->vlen = v + 1;
+	return 1;
 }
 
 int
@@ -414,8 +429,7 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 	switch (args->createmode = ntohl(*p++)) {
 	case NFS3_CREATE_UNCHECKED:
 	case NFS3_CREATE_GUARDED:
-		if (!(p = decode_sattr3(p, &args->attrs)))
-			return 0;
+		p = decode_sattr3(p, &args->attrs);
 		break;
 	case NFS3_CREATE_EXCLUSIVE:
 		args->verf = p;
@@ -431,10 +445,10 @@ int
 nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
-	if (!(p = decode_fh(p, &args->fh))
-	 || !(p = decode_filename(p, &args->name, &args->len))
-	 || !(p = decode_sattr3(p, &args->attrs)))
+	if (!(p = decode_fh(p, &args->fh)) ||
+	    !(p = decode_filename(p, &args->name, &args->len)))
 		return 0;
+	p = decode_sattr3(p, &args->attrs);
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -448,11 +462,12 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 	char *old, *new;
 	struct kvec *vec;
 
-	if (!(p = decode_fh(p, &args->ffh))
-	 || !(p = decode_filename(p, &args->fname, &args->flen))
-	 || !(p = decode_sattr3(p, &args->attrs))
+	if (!(p = decode_fh(p, &args->ffh)) ||
+	    !(p = decode_filename(p, &args->fname, &args->flen))
 		)
 		return 0;
+	p = decode_sattr3(p, &args->attrs);
+
 	/* now decode the pathname, which might be larger than the first page.
 	 * As we have to check for nul's anyway, we copy it into a new page
 	 * This page appears in the rq_res.pages list, but as pages_len is always
@@ -502,10 +517,8 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
 	args->ftype = ntohl(*p++);
 
 	if (args->ftype == NF3BLK  || args->ftype == NF3CHR
-	 || args->ftype == NF3SOCK || args->ftype == NF3FIFO) {
-		if (!(p = decode_sattr3(p, &args->attrs)))
-			return 0;
-	}
+	 || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
+		p = decode_sattr3(p, &args->attrs);
 
 	if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
 		args->major = ntohl(*p++);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 673a53c014a3..cc3b7badd486 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -137,7 +137,6 @@ struct ace_container {
 static short ace2type(struct nfs4_ace *);
 static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
 				unsigned int);
-void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
 
 struct nfs4_acl *
 nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
@@ -785,21 +784,6 @@ nfs4_acl_new(int n)
 	return acl;
 }
 
-void
-nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
-		int whotype, uid_t who)
-{
-	struct nfs4_ace *ace = acl->aces + acl->naces;
-
-	ace->type = type;
-	ace->flag = flag;
-	ace->access_mask = access_mask;
-	ace->whotype = whotype;
-	ace->who = who;
-
-	acl->naces++;
-}
-
 static struct {
 	char *string;
 	int   stringlen;
@@ -851,6 +835,5 @@ nfs4_acl_write_who(int who, char *p)
 }
 
 EXPORT_SYMBOL(nfs4_acl_new);
-EXPORT_SYMBOL(nfs4_acl_add_ace);
 EXPORT_SYMBOL(nfs4_acl_get_whotype);
 EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index fb14d68eacab..864090edc28b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -38,6 +38,7 @@
 #include <linux/inet.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
@@ -315,16 +316,13 @@ out:
 /*
  * RPC procedure tables
  */
-#ifndef MAX
-# define MAX(a, b)      (((a) > (b))? (a) : (b))
-#endif
-
 #define PROC(proc, call, argtype, restype)                              \
 [NFSPROC4_CLNT_##proc] = {                                      	\
         .p_proc   = NFSPROC4_CB_##call,					\
         .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,                    \
         .p_decode = (kxdrproc_t) nfs4_xdr_##restype,                    \
-        .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,  \
+        .p_arglen = NFS4_##argtype##_sz,                                \
+        .p_replen = NFS4_##restype##_sz,                                \
         .p_statidx = NFSPROC4_CB_##call,				\
 	.p_name   = #proc,                                              \
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index e4a83d727afd..45aa21ce6784 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -46,7 +46,6 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
-#include <linux/smp_lock.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/nfsd_idmap.h>
 #include <linux/list.h>
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c7774e3a9469..ebd03cc07479 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,7 +45,7 @@
 #include <asm/uaccess.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
-
+#include <linux/sched.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index af360705e551..3cc8ce422ab1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -50,6 +50,7 @@
 #include <linux/nfsd/xdr4.h>
 #include <linux/namei.h>
 #include <linux/mutex.h>
+#include <linux/lockd/bind.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -1325,8 +1326,6 @@ do_recall(void *__dp)
 {
 	struct nfs4_delegation *dp = __dp;
 
-	daemonize("nfsv4-recall");
-
 	nfsd4_cb_recall(dp);
 	return 0;
 }
@@ -2657,6 +2656,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file_lock conflock;
 	__be32 status = 0;
 	unsigned int strhashval;
+	unsigned int cmd;
 	int err;
 
 	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -2739,10 +2739,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
 			file_lock.fl_type = F_RDLCK;
+			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
 			file_lock.fl_type = F_WRLCK;
+			cmd = F_SETLK;
 		break;
 		default:
 			status = nfserr_inval;
@@ -2769,9 +2771,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	/* XXX?: Just to divert the locks_release_private at the start of
 	 * locks_copy_lock: */
-	conflock.fl_ops = NULL;
-	conflock.fl_lmops = NULL;
-	err = posix_lock_file_conf(filp, &file_lock, &conflock);
+	locks_init_lock(&conflock);
+	err = vfs_lock_file(filp, cmd, &file_lock, &conflock);
 	switch (-err) {
 	case 0: /* success! */
 		update_stateid(&lock_stp->st_stateid);
@@ -2788,7 +2789,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_deadlock;
 		break;
 	default:        
-		dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",err);
+		dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err);
 		status = nfserr_resource;
 		break;
 	}
@@ -2813,7 +2814,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct inode *inode;
 	struct file file;
 	struct file_lock file_lock;
-	struct file_lock conflock;
+	int error;
 	__be32 status;
 
 	if (nfs4_in_grace())
@@ -2869,18 +2870,23 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_transform_lock_offset(&file_lock);
 
-	/* posix_test_lock uses the struct file _only_ to resolve the inode.
+	/* vfs_test_lock uses the struct file _only_ to resolve the inode.
 	 * since LOCKT doesn't require an OPEN, and therefore a struct
-	 * file may not exist, pass posix_test_lock a struct file with
+	 * file may not exist, pass vfs_test_lock a struct file with
 	 * only the dentry:inode set.
 	 */
 	memset(&file, 0, sizeof (struct file));
 	file.f_path.dentry = cstate->current_fh.fh_dentry;
 
 	status = nfs_ok;
-	if (posix_test_lock(&file, &file_lock, &conflock)) {
+	error = vfs_test_lock(&file, &file_lock);
+	if (error) {
+		status = nfserrno(error);
+		goto out;
+	}
+	if (file_lock.fl_type != F_UNLCK) {
 		status = nfserr_denied;
-		nfs4_set_lock_denied(&conflock, &lockt->lt_denied);
+		nfs4_set_lock_denied(&file_lock, &lockt->lt_denied);
 	}
 out:
 	nfs4_unlock_state();
@@ -2933,9 +2939,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/*
 	*  Try to unlock the file in the VFS.
 	*/
-	err = posix_lock_file(filp, &file_lock);
+	err = vfs_lock_file(filp, F_SETLK, &file_lock, NULL);
 	if (err) {
-		dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
+		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
 	/*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5d090f11f2be..15809dfd88a5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,7 +44,6 @@
 
 #include <linux/param.h>
 #include <linux/smp.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/vfs.h>
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8d995bcef806..6ca2d24fc216 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/unistd.h>
 #include <linux/string.h>
@@ -324,7 +323,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 	 *
 	 */
 
-	u8 version = 1;
+	u8 version;
 	u8 fsid_type = 0;
 	struct inode * inode = dentry->d_inode;
 	struct dentry *parent = dentry->d_parent;
@@ -342,15 +341,59 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 	 * the reference filehandle (if it is in the same export)
 	 * or the export options.
 	 */
+ retry:
+	version = 1;
 	if (ref_fh && ref_fh->fh_export == exp) {
 		version = ref_fh->fh_handle.fh_version;
-		if (version == 0xca)
+		fsid_type = ref_fh->fh_handle.fh_fsid_type;
+
+		if (ref_fh == fhp)
+			fh_put(ref_fh);
+		ref_fh = NULL;
+
+		switch (version) {
+		case 0xca:
 			fsid_type = FSID_DEV;
-		else
-			fsid_type = ref_fh->fh_handle.fh_fsid_type;
-		/* We know this version/type works for this export
-		 * so there is no need for further checks.
+			break;
+		case 1:
+			break;
+		default:
+			goto retry;
+		}
+
+		/* Need to check that this type works for this
+		 * export point.  As the fsid -> filesystem mapping
+		 * was guided by user-space, there is no guarantee
+		 * that the filesystem actually supports that fsid
+		 * type. If it doesn't we loop around again without
+		 * ref_fh set.
 		 */
+		switch(fsid_type) {
+		case FSID_DEV:
+			if (!old_valid_dev(ex_dev))
+				goto retry;
+			/* FALL THROUGH */
+		case FSID_MAJOR_MINOR:
+		case FSID_ENCODE_DEV:
+			if (!(exp->ex_dentry->d_inode->i_sb->s_type->fs_flags
+			      & FS_REQUIRES_DEV))
+				goto retry;
+			break;
+		case FSID_NUM:
+			if (! (exp->ex_flags & NFSEXP_FSID))
+				goto retry;
+			break;
+		case FSID_UUID8:
+		case FSID_UUID16:
+			if (!root_export)
+				goto retry;
+			/* fall through */
+		case FSID_UUID4_INUM:
+		case FSID_UUID16_INUM:
+			if (exp->ex_uuid == NULL)
+				goto retry;
+			break;
+		}
 	} else if (exp->ex_uuid) {
 		if (fhp->fh_maxsize >= 64) {
 			if (root_export)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cc2eec981b8..b2c7147aa921 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -155,7 +155,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				argp->count);
 		argp->count = NFSSVC_MAXBLKSIZE_V2;
 	}
-	svc_reserve(rqstp, (19<<2) + argp->count + 4);
+	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d7759ce6ed94..ff55950efb43 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -9,7 +9,7 @@
  */
 
 #include <linux/module.h>
-
+#include <linux/sched.h>
 #include <linux/time.h>
 #include <linux/errno.h>
 #include <linux/nfs.h>
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 0c24b9e24fe8..cb3e7fadb772 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -231,9 +231,10 @@ int
 nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_sattrargs *args)
 {
-	if (!(p = decode_fh(p, &args->fh))
-	 || !(p = decode_sattr(p, &args->attrs)))
+	p = decode_fh(p, &args->fh);
+	if (!p)
 		return 0;
+	p = decode_sattr(p, &args->attrs);
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -284,8 +285,9 @@ int
 nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_writeargs *args)
 {
-	unsigned int len;
+	unsigned int len, hdr, dlen;
 	int v;
+
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
 
@@ -293,11 +295,30 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 	args->offset = ntohl(*p++);	/* offset */
 	p++;				/* totalcount */
 	len = args->len = ntohl(*p++);
-	rqstp->rq_vec[0].iov_base = (void*)p;
-	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
-				(((void*)p) - rqstp->rq_arg.head[0].iov_base);
+	/*
+	 * The protocol specifies a maximum of 8192 bytes.
+	 */
 	if (len > NFSSVC_MAXBLKSIZE_V2)
-		len = NFSSVC_MAXBLKSIZE_V2;
+		return 0;
+
+	/*
+	 * Check to make sure that we got the right number of
+	 * bytes.
+	 */
+	hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
+	dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
+		- hdr;
+
+	/*
+	 * Round the length of the data which was specified up to
+	 * the next multiple of XDR units and then compare that
+	 * against the length which was actually received.
+	 */
+	if (dlen != XDR_QUADLEN(len)*4)
+		return 0;
+
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
 	v = 0;
 	while (len > rqstp->rq_vec[v].iov_len) {
 		len -= rqstp->rq_vec[v].iov_len;
@@ -306,18 +327,18 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
 	rqstp->rq_vec[v].iov_len = len;
-	args->vlen = v+1;
-	return rqstp->rq_vec[0].iov_len > 0;
+	args->vlen = v + 1;
+	return 1;
 }
 
 int
 nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_createargs *args)
 {
-	if (!(p = decode_fh(p, &args->fh))
-	 || !(p = decode_filename(p, &args->name, &args->len))
-	 || !(p = decode_sattr(p, &args->attrs)))
+	if (   !(p = decode_fh(p, &args->fh))
+	    || !(p = decode_filename(p, &args->name, &args->len)))
 		return 0;
+	p = decode_sattr(p, &args->attrs);
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -361,11 +382,11 @@ int
 nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_symlinkargs *args)
 {
-	if (!(p = decode_fh(p, &args->ffh))
-	 || !(p = decode_filename(p, &args->fname, &args->flen))
-	 || !(p = decode_pathname(p, &args->tname, &args->tlen))
-	 || !(p = decode_sattr(p, &args->attrs)))
+	if (   !(p = decode_fh(p, &args->ffh))
+	    || !(p = decode_filename(p, &args->fname, &args->flen))
+	    || !(p = decode_pathname(p, &args->tname, &args->tlen)))
 		return 0;
+	p = decode_sattr(p, &args->attrs);
 
 	return xdr_argsize_check(rqstp, p);
 }
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 629e7abdd840..6e5c2534f4bc 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -86,19 +86,15 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		/* Check for the current buffer head overflowing. */
 		if (unlikely(file_ofs + bh->b_size > init_size)) {
-			u8 *kaddr;
 			int ofs;
 
 			ofs = 0;
 			if (file_ofs < init_size)
 				ofs = init_size - file_ofs;
 			local_irq_save(flags);
-			kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
-			memset(kaddr + bh_offset(bh) + ofs, 0,
-					bh->b_size - ofs);
-			kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
+			zero_user_page(page, bh_offset(bh) + ofs,
+					 bh->b_size - ofs, KM_BIO_SRC_IRQ);
 			local_irq_restore(flags);
-			flush_dcache_page(page);
 		}
 	} else {
 		clear_buffer_uptodate(bh);
@@ -245,8 +241,7 @@ static int ntfs_read_block(struct page *page)
 	rl = NULL;
 	nr = i = 0;
 	do {
-		u8 *kaddr;
-		int err;
+		int err = 0;
 
 		if (unlikely(buffer_uptodate(bh)))
 			continue;
@@ -254,7 +249,6 @@ static int ntfs_read_block(struct page *page)
 			arr[nr++] = bh;
 			continue;
 		}
-		err = 0;
 		bh->b_bdev = vol->sb->s_bdev;
 		/* Is the block within the allowed limits? */
 		if (iblock < lblock) {
@@ -340,10 +334,7 @@ handle_hole:
 		bh->b_blocknr = -1UL;
 		clear_buffer_mapped(bh);
 handle_zblock:
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + i * blocksize, 0, blocksize);
-		kunmap_atomic(kaddr, KM_USER0);
-		flush_dcache_page(page);
+		zero_user_page(page, i * blocksize, blocksize, KM_USER0);
 		if (likely(!err))
 			set_buffer_uptodate(bh);
 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
@@ -460,10 +451,7 @@ retry_readpage:
 	 * ok to ignore the compressed flag here.
 	 */
 	if (unlikely(page->index > 0)) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr, 0, PAGE_CACHE_SIZE);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
 		goto done;
 	}
 	if (!NInoAttr(ni))
@@ -790,14 +778,10 @@ lock_retry_remap:
 		 * uptodate so it can get discarded by the VM.
 		 */
 		if (err == -ENOENT || lcn == LCN_ENOENT) {
-			u8 *kaddr;
-
 			bh->b_blocknr = -1;
 			clear_buffer_dirty(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + bh_offset(bh), 0, blocksize);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(page);
+			zero_user_page(page, bh_offset(bh), blocksize,
+					KM_USER0);
 			set_buffer_uptodate(bh);
 			err = 0;
 			continue;
@@ -1422,10 +1406,8 @@ retry_writepage:
 		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
 			/* The page straddles i_size. */
 			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(page);
+			zero_user_page(page, ofs, PAGE_CACHE_SIZE - ofs,
+					KM_USER0);
 		}
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 9393f4b1e298..caecc58f529c 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -89,9 +89,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
 	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (PageUptodate(page) && !PageError(page))
+		if (!PageError(page))
 			return page;
 		ntfs_unmap_page(page);
 		return ERR_PTR(-EIO);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 7659cc192995..1c08fefe487a 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2532,14 +2532,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
-					"page (sync error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			ntfs_error(vol->sb, "Failed to read first partial page "
-					"(async error, index 0x%lx).", idx);
-			page_cache_release(page);
+					"page (error, index 0x%lx).", idx);
 			return PTR_ERR(page);
 		}
 		/*
@@ -2602,14 +2595,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(sync error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page))) {
-			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(async error, index 0x%lx).", idx);
-			page_cache_release(page);
+					"(error, index 0x%lx).", idx);
 			return PTR_ERR(page);
 		}
 		kaddr = kmap_atomic(page, KM_USER0);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 74f99a6a369b..34314b33dbd4 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -20,7 +20,6 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 
 #include "dir.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d69c4595ccd0..7ed56390b582 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -26,6 +26,7 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
+#include <linux/sched.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -236,8 +237,7 @@ do_non_resident_extend:
 			err = PTR_ERR(page);
 			goto init_err_out;
 		}
-		wait_on_page_locked(page);
-		if (unlikely(!PageUptodate(page) || PageError(page))) {
+		if (unlikely(PageError(page))) {
 			page_cache_release(page);
 			err = -EIO;
 			goto init_err_out;
@@ -607,11 +607,8 @@ do_next_page:
 					ntfs_submit_bh_for_read(bh);
 					*wait_bh++ = bh;
 				} else {
-					u8 *kaddr = kmap_atomic(page, KM_USER0);
-					memset(kaddr + bh_offset(bh), 0,
-							blocksize);
-					kunmap_atomic(kaddr, KM_USER0);
-					flush_dcache_page(page);
+					zero_user_page(page, bh_offset(bh),
+							blocksize, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 			}
@@ -686,12 +683,9 @@ map_buffer_cached:
 						ntfs_submit_bh_for_read(bh);
 						*wait_bh++ = bh;
 					} else {
-						u8 *kaddr = kmap_atomic(page,
-								KM_USER0);
-						memset(kaddr + bh_offset(bh),
-								0, blocksize);
-						kunmap_atomic(kaddr, KM_USER0);
-						flush_dcache_page(page);
+						zero_user_page(page,
+							bh_offset(bh),
+							blocksize, KM_USER0);
 						set_buffer_uptodate(bh);
 					}
 				}
@@ -709,11 +703,8 @@ map_buffer_cached:
 			 */
 			if (bh_end <= pos || bh_pos >= end) {
 				if (!buffer_uptodate(bh)) {
-					u8 *kaddr = kmap_atomic(page, KM_USER0);
-					memset(kaddr + bh_offset(bh), 0,
-							blocksize);
-					kunmap_atomic(kaddr, KM_USER0);
-					flush_dcache_page(page);
+					zero_user_page(page, bh_offset(bh),
+							blocksize, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 				mark_buffer_dirty(bh);
@@ -752,10 +743,8 @@ map_buffer_cached:
 				if (!buffer_uptodate(bh))
 					set_buffer_uptodate(bh);
 			} else if (!buffer_uptodate(bh)) {
-				u8 *kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + bh_offset(bh), 0, blocksize);
-				kunmap_atomic(kaddr, KM_USER0);
-				flush_dcache_page(page);
+				zero_user_page(page, bh_offset(bh), blocksize,
+						KM_USER0);
 				set_buffer_uptodate(bh);
 			}
 			continue;
@@ -879,11 +868,8 @@ rl_not_mapped_enoent:
 					if (!buffer_uptodate(bh))
 						set_buffer_uptodate(bh);
 				} else if (!buffer_uptodate(bh)) {
-					u8 *kaddr = kmap_atomic(page, KM_USER0);
-					memset(kaddr + bh_offset(bh), 0,
-							blocksize);
-					kunmap_atomic(kaddr, KM_USER0);
-					flush_dcache_page(page);
+					zero_user_page(page, bh_offset(bh),
+							blocksize, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 				continue;
@@ -1138,16 +1124,12 @@ rl_not_mapped_enoent:
 			 * to zero the overflowing region.
 			 */
 			if (unlikely(bh_pos + blocksize > initialized_size)) {
-				u8 *kaddr;
 				int ofs = 0;
 
 				if (likely(bh_pos < initialized_size))
 					ofs = initialized_size - bh_pos;
-				kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + bh_offset(bh) + ofs, 0,
-						blocksize - ofs);
-				kunmap_atomic(kaddr, KM_USER0);
-				flush_dcache_page(page);
+				zero_user_page(page, bh_offset(bh) + ofs,
+						blocksize - ofs, KM_USER0);
 			}
 		} else /* if (unlikely(!buffer_uptodate(bh))) */
 			err = -EIO;
@@ -1287,11 +1269,8 @@ rl_not_mapped_enoent:
 				if (PageUptodate(page))
 					set_buffer_uptodate(bh);
 				else {
-					u8 *kaddr = kmap_atomic(page, KM_USER0);
-					memset(kaddr + bh_offset(bh), 0,
-							blocksize);
-					kunmap_atomic(kaddr, KM_USER0);
-					flush_dcache_page(page);
+					zero_user_page(page, bh_offset(bh),
+							blocksize, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 			}
@@ -1351,9 +1330,7 @@ err_out:
 		len = PAGE_CACHE_SIZE;
 		if (len > bytes)
 			len = bytes;
-		kaddr = kmap_atomic(*pages, KM_USER0);
-		memset(kaddr, 0, len);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(*pages, 0, len, KM_USER0);
 	}
 	goto out;
 }
@@ -1474,9 +1451,7 @@ err_out:
 		len = PAGE_CACHE_SIZE;
 		if (len > bytes)
 			len = bytes;
-		kaddr = kmap_atomic(*pages, KM_USER0);
-		memset(kaddr, 0, len);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(*pages, 0, len, KM_USER0);
 	}
 	goto out;
 }
@@ -2130,28 +2105,13 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	loff_t pos;
-	unsigned long seg;
 	size_t count;		/* after file limit checks */
 	ssize_t written, err;
 
 	count = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		count += iv->iov_len;
-		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-			continue;
-		if (!seg)
-			return -EFAULT;
-		nr_segs = seg;
-		count -= iv->iov_len;	/* This segment is no good */
-		break;
-	}
+	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	if (err)
+		return err;
 	pos = *ppos;
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 	/* We can write back this queue in page reclaim. */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f8bf8da67ee8..b532a730cec2 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -27,7 +27,6 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 
 #include "aops.h"
 #include "attrib.h"
@@ -141,7 +140,7 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
 		if (!ni->name)
 			return -ENOMEM;
 		memcpy(ni->name, na->name, i);
-		ni->name[i] = 0;
+		ni->name[na->name_len] = 0;
 	}
 	return 0;
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1594c90b7164..4566b9182551 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2471,7 +2471,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	s64 nr_free = vol->nr_clusters;
 	u32 *kaddr;
 	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
-	filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
 	struct page *page;
 	pgoff_t index, max_index;
 
@@ -2494,24 +2493,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
 		 */
-		page = read_cache_page(mapping, index, (filler_t*)readpage,
-				NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		/* Ignore pages which errored synchronously. */
 		if (IS_ERR(page)) {
-			ntfs_debug("Sync read_cache_page() error. Skipping "
+			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		wait_on_page_locked(page);
-		/* Ignore pages which errored asynchronously. */
-		if (!PageUptodate(page)) {
-			ntfs_debug("Async read_cache_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			page_cache_release(page);
-			nr_free -= PAGE_CACHE_SIZE * 8;
-			continue;
-		}
 		kaddr = (u32*)kmap_atomic(page, KM_USER0);
 		/*
 		 * For each 4 bytes, subtract the number of set bits. If this
@@ -2562,7 +2551,6 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 {
 	u32 *kaddr;
 	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
-	filler_t *readpage = (filler_t*)mapping->a_ops->readpage;
 	struct page *page;
 	pgoff_t index;
 
@@ -2576,21 +2564,11 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
 		 */
-		page = read_cache_page(mapping, index, (filler_t*)readpage,
-				NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		/* Ignore pages which errored synchronously. */
 		if (IS_ERR(page)) {
-			ntfs_debug("Sync read_cache_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_CACHE_SIZE * 8;
-			continue;
-		}
-		wait_on_page_locked(page);
-		/* Ignore pages which errored asynchronously. */
-		if (!PageUptodate(page)) {
-			ntfs_debug("Async read_cache_page() error. Skipping "
+			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
-			page_cache_release(page);
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
@@ -3107,9 +3085,7 @@ static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep,
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(VFS_I(ni));
+	inode_init_once(VFS_I(ni));
 }
 
 /*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a0c8667caa72..19712a7d145f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2869,7 +2869,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 	tl = &tl_copy->id2.i_dealloc;
 	num_recs = le16_to_cpu(tl->tl_used);
 	mlog(0, "cleanup %u records from %llu\n", num_recs,
-	     (unsigned long long)tl_copy->i_blkno);
+	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
 
 	mutex_lock(&tl_inode->i_mutex);
 	for(i = 0; i < num_recs; i++) {
@@ -3801,8 +3801,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
-	     "%llu\n", fe->i_clusters, new_i_clusters,
-	     (unsigned long long)fe->i_size);
+	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+	     (unsigned long long)le64_to_cpu(fe->i_size));
 
 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
 	if (!(*tc)) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56963e6c46c0..0023b31e48a8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -78,7 +78,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
+		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature);
 		goto bail;
 	}
 
@@ -221,7 +222,10 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 		goto out;
 	}
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
+		ret = AOP_TRUNCATED_PAGE;
+		goto out_meta_unlock;
+	}
 
 	/*
 	 * i_size might have just been updated as we grabed the meta lock.  We
@@ -234,10 +238,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	 * XXX sys_readahead() seems to get that wrong?
 	 */
 	if (start >= i_size_read(inode)) {
-		char *addr = kmap(page);
-		memset(addr, 0, PAGE_SIZE);
-		flush_dcache_page(page);
-		kunmap(page);
+		zero_user_page(page, 0, PAGE_SIZE, KM_USER0);
 		SetPageUptodate(page);
 		ret = 0;
 		goto out_alloc;
@@ -257,6 +258,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	ocfs2_data_unlock(inode, 0);
 out_alloc:
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+out_meta_unlock:
 	ocfs2_meta_unlock(inode, 0);
 out:
 	if (unlock)
@@ -939,9 +941,9 @@ out:
  * Returns a negative error code or the number of bytes copied into
  * the page.
  */
-int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
-			  u64 *p_blkno, struct page *page,
-			  struct ocfs2_write_ctxt *wc, int new)
+static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+				 u64 *p_blkno, struct page *page,
+				 struct ocfs2_write_ctxt *wc, int new)
 {
 	int ret, copied = 0;
 	unsigned int from = 0, to = 0;
@@ -1086,7 +1088,7 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 	for(i = 0; i < numpages; i++) {
 		index = start + i;
 
-		cpages[i] = grab_cache_page(mapping, index);
+		cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!cpages[i]) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eba282da500e..979113479c66 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -438,7 +438,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
 								   hb_block));
 
 	mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
-	     (long long)cpu_to_le64(generation),
+	     (long long)generation,
 	     le32_to_cpu(hb_block->hb_cksum));
 }
 
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 636593bf4d17..a93620ce4aca 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -144,10 +144,11 @@ static struct kobj_type mlog_ktype = {
 };
 
 static struct kset mlog_kset = {
-	.kobj   = {.name = "logmask", .ktype = &mlog_ktype},
+	.kobj  = {.name = "logmask"},
+	.ktype = &mlog_ktype
 };
 
-int mlog_sys_init(struct subsystem *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_subsys)
 {
 	int i = 0;
 
@@ -157,7 +158,7 @@ int mlog_sys_init(struct subsystem *o2cb_subsys)
 	}
 	mlog_attr_ptrs[i] = NULL;
 
-	mlog_kset.subsys = o2cb_subsys;
+	kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
 	return kset_register(&mlog_kset);
 }
 
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a42628ba9ddf..75cd877f6d42 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -278,7 +278,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
-int mlog_sys_init(struct subsystem *o2cb_subsys);
+int mlog_sys_init(struct kset *o2cb_subsys);
 void mlog_sys_shutdown(void);
 
 #endif /* O2CLUSTER_MASKLOG_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 1d9f6acafa2e..64f6f378fd09 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -42,7 +42,6 @@ struct o2cb_attribute {
 #define O2CB_ATTR(_name, _mode, _show, _store)	\
 struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
 
-#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
 #define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
 
 static ssize_t o2cb_interface_revision_show(char *buf)
@@ -79,7 +78,7 @@ static ssize_t
 o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
 {
 	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-	struct subsystem *sbs = to_o2cb_subsys(kobj);
+	struct kset *sbs = to_kset(kobj);
 
 	BUG_ON(sbs != &o2cb_subsys);
 
@@ -93,7 +92,7 @@ o2cb_store(struct kobject * kobj, struct attribute * attr,
 	     const char * buffer, size_t count)
 {
 	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-	struct subsystem *sbs = to_o2cb_subsys(kobj);
+	struct kset *sbs = to_kset(kobj);
 
 	BUG_ON(sbs != &o2cb_subsys);
 
@@ -112,7 +111,7 @@ int o2cb_sys_init(void)
 {
 	int ret;
 
-	o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
+	o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
 	ret = subsystem_register(&o2cb_subsys);
 	if (ret)
 		return ret;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 69caf3e12fea..0b229a9c7952 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1496,7 +1496,7 @@ static void o2net_start_connect(struct work_struct *work)
 	sock->sk->sk_allocation = GFP_ATOMIC;
 
 	myaddr.sin_family = AF_INET;
-	myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
+	myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
 	myaddr.sin_port = (__force u16)htons(0); /* any port */
 
 	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
@@ -1521,8 +1521,8 @@ static void o2net_start_connect(struct work_struct *work)
 	spin_unlock(&nn->nn_lock);
 
 	remoteaddr.sin_family = AF_INET;
-	remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
-	remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
+	remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
+	remoteaddr.sin_port = node->nd_ipv4_port;
 
 	ret = sc->sc_sock->ops->connect(sc->sc_sock,
 					(struct sockaddr *)&remoteaddr,
@@ -1810,8 +1810,8 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
 	int ret;
 	struct sockaddr_in sin = {
 		.sin_family = PF_INET,
-		.sin_addr = { .s_addr = (__force u32)addr },
-		.sin_port = (__force u16)port,
+		.sin_addr = { .s_addr = addr },
+		.sin_port = port,
 	};
 
 	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 67e6866a2a4f..c441ef1f2bad 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -403,7 +403,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct buffer_head **new_de_bh)
 {
 	int status = 0;
-	int credits, num_free_extents;
+	int credits, num_free_extents, drop_alloc_sem = 0;
 	loff_t dir_i_size;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
 	struct ocfs2_alloc_context *data_ac = NULL;
@@ -452,6 +452,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
 	}
 
+	down_write(&OCFS2_I(dir)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -497,6 +500,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
+	if (drop_alloc_sem)
+		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 241cad342a48..2fd8bded38f3 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -312,8 +312,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	    past->type != DLM_BAST) {
 		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
 		     "name=%.*s\n", past->type, 
-		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_node(cookie),
+		     dlm_get_lock_cookie_seq(cookie),
 		     locklen, name);
 		ret = DLM_IVLOCKID;
 		goto leave;
@@ -324,8 +324,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 		mlog(0, "got %sast for unknown lockres! "
 		     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
 		     past->type == DLM_AST ? "" : "b",
-		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_node(cookie),
+		     dlm_get_lock_cookie_seq(cookie),
 		     locklen, name, locklen);
 		ret = DLM_IVLOCKID;
 		goto leave;
@@ -370,8 +370,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
 	     "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+	     dlm_get_lock_cookie_node(cookie),
+	     dlm_get_lock_cookie_seq(cookie),
 	     locklen, name, locklen);
 
 	ret = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index de952eba29a9..fd8cb1badc9b 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -42,7 +42,6 @@
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 
 #include <asm/uaccess.h>
@@ -263,13 +262,10 @@ static void dlmfs_init_once(void *foo,
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		ip->ip_dlm = NULL;
-		ip->ip_parent = NULL;
+	ip->ip_dlm = NULL;
+	ip->ip_parent = NULL;
 
-		inode_init_once(&ip->ip_vfs_inode);
-	}
+	inode_init_once(&ip->ip_vfs_inode);
 }
 
 static struct inode *dlmfs_alloc_inode(struct super_block *sb)
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c1807a42c49f..671c4ed58ee2 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1769,7 +1769,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* lock is always created locally first, and
 			 * destroyed locally last.  it must be on the list */
 			if (!lock) {
-				u64 c = ml->cookie;
+				__be64 c = ml->cookie;
 				mlog(ML_ERROR, "could not find local lock "
 					       "with cookie %u:%llu!\n",
 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
@@ -1878,7 +1878,7 @@ skip_lvb:
 		spin_lock(&res->spinlock);
 		list_for_each_entry(lock, queue, list) {
 			if (lock->ml.cookie == ml->cookie) {
-				u64 c = lock->ml.cookie;
+				__be64 c = lock->ml.cookie;
 				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
 				     "exists on this lockres!\n", dlm->name,
 				     res->lockname.len, res->lockname.name,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2b264c6ba039..cebd089f8955 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -76,7 +76,7 @@ repeat:
 		goto repeat;
 	}
 	remove_wait_queue(&res->wq, &wait);
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 }
 
 int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 27e43b0c0eae..d1bd305ef0d7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,7 +27,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/crc32.h>
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
@@ -104,6 +103,35 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 				     struct ocfs2_lock_res *lockres);
 
+
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+static void ocfs2_dump_meta_lvb_info(u64 level,
+				     const char *function,
+				     unsigned int line,
+				     struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
+	     lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
+	     be32_to_cpu(lvb->lvb_igeneration));
+	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
+	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
+	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
+	     be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
+	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
+	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
+	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
+	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
+	     be32_to_cpu(lvb->lvb_iattr));
+}
+
+
 /*
  * OCFS2 Lock Resource Operations
  *
@@ -3078,28 +3106,3 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 
 	mlog_exit_void();
 }
-
-/* This aids in debugging situations where a bad LVB might be involved. */
-void ocfs2_dump_meta_lvb_info(u64 level,
-			      const char *function,
-			      unsigned int line,
-			      struct ocfs2_lock_res *lockres)
-{
-	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
-
-	mlog(level, "LVB information for %s (called from %s:%u):\n",
-	     lockres->l_name, function, line);
-	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
-	     lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
-	     be32_to_cpu(lvb->lvb_igeneration));
-	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
-	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
-	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
-	     be16_to_cpu(lvb->lvb_imode));
-	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
-	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
-	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
-	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
-	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
-	     be32_to_cpu(lvb->lvb_iattr));
-}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 59cb566e7983..492bad32a8c0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -119,11 +119,4 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
-/* aids in debugging and tracking lvbs */
-void ocfs2_dump_meta_lvb_info(u64 level,
-			      const char *function,
-			      unsigned int line,
-			      struct ocfs2_lock_res *lockres);
-#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
-
 #endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 56e1fefc1205..bc48177bd183 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -140,7 +140,7 @@ bail:
 	return parent;
 }
 
-static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 			   int connectable)
 {
 	struct inode *inode = dentry->d_inode;
@@ -148,6 +148,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
 	int type = 1;
 	u64 blkno;
 	u32 generation;
+	__le32 *fh = (__force __le32 *) fh_in;
 
 	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name,
@@ -199,7 +200,7 @@ bail:
 	return type;
 }
 
-static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, u32 *fh_in,
 				      int fh_len, int fileid_type,
 				      int (*acceptable)(void *context,
 						        struct dentry *de),
@@ -207,6 +208,7 @@ static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
 {
 	struct ocfs2_inode_handle handle, parent;
 	struct dentry *ret = NULL;
+	__le32 *fh = (__force __le32 *) fh_in;
 
 	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
 		   sb, fh, fh_len, fileid_type, acceptable, context);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 520a2a6d7670..ac6c96431bbc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -207,10 +207,10 @@ out:
 	return ret;
 }
 
-int ocfs2_set_inode_size(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *fe_bh,
-			 u64 new_i_size)
+static int ocfs2_set_inode_size(handle_t *handle,
+				struct inode *inode,
+				struct buffer_head *fe_bh,
+				u64 new_i_size)
 {
 	int status;
 
@@ -326,6 +326,7 @@ static int ocfs2_truncate_file(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		   (unsigned long long)new_i_size);
 
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 	truncate_inode_pages(inode->i_mapping, new_i_size);
 
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
@@ -713,7 +714,8 @@ restarted_transaction:
 	}
 
 	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
-	     fe->i_clusters, (unsigned long long)fe->i_size);
+	     le32_to_cpu(fe->i_clusters),
+	     (unsigned long long)le64_to_cpu(fe->i_size));
 	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
@@ -1417,36 +1419,6 @@ out:
 	return total ? total : ret;
 }
 
-static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
-			     unsigned long *nr_segs)
-{
-	size_t ocount;		/* original count */
-	unsigned long seg;
-
-	ocount = 0;
-	for (seg = 0; seg < *nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		ocount += iv->iov_len;
-		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		*nr_segs = seg;
-		ocount -= iv->iov_len;	/* This segment is no good */
-		break;
-	}
-
-	*counted = ocount;
-	return 0;
-}
-
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs,
@@ -1469,7 +1441,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	if (iocb->ki_left == 0)
 		return 0;
 
-	ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+	ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
 	if (ret)
 		return ret;
 
@@ -1853,6 +1825,9 @@ const struct file_operations ocfs2_fops = {
 	.aio_read	= ocfs2_file_aio_read,
 	.aio_write	= ocfs2_file_aio_write,
 	.ioctl		= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
 };
@@ -1862,4 +1837,7 @@ const struct file_operations ocfs2_dops = {
 	.readdir	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
 	.ioctl		= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 2c4460fced52..a4dd1fa1822b 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -56,11 +56,6 @@ int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 int ocfs2_permission(struct inode *inode, int mask,
 		     struct nameidata *nd);
 
-int ocfs2_set_inode_size(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *fe_bh,
-			 u64 new_i_size);
-
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
 int ocfs2_update_inode_atime(struct inode *inode,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 21a605079c62..c53a6763bbbe 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 
 #include <asm/byteorder.h>
 
@@ -89,6 +88,25 @@ void ocfs2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
+/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
+{
+	unsigned int flags = oi->vfs_inode.i_flags;
+
+	oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL|
+			OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		oi->ip_attr |= OCFS2_SYNC_FL;
+	if (flags & S_APPEND)
+		oi->ip_attr |= OCFS2_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		oi->ip_attr |= OCFS2_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		oi->ip_attr |= OCFS2_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		oi->ip_attr |= OCFS2_DIRSYNC_FL;
+}
+
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 {
 	struct inode *inode = NULL;
@@ -196,7 +214,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	int status = -EINVAL;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
-		   (unsigned long long)fe->i_size);
+		   (unsigned long long)le64_to_cpu(fe->i_size));
 
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
@@ -248,7 +266,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		mlog(ML_ERROR,
 		     "ip_blkno %llu != i_blkno %llu!\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)fe->i_blkno);
+		     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
@@ -301,7 +319,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		 * the generation argument to
 		 * ocfs2_inode_lock_res_init() will have to change.
 		 */
-		BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL));
+		BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
 
 		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
 					  OCFS2_LOCK_TYPE_META, 0, inode);
@@ -437,7 +455,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)fe->i_blkno, 7, fe->i_signature);
+		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature);
 		goto bail;
 	}
 
@@ -812,8 +831,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		     "Inode %llu (on-disk %llu) not orphaned! "
 		     "Disk flags  0x%x, inode flags 0x%x\n",
 		     (unsigned long long)oi->ip_blkno,
-		     (unsigned long long)di->i_blkno, di->i_flags,
-		     oi->ip_flags);
+		     (unsigned long long)le64_to_cpu(di->i_blkno),
+		     le32_to_cpu(di->i_flags), oi->ip_flags);
 		goto bail;
 	}
 
@@ -1106,8 +1125,10 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		return NULL;
 	}
 
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
 					     NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 	if (tmperr < 0) {
 		mlog_errno(tmperr);
 		goto fail;
@@ -1197,6 +1218,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	ocfs2_get_inode_flags(OCFS2_I(inode));
 	fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 03ae075869ee..a41d0817121b 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -141,6 +141,7 @@ int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 
 void ocfs2_set_inode_flags(struct inode *inode);
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
 
 static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 4768be5f3086..f3ad21ad9aed 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -31,6 +31,7 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 		mlog_errno(status);
 		return status;
 	}
+	ocfs2_get_inode_flags(OCFS2_I(inode));
 	*flags = OCFS2_I(inode)->ip_attr;
 	ocfs2_meta_unlock(inode, 0);
 
@@ -134,3 +135,26 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int ret;
+
+	switch (cmd) {
+	case OCFS2_IOC32_GETFLAGS:
+		cmd = OCFS2_IOC_GETFLAGS;
+		break;
+	case OCFS2_IOC32_SETFLAGS:
+		cmd = OCFS2_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	lock_kernel();
+	ret = ocfs2_ioctl(inode, file, cmd, arg);
+	unlock_kernel();
+	return ret;
+}
+#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4a7c82931dba..4d6c4f430d0d 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -12,5 +12,6 @@
 
 int ocfs2_ioctl(struct inode * inode, struct file * filp,
 	unsigned int cmd, unsigned long arg);
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
 #endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 5a8a90d1c787..dc1188081720 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -435,7 +435,8 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 		 * handle the errors in a specific manner, so no need
 		 * to call ocfs2_error() here. */
 		mlog(ML_ERROR, "Journal dinode %llu  has invalid "
-		     "signature: %.*s", (unsigned long long)fe->i_blkno, 7,
+		     "signature: %.*s",
+		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
 		     fe->i_signature);
 		status = -EIO;
 		goto out;
@@ -742,7 +743,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		la_dinode = item->lri_la_dinode;
 		if (la_dinode) {
 			mlog(0, "Clean up local alloc %llu\n",
-			     (unsigned long long)la_dinode->i_blkno);
+			     (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
 
 			ret = ocfs2_complete_local_alloc_recovery(osb,
 								  la_dinode);
@@ -755,7 +756,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		tl_dinode = item->lri_tl_dinode;
 		if (tl_dinode) {
 			mlog(0, "Clean up truncate log %llu\n",
-			     (unsigned long long)tl_dinode->i_blkno);
+			     (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
 
 			ret = ocfs2_complete_truncate_log_recovery(osb,
 								   tl_dinode);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 4dedd9789108..545f7892cdf3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -471,9 +471,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 	mutex_lock(&local_alloc_inode->i_mutex);
 
-	ac->ac_inode = local_alloc_inode;
-	ac->ac_which = OCFS2_AC_USE_LOCAL;
-
 	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
 		status = -ENOSPC;
 		goto bail;
@@ -511,10 +508,14 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		}
 	}
 
+	ac->ac_inode = local_alloc_inode;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
 	get_bh(osb->local_alloc_bh);
 	ac->ac_bh = osb->local_alloc_bh;
 	status = 0;
 bail:
+	if (status < 0 && local_alloc_inode)
+		iput(local_alloc_inode);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2bcf353fd7c5..36289e6295ce 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -578,8 +578,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
 		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
 		     "i_blkno=%llu, i_ino=%lu\n",
-		     (unsigned long long) (*new_fe_bh)->b_blocknr,
-		     (unsigned long long)fe->i_blkno, inode->i_ino);
+		     (unsigned long long)(*new_fe_bh)->b_blocknr,
+		     (unsigned long long)le64_to_cpu(fe->i_blkno),
+		     inode->i_ino);
 		BUG();
 	}
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 82cc92dcf8a6..a860633e833f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -363,9 +363,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	typeof(__di) ____di = (__di);					\
 	ocfs2_error((__sb), 						\
 		"Dinode # %llu has bad signature %.*s",			\
-		(unsigned long long)(____di)->i_blkno, 7,		\
+		(unsigned long long)le64_to_cpu((____di)->i_blkno), 7, 	\
 		(____di)->i_signature);					\
-} while (0);
+} while (0)
 
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
@@ -374,9 +374,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	typeof(__eb) ____eb = (__eb);					\
 	ocfs2_error((__sb), 						\
 		"Extent Block # %llu has bad signature %.*s",		\
-		(unsigned long long)(____eb)->h_blkno, 7,		\
+		(unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,	\
 		(____eb)->h_signature);					\
-} while (0);
+} while (0)
 
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
@@ -385,9 +385,9 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 	typeof(__gd) ____gd = (__gd);					\
 		ocfs2_error((__sb),					\
 		"Group Descriptor # %llu has bad signature %.*s",	\
-		(unsigned long long)(____gd)->bg_blkno, 7,		\
+		(unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
 		(____gd)->bg_signature);				\
-} while (0);
+} while (0)
 
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 71306479c68f..f0d9eb08547a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -166,6 +166,8 @@
  */
 #define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
 #define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
+#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
+#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
 
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d921a28329dc..d8b79067dc14 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -26,7 +26,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0da655ae5d6f..e3437626d183 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -849,9 +849,9 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	}
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
-	     (unsigned long long)fe->i_blkno, chain,
-	     (unsigned long long)bg->bg_blkno,
-	     (unsigned long long)prev_bg->bg_blkno);
+	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
+	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1162,7 +1162,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	}
 
 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
-	     tmp_bits, (unsigned long long)bg->bg_blkno);
+	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
 
 	*num_bits = tmp_bits;
 
@@ -1227,7 +1227,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	}
 
 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
-	     (unsigned long long)fe->i_blkno);
+	     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c9e8243691f..86b559c7dce9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -937,32 +937,29 @@ static void ocfs2_inode_init_once(void *data,
 {
 	struct ocfs2_inode_info *oi = data;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		oi->ip_flags = 0;
-		oi->ip_open_count = 0;
-		spin_lock_init(&oi->ip_lock);
-		ocfs2_extent_map_init(&oi->vfs_inode);
-		INIT_LIST_HEAD(&oi->ip_io_markers);
-		oi->ip_created_trans = 0;
-		oi->ip_last_trans = 0;
-		oi->ip_dir_start_lookup = 0;
+	oi->ip_flags = 0;
+	oi->ip_open_count = 0;
+	spin_lock_init(&oi->ip_lock);
+	ocfs2_extent_map_init(&oi->vfs_inode);
+	INIT_LIST_HEAD(&oi->ip_io_markers);
+	oi->ip_created_trans = 0;
+	oi->ip_last_trans = 0;
+	oi->ip_dir_start_lookup = 0;
 
-		init_rwsem(&oi->ip_alloc_sem);
-		mutex_init(&oi->ip_io_mutex);
+	init_rwsem(&oi->ip_alloc_sem);
+	mutex_init(&oi->ip_io_mutex);
 
-		oi->ip_blkno = 0ULL;
-		oi->ip_clusters = 0;
+	oi->ip_blkno = 0ULL;
+	oi->ip_clusters = 0;
 
-		ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
-		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
-		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
-		ocfs2_lock_res_init_once(&oi->ip_open_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
-		ocfs2_metadata_cache_init(&oi->vfs_inode);
+	ocfs2_metadata_cache_init(&oi->vfs_inode);
 
-		inode_init_once(&oi->vfs_inode);
-	}
+	inode_init_once(&oi->vfs_inode);
 }
 
 static int ocfs2_initialize_mem_caches(void)
@@ -1538,7 +1535,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 		} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
 			mlog(ML_ERROR, "bad block number on superblock: "
 			     "found %llu, should be %llu\n",
-			     (unsigned long long)di->i_blkno,
+			     (unsigned long long)le64_to_cpu(di->i_blkno),
 			     (unsigned long long)bh->b_blocknr);
 		} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
 			    le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 40dc1a51f4a9..7134007ba22f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -67,16 +67,9 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
-	wait_on_page_locked(page);
-	if (!PageUptodate(page))
-		goto async_fail;
 	*ppage = page;
 	return kmap(page);
 
-async_fail:
-	page_cache_release(page);
-	return ERR_PTR(-EIO);
-
 sync_fail:
 	return (char*)page;
 }
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 4f82a2f0efef..66a13ee63d4c 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -26,7 +26,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/kthread.h>
 
 #include <cluster/heartbeat.h>
diff --git a/fs/open.c b/fs/open.c
index c989fb4cf7b9..0d515d161974 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -7,7 +7,6 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/file.h>
-#include <linux/smp_lock.h>
 #include <linux/quotaops.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
@@ -211,6 +210,9 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		newattrs.ia_valid |= ATTR_FILE;
 	}
 
+	/* Remove suid/sgid on truncate too */
+	newattrs.ia_valid |= should_remove_suid(dentry);
+
 	mutex_lock(&dentry->d_inode->i_mutex);
 	err = notify_change(dentry, &newattrs);
 	mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index bde1c164417d..e62397341c36 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -419,9 +419,7 @@ static void op_inode_init_once(void *data, struct kmem_cache * cachep, unsigned
 {
 	struct op_inode_info *oi = (struct op_inode_info *) data;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&oi->vfs_inode);
+	inode_init_once(&oi->vfs_inode);
 }
 
 static int __init init_openprom_fs(void)
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 6e8bb66fe619..a99acd8de353 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -166,8 +166,12 @@ config LDM_PARTITION
 	depends on PARTITION_ADVANCED
 	---help---
 	  Say Y here if you would like to use hard disks under Linux which
-	  were partitioned using Windows 2000's or XP's Logical Disk Manager.
-	  They are also known as "Dynamic Disks".
+	  were partitioned using Windows 2000's/XP's or Vista's Logical Disk
+	  Manager.  They are also known as "Dynamic Disks".
+
+	  Note this driver only supports Dynamic Disks with a protective MBR
+	  label, i.e. DOS partition table.  It does not support GPT labelled
+	  Dynamic Disks yet as can be created with Vista.
 
 	  Windows 2000 introduced the concept of Dynamic Disks to get around
 	  the limitations of the PC's partitioning scheme.  The Logical Disk
@@ -175,8 +179,8 @@ config LDM_PARTITION
 	  mirrored, striped or RAID volumes, all without the need for
 	  rebooting.
 
-	  Normal partitions are now called Basic Disks under Windows 2000 and
-	  XP.
+	  Normal partitions are now called Basic Disks under Windows 2000, XP,
+	  and Vista.
 
 	  For a fuller description read <file:Documentation/ldm.txt>.
 
@@ -236,3 +240,12 @@ config EFI_PARTITION
 	help
 	  Say Y here if you would like to use hard disks under Linux which
 	  were partitioned using EFI GPT.
+
+config SYSV68_PARTITION
+	bool "SYSV68 partition table support" if PARTITION_ADVANCED
+	default y if VME
+	help
+	  Say Y here if you would like to be able to read the hard disk
+	  partition table format used by Motorola Delta machines (using
+	  sysv68).
+	  Otherwise, say N.
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index 67e665fdb7fc..03af8eac51da 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
 obj-$(CONFIG_IBM_PARTITION) += ibm.o
 obj-$(CONFIG_EFI_PARTITION) += efi.o
 obj-$(CONFIG_KARMA_PARTITION) += karma.o
+obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 1bc9f372c7d4..e3491328596b 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -271,7 +271,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 		extern void xd_set_geometry(struct block_device *,
 			unsigned char, unsigned char, unsigned int);
 		xd_set_geometry(bdev, dr->secspertrack, heads, 1);
-		invalidate_bdev(bdev, 1);
+		invalidate_bh_lrus();
 		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
 	}
 #endif
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8a7d0035ad7a..9a3a058f3553 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -34,6 +34,7 @@
 #include "ultrix.h"
 #include "efi.h"
 #include "karma.h"
+#include "sysv68.h"
 
 #ifdef CONFIG_BLK_DEV_MD
 extern void md_autodetect_dev(dev_t dev);
@@ -105,6 +106,9 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
 #ifdef CONFIG_KARMA_PARTITION
 	karma_partition,
 #endif
+#ifdef CONFIG_SYSV68_PARTITION
+	sysv68_partition,
+#endif
 	NULL
 };
  
@@ -312,7 +316,7 @@ static struct attribute * default_attrs[] = {
 	NULL,
 };
 
-extern struct subsystem block_subsys;
+extern struct kset block_subsys;
 
 static void part_release(struct kobject *kobj)
 {
@@ -388,7 +392,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	kobject_add(&p->kobj);
 	if (!disk->part_uevent_suppress)
 		kobject_uevent(&p->kobj, KOBJ_ADD);
-	sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem");
+	sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		static struct attribute addpartattr = {
 			.name = "whole_disk",
@@ -444,7 +448,7 @@ static int disk_sysfs_symlinks(struct gendisk *disk)
 			goto err_out_dev_link;
 	}
 
-	err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj,
+	err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
 				"subsystem");
 	if (err)
 		goto err_out_disk_name_lnk;
@@ -569,9 +573,6 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
 				 NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (PageError(page))
 			goto fail;
 		p->v = page;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 1bea610078b3..e7b07006bc41 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -152,7 +152,7 @@ last_lba(struct block_device *bdev)
 }
 
 static inline int
-pmbr_part_valid(struct partition *part, u64 lastlba)
+pmbr_part_valid(struct partition *part)
 {
         if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
             le32_to_cpu(part->start_sect) == 1UL)
@@ -163,7 +163,6 @@ pmbr_part_valid(struct partition *part, u64 lastlba)
 /**
  * is_pmbr_valid(): test Protective MBR for validity
  * @mbr: pointer to a legacy mbr structure
- * @lastlba: last_lba for the whole device
  *
  * Description: Returns 1 if PMBR is valid, 0 otherwise.
  * Validity depends on two things:
@@ -171,13 +170,13 @@ pmbr_part_valid(struct partition *part, u64 lastlba)
  *  2) One partition of type 0xEE is found
  */
 static int
-is_pmbr_valid(legacy_mbr *mbr, u64 lastlba)
+is_pmbr_valid(legacy_mbr *mbr)
 {
 	int i;
 	if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
                 return 0;
 	for (i = 0; i < 4; i++)
-		if (pmbr_part_valid(&mbr->partition_record[i], lastlba))
+		if (pmbr_part_valid(&mbr->partition_record[i]))
                         return 1;
 	return 0;
 }
@@ -516,7 +515,7 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
-	legacy_mbr *legacymbr = NULL;
+	legacy_mbr *legacymbr;
 	u64 lastlba;
 	if (!bdev || !gpt || !ptes)
 		return 0;
@@ -528,9 +527,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
                 if (legacymbr) {
                         read_lba(bdev, 0, (u8 *) legacymbr,
                                  sizeof (*legacymbr));
-                        good_pmbr = is_pmbr_valid(legacymbr, lastlba);
+                        good_pmbr = is_pmbr_valid(legacymbr);
                         kfree(legacymbr);
-                        legacymbr=NULL;
                 }
                 if (!good_pmbr)
                         goto fail;
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 1a60926a4ccd..99873a2b4cbc 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -2,10 +2,10 @@
  * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
  *
  * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
  *
- * Documentation is available at http://linux-ntfs.sf.net/ldm
+ * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
  *
  * This program is free software; you can redistribute it and/or modify it under
  * the terms of the GNU General Public License as published by the Free Software
@@ -62,7 +62,6 @@ static void _ldm_printk (const char *level, const char *function,
 	printk ("%s%s(): %s\n", level, function, buf);
 }
 
-
 /**
  * ldm_parse_hexbyte - Convert a ASCII hex number to a byte
  * @src:  Pointer to at least 2 characters to convert.
@@ -118,7 +117,6 @@ static bool ldm_parse_guid (const u8 *src, u8 *dest)
 	return true;
 }
 
-
 /**
  * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure
  * @data:  Raw database PRIVHEAD structure loaded from the device
@@ -130,46 +128,48 @@ static bool ldm_parse_guid (const u8 *src, u8 *dest)
  * Return:  'true'   @ph contains the PRIVHEAD data
  *          'false'  @ph contents are undefined
  */
-static bool ldm_parse_privhead (const u8 *data, struct privhead *ph)
+static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
 {
-	BUG_ON (!data || !ph);
+	bool is_vista = false;
 
-	if (MAGIC_PRIVHEAD != BE64 (data)) {
-		ldm_error ("Cannot find PRIVHEAD structure. LDM database is"
+	BUG_ON(!data || !ph);
+	if (MAGIC_PRIVHEAD != BE64(data)) {
+		ldm_error("Cannot find PRIVHEAD structure. LDM database is"
 			" corrupt. Aborting.");
 		return false;
 	}
-
-	ph->ver_major          = BE16 (data + 0x000C);
-	ph->ver_minor          = BE16 (data + 0x000E);
-	ph->logical_disk_start = BE64 (data + 0x011B);
-	ph->logical_disk_size  = BE64 (data + 0x0123);
-	ph->config_start       = BE64 (data + 0x012B);
-	ph->config_size        = BE64 (data + 0x0133);
-
-	if ((ph->ver_major != 2) || (ph->ver_minor != 11)) {
-		ldm_error ("Expected PRIVHEAD version %d.%d, got %d.%d."
-			" Aborting.", 2, 11, ph->ver_major, ph->ver_minor);
+	ph->ver_major = BE16(data + 0x000C);
+	ph->ver_minor = BE16(data + 0x000E);
+	ph->logical_disk_start = BE64(data + 0x011B);
+	ph->logical_disk_size = BE64(data + 0x0123);
+	ph->config_start = BE64(data + 0x012B);
+	ph->config_size = BE64(data + 0x0133);
+	/* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
+	if (ph->ver_major == 2 && ph->ver_minor == 12)
+		is_vista = true;
+	if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) {
+		ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d."
+			" Aborting.", ph->ver_major, ph->ver_minor);
 		return false;
 	}
+	ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major,
+			ph->ver_minor, is_vista ? "Vista" : "2000/XP");
 	if (ph->config_size != LDM_DB_SIZE) {	/* 1 MiB in sectors. */
-		/* Warn the user and continue, carefully */
-		ldm_info ("Database is normally %u bytes, it claims to "
+		/* Warn the user and continue, carefully. */
+		ldm_info("Database is normally %u bytes, it claims to "
 			"be %llu bytes.", LDM_DB_SIZE,
-			(unsigned long long)ph->config_size );
+			(unsigned long long)ph->config_size);
 	}
-	if ((ph->logical_disk_size == 0) ||
-	    (ph->logical_disk_start + ph->logical_disk_size > ph->config_start)) {
-		ldm_error ("PRIVHEAD disk size doesn't match real disk size");
+	if ((ph->logical_disk_size == 0) || (ph->logical_disk_start +
+			ph->logical_disk_size > ph->config_start)) {
+		ldm_error("PRIVHEAD disk size doesn't match real disk size");
 		return false;
 	}
-
-	if (!ldm_parse_guid (data + 0x0030, ph->disk_id)) {
-		ldm_error ("PRIVHEAD contains an invalid GUID.");
+	if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) {
+		ldm_error("PRIVHEAD contains an invalid GUID.");
 		return false;
 	}
-
-	ldm_debug ("Parsed PRIVHEAD successfully.");
+	ldm_debug("Parsed PRIVHEAD successfully.");
 	return true;
 }
 
@@ -409,7 +409,7 @@ out:
  * Return:  'true'   @toc1 contains validated TOCBLOCK info
  *          'false'  @toc1 contents are undefined
  */
-static bool ldm_validate_tocblocks (struct block_device *bdev,
+static bool ldm_validate_tocblocks(struct block_device *bdev,
 	unsigned long base, struct ldmdb *ldb)
 {
 	static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
@@ -417,54 +417,57 @@ static bool ldm_validate_tocblocks (struct block_device *bdev,
 	struct privhead *ph;
 	Sector sect;
 	u8 *data;
+	int i, nr_tbs;
 	bool result = false;
-	int i;
 
-	BUG_ON (!bdev || !ldb);
-
-	ph    = &ldb->ph;
+	BUG_ON(!bdev || !ldb);
+	ph = &ldb->ph;
 	tb[0] = &ldb->toc;
-	tb[1] = kmalloc (sizeof (*tb[1]), GFP_KERNEL);
-	tb[2] = kmalloc (sizeof (*tb[2]), GFP_KERNEL);
-	tb[3] = kmalloc (sizeof (*tb[3]), GFP_KERNEL);
-	if (!tb[1] || !tb[2] || !tb[3]) {
-		ldm_crit ("Out of memory.");
-		goto out;
+	tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
+	if (!tb[1]) {
+		ldm_crit("Out of memory.");
+		goto err;
 	}
-
-	for (i = 0; i < 4; i++)		/* Read and parse all four toc's. */
-	{
-		data = read_dev_sector (bdev, base + off[i], &sect);
+	tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1]));
+	tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2]));
+	/*
+	 * Try to read and parse all four TOCBLOCKs.
+	 *
+	 * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so
+	 * skip any that fail as long as we get at least one valid TOCBLOCK.
+	 */
+	for (nr_tbs = i = 0; i < 4; i++) {
+		data = read_dev_sector(bdev, base + off[i], &sect);
 		if (!data) {
-			ldm_crit ("Disk read failed.");
-			goto out;
+			ldm_error("Disk read failed for TOCBLOCK %d.", i);
+			continue;
 		}
-		result = ldm_parse_tocblock (data, tb[i]);
-		put_dev_sector (sect);
-		if (!result)
-			goto out;	/* Already logged */
+		if (ldm_parse_tocblock(data, tb[nr_tbs]))
+			nr_tbs++;
+		put_dev_sector(sect);
 	}
-
-	/* Range check the toc against a privhead. */
+	if (!nr_tbs) {
+		ldm_crit("Failed to find a valid TOCBLOCK.");
+		goto err;
+	}
+	/* Range check the TOCBLOCK against a privhead. */
 	if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) ||
-	    ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > ph->config_size)) {
-		ldm_crit ("The bitmaps are out of range.  Giving up.");
-		goto out;
+			((tb[0]->bitmap2_start + tb[0]->bitmap2_size) >
+			ph->config_size)) {
+		ldm_crit("The bitmaps are out of range.  Giving up.");
+		goto err;
 	}
-
-	if (!ldm_compare_tocblocks (tb[0], tb[1]) ||	/* Compare all tocs. */
-	    !ldm_compare_tocblocks (tb[0], tb[2]) ||
-	    !ldm_compare_tocblocks (tb[0], tb[3])) {
-		ldm_crit ("The TOCBLOCKs don't match.");
-		goto out;
+	/* Compare all loaded TOCBLOCKs. */
+	for (i = 1; i < nr_tbs; i++) {
+		if (!ldm_compare_tocblocks(tb[0], tb[i])) {
+			ldm_crit("TOCBLOCKs 0 and %d do not match.", i);
+			goto err;
+		}
 	}
-
-	ldm_debug ("Validated TOCBLOCKs successfully.");
+	ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs);
 	result = true;
-out:
-	kfree (tb[1]);
-	kfree (tb[2]);
-	kfree (tb[3]);
+err:
+	kfree(tb[1]);
 	return result;
 }
 
@@ -566,7 +569,7 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
 
 	p = (struct partition*)(data + 0x01BE);
 	for (i = 0; i < 4; i++, p++)
-		if (SYS_IND (p) == WIN2K_DYNAMIC_PARTITION) {
+		if (SYS_IND (p) == LDM_PARTITION) {
 			result = true;
 			break;
 		}
@@ -975,44 +978,68 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
  * Return:  'true'   @vb contains a Partition VBLK
  *          'false'  @vb contents are not defined
  */
-static bool ldm_parse_prt3 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
 {
 	int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len;
 	struct vblk_part *part;
 
-	BUG_ON (!buffer || !vb);
-
-	r_objid  = ldm_relative (buffer, buflen, 0x18, 0);
-	r_name   = ldm_relative (buffer, buflen, 0x18, r_objid);
-	r_size   = ldm_relative (buffer, buflen, 0x34, r_name);
-	r_parent = ldm_relative (buffer, buflen, 0x34, r_size);
-	r_diskid = ldm_relative (buffer, buflen, 0x34, r_parent);
-
+	BUG_ON(!buffer || !vb);
+	r_objid = ldm_relative(buffer, buflen, 0x18, 0);
+	if (r_objid < 0) {
+		ldm_error("r_objid %d < 0", r_objid);
+		return false;
+	}
+	r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
+	if (r_name < 0) {
+		ldm_error("r_name %d < 0", r_name);
+		return false;
+	}
+	r_size = ldm_relative(buffer, buflen, 0x34, r_name);
+	if (r_size < 0) {
+		ldm_error("r_size %d < 0", r_size);
+		return false;
+	}
+	r_parent = ldm_relative(buffer, buflen, 0x34, r_size);
+	if (r_parent < 0) {
+		ldm_error("r_parent %d < 0", r_parent);
+		return false;
+	}
+	r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent);
+	if (r_diskid < 0) {
+		ldm_error("r_diskid %d < 0", r_diskid);
+		return false;
+	}
 	if (buffer[0x12] & VBLK_FLAG_PART_INDEX) {
-		r_index = ldm_relative (buffer, buflen, 0x34, r_diskid);
+		r_index = ldm_relative(buffer, buflen, 0x34, r_diskid);
+		if (r_index < 0) {
+			ldm_error("r_index %d < 0", r_index);
+			return false;
+		}
 		len = r_index;
 	} else {
 		r_index = 0;
 		len = r_diskid;
 	}
-	if (len < 0)
+	if (len < 0) {
+		ldm_error("len %d < 0", len);
 		return false;
-
+	}
 	len += VBLK_SIZE_PRT3;
-	if (len != BE32 (buffer + 0x14))
+	if (len > BE32(buffer + 0x14)) {
+		ldm_error("len %d > BE32(buffer + 0x14) %d", len,
+				BE32(buffer + 0x14));
 		return false;
-
+	}
 	part = &vb->vblk.part;
-	part->start         = BE64         (buffer + 0x24 + r_name);
-	part->volume_offset = BE64         (buffer + 0x2C + r_name);
-	part->size          = ldm_get_vnum (buffer + 0x34 + r_name);
-	part->parent_id     = ldm_get_vnum (buffer + 0x34 + r_size);
-	part->disk_id       = ldm_get_vnum (buffer + 0x34 + r_parent);
+	part->start = BE64(buffer + 0x24 + r_name);
+	part->volume_offset = BE64(buffer + 0x2C + r_name);
+	part->size = ldm_get_vnum(buffer + 0x34 + r_name);
+	part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
+	part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
 	if (vb->flags & VBLK_FLAG_PART_INDEX)
 		part->partnum = buffer[0x35 + r_diskid];
 	else
 		part->partnum = 0;
-
 	return true;
 }
 
@@ -1475,4 +1502,3 @@ out:
 	kfree (ldb);
 	return result;
 }
-
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 6e8d7952b8b5..d2e6a3046939 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -2,10 +2,10 @@
  * ldm - Part of the Linux-NTFS project.
  *
  * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
- * Copyright (C) 2001      Anton Altaparmakov <aia21@cantab.net>
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
  *
- * Documentation is available at http://linux-ntfs.sf.net/ldm
+ * Documentation is available at http://www.linux-ntfs.org/content/view/19/37/
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -93,7 +93,7 @@ struct parsed_partitions;
 
 #define OFF_VMDB		17		/* List of partitions. */
 
-#define WIN2K_DYNAMIC_PARTITION	0x42		/* Formerly SFS (Landis). */
+#define LDM_PARTITION		0x42		/* Formerly SFS (Landis). */
 
 #define TOC_BITMAP1		"config"	/* Names of the two defined */
 #define TOC_BITMAP2		"log"		/* bitmaps in the TOCBLOCK. */
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
new file mode 100644
index 000000000000..4eba27b78643
--- /dev/null
+++ b/fs/partitions/sysv68.c
@@ -0,0 +1,92 @@
+/*
+ *  fs/partitions/sysv68.c
+ *
+ *  Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
+ */
+
+#include "check.h"
+#include "sysv68.h"
+
+/*
+ *	Volume ID structure: on first 256-bytes sector of disk
+ */
+
+struct volumeid {
+	u8	vid_unused[248];
+	u8	vid_mac[8];	/* ASCII string "MOTOROLA" */
+};
+
+/*
+ *	config block: second 256-bytes sector on disk
+ */
+
+struct dkconfig {
+	u8	ios_unused0[128];
+	__be32	ios_slcblk;	/* Slice table block number */
+	__be16	ios_slccnt;	/* Number of entries in slice table */
+	u8	ios_unused1[122];
+};
+
+/*
+ *	combined volumeid and dkconfig block
+ */
+
+struct dkblk0 {
+	struct volumeid dk_vid;
+	struct dkconfig dk_ios;
+};
+
+/*
+ *	Slice Table Structure
+ */
+
+struct slice {
+	__be32	nblocks;		/* slice size (in blocks) */
+	__be32	blkoff;			/* block offset of slice */
+};
+
+
+int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
+{
+	int i, slices;
+	int slot = 1;
+	Sector sect;
+	unsigned char *data;
+	struct dkblk0 *b;
+	struct slice *slice;
+
+	data = read_dev_sector(bdev, 0, &sect);
+	if (!data)
+		return -1;
+
+	b = (struct dkblk0 *)data;
+	if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
+		put_dev_sector(sect);
+		return 0;
+	}
+	slices = be16_to_cpu(b->dk_ios.ios_slccnt);
+	i = be32_to_cpu(b->dk_ios.ios_slcblk);
+	put_dev_sector(sect);
+
+	data = read_dev_sector(bdev, i, &sect);
+	if (!data)
+		return -1;
+
+	slices -= 1; /* last slice is the whole disk */
+	printk("sysV68: %s(s%u)", state->name, slices);
+	slice = (struct slice *)data;
+	for (i = 0; i < slices; i++, slice++) {
+		if (slot == state->limit)
+			break;
+		if (be32_to_cpu(slice->nblocks)) {
+			put_partition(state, slot,
+				be32_to_cpu(slice->blkoff),
+				be32_to_cpu(slice->nblocks));
+			printk("(s%u)", i);
+		}
+		slot++;
+	}
+	printk("\n");
+	put_dev_sector(sect);
+	return 1;
+}
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
new file mode 100644
index 000000000000..fa733f68431b
--- /dev/null
+++ b/fs/partitions/sysv68.h
@@ -0,0 +1 @@
+extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev);
diff --git a/fs/pipe.c b/fs/pipe.c
index ebafde7d6aba..3a89592bdf57 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -841,8 +841,18 @@ static int pipefs_delete_dentry(struct dentry *dentry)
 	return 0;
 }
 
+/*
+ * pipefs_dname() is called from d_path().
+ */
+static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+				dentry->d_inode->i_ino);
+}
+
 static struct dentry_operations pipefs_dentry_operations = {
 	.d_delete	= pipefs_delete_dentry,
+	.d_dname	= pipefs_dname,
 };
 
 static struct inode * get_pipe_inode(void)
@@ -888,8 +898,7 @@ struct file *create_write_pipe(void)
 	struct inode *inode;
 	struct file *f;
 	struct dentry *dentry;
-	char name[32];
-	struct qstr this;
+	struct qstr name = { .name = "" };
 
 	f = get_empty_filp();
 	if (!f)
@@ -899,11 +908,8 @@ struct file *create_write_pipe(void)
 	if (!inode)
 		goto err_file;
 
-	this.len = sprintf(name, "[%lu]", inode->i_ino);
-	this.name = name;
-	this.hash = 0;
 	err = -ENOMEM;
-	dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
+	dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
 	if (!dentry)
 		goto err_inode;
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 56aacead8362..89940f243fc2 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -59,7 +59,7 @@ static int do_make_slave(struct vfsmount *mnt)
 	} else {
 		struct list_head *p = &mnt->mnt_slave_list;
 		while (!list_empty(p)) {
-                        slave_mnt = list_entry(p->next,
+                        slave_mnt = list_first_entry(p,
 					struct vfsmount, mnt_slave);
 			list_del_init(&slave_mnt->mnt_slave);
 			slave_mnt->mnt_master = NULL;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 07c9cdbcdcac..74f30e0c0381 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -410,9 +410,9 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
-	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %u %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
 		task->pid,
 		tcomm,
 		state,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989af5e55d1b..a5fa1fdafc4e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -61,9 +61,9 @@
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
+#include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
@@ -90,8 +90,8 @@
 #define PROC_NUMBUF 13
 
 struct pid_entry {
-	int len;
 	char *name;
+	int len;
 	mode_t mode;
 	const struct inode_operations *iop;
 	const struct file_operations *fop;
@@ -99,8 +99,8 @@ struct pid_entry {
 };
 
 #define NOD(NAME, MODE, IOP, FOP, OP) {			\
-	.len  = sizeof(NAME) - 1,			\
 	.name = (NAME),					\
+	.len  = sizeof(NAME) - 1,			\
 	.mode = MODE,					\
 	.iop  = IOP,					\
 	.fop  = FOP,					\
@@ -123,6 +123,9 @@ struct pid_entry {
 		NULL, &proc_info_file_operations,	\
 		{ .proc_read = &proc_##OTYPE } )
 
+int maps_protect;
+EXPORT_SYMBOL(maps_protect);
+
 static struct fs_struct *get_fs_struct(struct task_struct *task)
 {
 	struct fs_struct *fs;
@@ -275,17 +278,15 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
  */
 static int proc_pid_wchan(struct task_struct *task, char *buffer)
 {
-	char *modname;
-	const char *sym_name;
-	unsigned long wchan, size, offset;
-	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long wchan;
+	char symname[KSYM_NAME_LEN+1];
 
 	wchan = get_wchan(task);
 
-	sym_name = kallsyms_lookup(wchan, &size, &offset, &modname, namebuf);
-	if (sym_name)
-		return sprintf(buffer, "%s", sym_name);
-	return sprintf(buffer, "%lu", wchan);
+	if (lookup_symbol_name(wchan, symname) < 0)
+		return sprintf(buffer, "%lu", wchan);
+	else
+		return sprintf(buffer, "%s", symname);
 }
 #endif /* CONFIG_KALLSYMS */
 
@@ -310,7 +311,9 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
+	read_lock(&tasklist_lock);
 	points = badness(task, uptime.tv_sec);
+	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
 
@@ -344,11 +347,8 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error) {
-		error = security_inode_setattr(dentry, attr);
-		if (!error)
-			error = inode_setattr(inode, attr);
-	}
+	if (!error)
+		error = inode_setattr(inode, attr);
 	return error;
 }
 
@@ -660,7 +660,6 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 	char buffer[PROC_NUMBUF];
 	size_t len;
 	int oom_adjust;
-	loff_t __ppos = *ppos;
 
 	if (!task)
 		return -ESRCH;
@@ -668,14 +667,8 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 	put_task_struct(task);
 
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
-	if (__ppos >= len)
-		return 0;
-	if (count > len-__ppos)
-		count = len-__ppos;
-	if (copy_to_user(buf, buffer + __ppos, count))
-		return -EFAULT;
-	*ppos = __ppos + count;
-	return count;
+
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
@@ -715,6 +708,42 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.write		= oom_adjust_write,
 };
 
+#ifdef CONFIG_MMU
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF], *end;
+	struct mm_struct *mm;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	if (!simple_strtol(buffer, &end, 0))
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	mm = get_task_mm(task);
+	if (mm) {
+		clear_refs_smap(mm);
+		mmput(mm);
+	}
+	put_task_struct(task);
+	if (end - buffer == 0)
+		return -EIO;
+	return end - buffer;
+}
+
+static struct file_operations proc_clear_refs_operations = {
+	.write		= clear_refs_write,
+};
+#endif
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -789,7 +818,6 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
 {
 	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20];
-	loff_t __ppos = *ppos;
 	size_t len;
 
 	if (!tsk)
@@ -797,14 +825,8 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
 	/* no need to print the trailing zero, so use only len */
 	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
 	put_task_struct(tsk);
-	if (__ppos >= len)
-		return 0;
-	if (count > len - __ppos)
-		count = len - __ppos;
-	if (copy_to_user(buf, __buf + __ppos, count))
-		return -EFAULT;
-	*ppos = __ppos + count;
-	return count;
+
+	return simple_read_from_buffer(buf, count, ppos, __buf, len);
 }
 
 static ssize_t seccomp_write(struct file *file, const char __user *buf,
@@ -863,7 +885,6 @@ static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
 	char buffer[PROC_NUMBUF];
 	size_t len;
 	int make_it_fail;
-	loff_t __ppos = *ppos;
 
 	if (!task)
 		return -ESRCH;
@@ -871,14 +892,8 @@ static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
 	put_task_struct(task);
 
 	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
-	if (__ppos >= len)
-		return 0;
-	if (count > len-__ppos)
-		count = len-__ppos;
-	if (copy_to_user(buf, buffer + __ppos, count))
-		return -EFAULT;
-	*ppos = __ppos + count;
-	return count;
+
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
 static ssize_t proc_fault_inject_write(struct file * file,
@@ -941,7 +956,7 @@ static int do_proc_readlink(struct dentry *dentry, struct vfsmount *mnt,
 
 	if (!tmp)
 		return -ENOMEM;
-		
+
 	inode = dentry->d_inode;
 	path = d_path(dentry, mnt, tmp, PAGE_SIZE);
 	len = PTR_ERR(path);
@@ -1121,7 +1136,8 @@ static struct dentry_operations pid_dentry_operations =
 
 /* Lookups */
 
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *, struct task_struct *, void *);
+typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
+				struct task_struct *, const void *);
 
 /*
  * Fill a directory entry.
@@ -1137,7 +1153,7 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, struct tas
  */
 static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	char *name, int len,
-	instantiate_t instantiate, struct task_struct *task, void *ptr)
+	instantiate_t instantiate, struct task_struct *task, const void *ptr)
 {
 	struct dentry *child, *dir = filp->f_path.dentry;
 	struct inode *inode;
@@ -1199,7 +1215,10 @@ out:
 	return ~0U;
 }
 
-static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+#define PROC_FDINFO_MAX 64
+
+static int proc_fd_info(struct inode *inode, struct dentry **dentry,
+			struct vfsmount **mnt, char *info)
 {
 	struct task_struct *task = get_proc_task(inode);
 	struct files_struct *files = NULL;
@@ -1218,8 +1237,16 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 		spin_lock(&files->file_lock);
 		file = fcheck_files(files, fd);
 		if (file) {
-			*mnt = mntget(file->f_path.mnt);
-			*dentry = dget(file->f_path.dentry);
+			if (mnt)
+				*mnt = mntget(file->f_path.mnt);
+			if (dentry)
+				*dentry = dget(file->f_path.dentry);
+			if (info)
+				snprintf(info, PROC_FDINFO_MAX,
+					 "pos:\t%lli\n"
+					 "flags:\t0%o\n",
+					 (long long) file->f_pos,
+					 file->f_flags);
 			spin_unlock(&files->file_lock);
 			put_files_struct(files);
 			return 0;
@@ -1230,6 +1257,12 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 	return -ENOENT;
 }
 
+static int proc_fd_link(struct inode *inode, struct dentry **dentry,
+			struct vfsmount **mnt)
+{
+	return proc_fd_info(inode, dentry, mnt, NULL);
+}
+
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1272,9 +1305,9 @@ static struct dentry_operations tid_fd_dentry_operations =
 };
 
 static struct dentry *proc_fd_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, void *ptr)
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
-	unsigned fd = *(unsigned *)ptr;
+	unsigned fd = *(const unsigned *)ptr;
 	struct file *file;
 	struct files_struct *files;
  	struct inode *inode;
@@ -1325,7 +1358,9 @@ out_iput:
 	goto out;
 }
 
-static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+					   struct dentry *dentry,
+					   instantiate_t instantiate)
 {
 	struct task_struct *task = get_proc_task(dir);
 	unsigned fd = name_to_int(dentry);
@@ -1336,23 +1371,15 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (fd == ~0U)
 		goto out;
 
-	result = proc_fd_instantiate(dir, dentry, task, &fd);
+	result = instantiate(dir, dentry, task, &fd);
 out:
 	put_task_struct(task);
 out_no_task:
 	return result;
 }
 
-static int proc_fd_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct task_struct *task, int fd)
-{
-	char name[PROC_NUMBUF];
-	int len = snprintf(name, sizeof(name), "%d", fd);
-	return proc_fill_cache(filp, dirent, filldir, name, len,
-				proc_fd_instantiate, task, &fd);
-}
-
-static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_readfd_common(struct file * filp, void * dirent,
+			      filldir_t filldir, instantiate_t instantiate)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
@@ -1388,12 +1415,17 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			for (fd = filp->f_pos-2;
 			     fd < fdt->max_fds;
 			     fd++, filp->f_pos++) {
+				char name[PROC_NUMBUF];
+				int len;
 
 				if (!fcheck_files(files, fd))
 					continue;
 				rcu_read_unlock();
 
-				if (proc_fd_fill_cache(filp, dirent, filldir, p, fd) < 0) {
+				len = snprintf(name, sizeof(name), "%d", fd);
+				if (proc_fill_cache(filp, dirent, filldir,
+						    name, len, instantiate,
+						    p, &fd) < 0) {
 					rcu_read_lock();
 					break;
 				}
@@ -1408,23 +1440,119 @@ out_no_task:
 	return retval;
 }
 
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
+				    struct nameidata *nd)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
+}
+
+static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+}
+
+static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
+				      size_t len, loff_t *ppos)
+{
+	char tmp[PROC_FDINFO_MAX];
+	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, NULL, tmp);
+	if (!err)
+		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
+	return err;
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+	.open		= nonseekable_open,
+	.read		= proc_fdinfo_read,
+};
+
 static const struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_readfd,
 };
 
 /*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+static int proc_fd_permission(struct inode *inode, int mask,
+				struct nameidata *nd)
+{
+	int rv;
+
+	rv = generic_permission(inode, mask, NULL);
+	if (rv == 0)
+		return 0;
+	if (task_pid(current) == proc_pid(inode))
+		rv = 0;
+	return rv;
+}
+
+/*
  * proc directories can do almost nothing..
  */
 static const struct inode_operations proc_fd_inode_operations = {
 	.lookup		= proc_lookupfd,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+	unsigned fd = *(unsigned *)ptr;
+ 	struct inode *inode;
+ 	struct proc_inode *ei;
+	struct dentry *error = ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+	ei = PROC_I(inode);
+	ei->fd = fd;
+	inode->i_mode = S_IFREG | S_IRUSR;
+	inode->i_fop = &proc_fdinfo_file_operations;
+	dentry->d_op = &tid_fd_dentry_operations;
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, NULL))
+		error = NULL;
+
+ out:
+	return error;
+}
+
+static struct dentry *proc_lookupfdinfo(struct inode *dir,
+					struct dentry *dentry,
+					struct nameidata *nd)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
+}
+
+static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return proc_readfd_common(filp, dirent, filldir,
+				  proc_fdinfo_instantiate);
+}
+
+static const struct file_operations proc_fdinfo_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_readfdinfo,
+};
+
+/*
+ * proc directories can do almost nothing..
+ */
+static const struct inode_operations proc_fdinfo_inode_operations = {
+	.lookup		= proc_lookupfdinfo,
 	.setattr	= proc_setattr,
 };
 
+
 static struct dentry *proc_pident_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, void *ptr)
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
-	struct pid_entry *p = ptr;
+	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct dentry *error = ERR_PTR(-EINVAL);
@@ -1453,13 +1581,13 @@ out:
 
 static struct dentry *proc_pident_lookup(struct inode *dir, 
 					 struct dentry *dentry,
-					 struct pid_entry *ents,
+					 const struct pid_entry *ents,
 					 unsigned int nents)
 {
 	struct inode *inode;
 	struct dentry *error;
 	struct task_struct *task = get_proc_task(dir);
-	struct pid_entry *p, *last;
+	const struct pid_entry *p, *last;
 
 	error = ERR_PTR(-ENOENT);
 	inode = NULL;
@@ -1488,8 +1616,8 @@ out_no_task:
 	return error;
 }
 
-static int proc_pident_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct task_struct *task, struct pid_entry *p)
+static int proc_pident_fill_cache(struct file *filp, void *dirent,
+	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
 {
 	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
 				proc_pident_instantiate, task, p);
@@ -1497,14 +1625,14 @@ static int proc_pident_fill_cache(struct file *filp, void *dirent, filldir_t fil
 
 static int proc_pident_readdir(struct file *filp,
 		void *dirent, filldir_t filldir,
-		struct pid_entry *ents, unsigned int nents)
+		const struct pid_entry *ents, unsigned int nents)
 {
 	int i;
 	int pid;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task = get_proc_task(inode);
-	struct pid_entry *p, *last;
+	const struct pid_entry *p, *last;
 	ino_t ino;
 	int ret;
 
@@ -1619,7 +1747,7 @@ static const struct file_operations proc_pid_attr_operations = {
 	.write		= proc_pid_attr_write,
 };
 
-static struct pid_entry attr_dir_stuff[] = {
+static const struct pid_entry attr_dir_stuff[] = {
 	REG("current",    S_IRUGO|S_IWUGO, pid_attr),
 	REG("prev",       S_IRUGO,	   pid_attr),
 	REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
@@ -1685,7 +1813,7 @@ static const struct inode_operations proc_self_inode_operations = {
  * that properly belong to the /proc filesystem, as they describe
  * describe something that is process related.
  */
-static struct pid_entry proc_base_stuff[] = {
+static const struct pid_entry proc_base_stuff[] = {
 	NOD("self", S_IFLNK|S_IRWXUGO,
 		&proc_self_inode_operations, NULL, {}),
 };
@@ -1714,9 +1842,9 @@ static struct dentry_operations proc_base_dentry_operations =
 };
 
 static struct dentry *proc_base_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, void *ptr)
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
-	struct pid_entry *p = ptr;
+	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct dentry *error = ERR_PTR(-EINVAL);
@@ -1764,7 +1892,7 @@ static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
 {
 	struct dentry *error;
 	struct task_struct *task = get_proc_task(dir);
-	struct pid_entry *p, *last;
+	const struct pid_entry *p, *last;
 
 	error = ERR_PTR(-ENOENT);
 
@@ -1790,8 +1918,8 @@ out_no_task:
 	return error;
 }
 
-static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct task_struct *task, struct pid_entry *p)
+static int proc_base_fill_cache(struct file *filp, void *dirent,
+	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
 {
 	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
 				proc_base_instantiate, task, p);
@@ -1828,9 +1956,10 @@ static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
 static const struct file_operations proc_task_operations;
 static const struct inode_operations proc_task_inode_operations;
 
-static struct pid_entry tgid_base_stuff[] = {
+static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, task),
 	DIR("fd",         S_IRUSR|S_IXUSR, fd),
+	DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
 	INF("environ",    S_IRUSR, pid_environ),
 	INF("auxv",       S_IRUSR, pid_auxv),
 	INF("status",     S_IRUGO, pid_status),
@@ -1851,6 +1980,7 @@ static struct pid_entry tgid_base_stuff[] = {
 	REG("mounts",     S_IRUGO, mounts),
 	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_MMU
+	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",      S_IRUGO, smaps),
 #endif
 #ifdef CONFIG_SECURITY
@@ -1970,7 +2100,7 @@ out:
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
 					   struct dentry * dentry,
-					   struct task_struct *task, void *ptr)
+					   struct task_struct *task, const void *ptr)
 {
 	struct dentry *error = ERR_PTR(-ENOENT);
 	struct inode *inode;
@@ -1983,7 +2113,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
 	inode->i_op = &proc_tgid_base_inode_operations;
 	inode->i_fop = &proc_tgid_base_operations;
 	inode->i_flags|=S_IMMUTABLE;
-	inode->i_nlink = 4;
+	inode->i_nlink = 5;
 #ifdef CONFIG_SECURITY
 	inode->i_nlink += 1;
 #endif
@@ -2085,7 +2215,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		goto out_no_task;
 
 	for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
-		struct pid_entry *p = &proc_base_stuff[nr];
+		const struct pid_entry *p = &proc_base_stuff[nr];
 		if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
 			goto out;
 	}
@@ -2111,8 +2241,9 @@ out_no_task:
 /*
  * Tasks
  */
-static struct pid_entry tid_base_stuff[] = {
+static const struct pid_entry tid_base_stuff[] = {
 	DIR("fd",        S_IRUSR|S_IXUSR, fd),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
 	INF("environ",   S_IRUSR, pid_environ),
 	INF("auxv",      S_IRUSR, pid_auxv),
 	INF("status",    S_IRUGO, pid_status),
@@ -2132,6 +2263,7 @@ static struct pid_entry tid_base_stuff[] = {
 	LNK("exe",       exe),
 	REG("mounts",    S_IRUGO, mounts),
 #ifdef CONFIG_MMU
+	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps",     S_IRUGO, smaps),
 #endif
 #ifdef CONFIG_SECURITY
@@ -2180,7 +2312,7 @@ static const struct inode_operations proc_tid_base_inode_operations = {
 };
 
 static struct dentry *proc_task_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, void *ptr)
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
 	struct dentry *error = ERR_PTR(-ENOENT);
 	struct inode *inode;
@@ -2192,7 +2324,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
 	inode->i_op = &proc_tid_base_inode_operations;
 	inode->i_fop = &proc_tid_base_operations;
 	inode->i_flags|=S_IMMUTABLE;
-	inode->i_nlink = 3;
+	inode->i_nlink = 4;
 #ifdef CONFIG_SECURITY
 	inode->i_nlink += 1;
 #endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 775fb21294d8..8a40e15f5ecb 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -398,6 +398,7 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
 			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
 				unsigned int ino = de->low_ino;
 
+				de_get(de);
 				spin_unlock(&proc_subdir_lock);
 				error = -EINVAL;
 				inode = proc_get_inode(dir->i_sb, ino, de);
@@ -414,6 +415,7 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
 		d_add(dentry, inode);
 		return NULL;
 	}
+	de_put(de);
 	return ERR_PTR(error);
 }
 
@@ -476,14 +478,21 @@ int proc_readdir(struct file * filp,
 			}
 
 			do {
+				struct proc_dir_entry *next;
+
 				/* filldir passes info to user space */
+				de_get(de);
 				spin_unlock(&proc_subdir_lock);
 				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
-					    de->low_ino, de->mode >> 12) < 0)
+					    de->low_ino, de->mode >> 12) < 0) {
+					de_put(de);
 					goto out;
+				}
 				spin_lock(&proc_subdir_lock);
 				filp->f_pos++;
-				de = de->next;
+				next = de->next;
+				de_put(de);
+				de = next;
 			} while (de);
 			spin_unlock(&proc_subdir_lock);
 	}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c372eb151a3a..d5ce65c68d7b 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -21,7 +21,7 @@
 
 #include "internal.h"
 
-static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de)
+struct proc_dir_entry *de_get(struct proc_dir_entry *de)
 {
 	if (de)
 		atomic_inc(&de->count);
@@ -31,7 +31,7 @@ static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de)
 /*
  * Decrements the use count and checks for deferred deletion.
  */
-static void de_put(struct proc_dir_entry *de)
+void de_put(struct proc_dir_entry *de)
 {
 	if (de) {	
 		lock_kernel();		
@@ -109,9 +109,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 int __init proc_init_inodecache(void)
@@ -147,13 +145,6 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 {
 	struct inode * inode;
 
-	/*
-	 * Increment the use count so the dir entry can't disappear.
-	 */
-	de_get(de);
-
-	WARN_ON(de && de->deleted);
-
 	if (de != NULL && !try_module_get(de->owner))
 		goto out_mod;
 
@@ -185,7 +176,6 @@ out_ino:
 	if (de != NULL)
 		module_put(de->owner);
 out_mod:
-	de_put(de);
 	return NULL;
 }			
 
@@ -200,6 +190,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 	s->s_op = &proc_sops;
 	s->s_time_gran = 1;
 	
+	de_get(&proc_root);
 	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
 	if (!root_inode)
 		goto out_no_root;
@@ -213,6 +204,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 out_no_root:
 	printk("proc_read_super: get root inode failed\n");
 	iput(root_inode);
+	de_put(&proc_root);
 	return -ENOMEM;
 }
 MODULE_LICENSE("GPL");
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f771889183c3..b215c3524fa6 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,6 +37,8 @@ do {						\
 extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
 #endif
 
+extern int maps_protect;
+
 extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f);
 extern int proc_exe_link(struct inode *, struct dentry **, struct vfsmount **);
 extern int proc_tid_stat(struct task_struct *,  char *);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index abdf068bc27f..eca471bc8512 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -38,7 +38,7 @@ static int property_read_proc(char *page, char **start, off_t off,
 		n = count;
 	else
 		*eof = 1;
-	memcpy(page, pp->value + off, n);
+	memcpy(page, (char *)pp->value + off, n);
 	*start = page;
 	return n;
 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e2c4c0a5c90d..5fd49e47f83a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -35,7 +35,6 @@
 #include <linux/signal.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/seq_file.h>
 #include <linux/times.h>
 #include <linux/profile.h>
@@ -398,8 +397,6 @@ static const struct file_operations proc_modules_operations = {
 #endif
 
 #ifdef CONFIG_SLAB
-extern struct seq_operations slabinfo_op;
-extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *);
 static int slabinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &slabinfo_op);
@@ -431,18 +428,11 @@ static int slabstats_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static int slabstats_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *m = file->private_data;
-	kfree(m->private);
-	return seq_release(inode, file);
-}
-
 static const struct file_operations proc_slabstats_operations = {
 	.open		= slabstats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= slabstats_release,
+	.release	= seq_release_private,
 };
 #endif
 #endif
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 20e8cbb34364..680c429bfa22 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -429,11 +429,8 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error) {
-		error = security_inode_setattr(dentry, attr);
-		if (!error)
-			error = inode_setattr(inode, attr);
-	}
+	if (!error)
+		error = inode_setattr(inode, attr);
 
 	return error;
 }
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c1bbfbeb035e..b3a473b0a191 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -108,6 +108,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	struct list_head *p;
 	loff_t l = *pos;
+
+	mutex_lock(&tty_mutex);
 	list_for_each(p, &tty_drivers)
 		if (!l--)
 			return list_entry(p, struct tty_driver, tty_drivers);
@@ -124,6 +126,7 @@ static void *t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void t_stop(struct seq_file *m, void *v)
 {
+	mutex_unlock(&tty_mutex);
 }
 
 static struct seq_operations tty_drivers_op = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7445980c8022..c24d81a5a040 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -3,6 +3,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
+#include <linux/ptrace.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 
@@ -120,6 +121,14 @@ struct mem_size_stats
 	unsigned long shared_dirty;
 	unsigned long private_clean;
 	unsigned long private_dirty;
+	unsigned long referenced;
+};
+
+struct pmd_walker {
+	struct vm_area_struct *vma;
+	void *private;
+	void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
+		       unsigned long, void *);
 };
 
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
@@ -134,6 +143,9 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 	dev_t dev = 0;
 	int len;
 
+	if (maps_protect && !ptrace_may_attach(task))
+		return -EACCES;
+
 	if (file) {
 		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
@@ -181,18 +193,20 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 
 	if (mss)
 		seq_printf(m,
-			   "Size:          %8lu kB\n"
-			   "Rss:           %8lu kB\n"
-			   "Shared_Clean:  %8lu kB\n"
-			   "Shared_Dirty:  %8lu kB\n"
-			   "Private_Clean: %8lu kB\n"
-			   "Private_Dirty: %8lu kB\n",
+			   "Size:           %8lu kB\n"
+			   "Rss:            %8lu kB\n"
+			   "Shared_Clean:   %8lu kB\n"
+			   "Shared_Dirty:   %8lu kB\n"
+			   "Private_Clean:  %8lu kB\n"
+			   "Private_Dirty:  %8lu kB\n"
+			   "Referenced:     %8lu kB\n",
 			   (vma->vm_end - vma->vm_start) >> 10,
 			   mss->resident >> 10,
 			   mss->shared_clean  >> 10,
 			   mss->shared_dirty  >> 10,
 			   mss->private_clean >> 10,
-			   mss->private_dirty >> 10);
+			   mss->private_dirty >> 10,
+			   mss->referenced >> 10);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -205,15 +219,16 @@ static int show_map(struct seq_file *m, void *v)
 }
 
 static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+			    unsigned long addr, unsigned long end,
+			    void *private)
 {
+	struct mem_size_stats *mss = private;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
 		if (!pte_present(ptent))
 			continue;
@@ -224,6 +239,9 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 
+		/* Accumulate the size in pages that have been accessed. */
+		if (pte_young(ptent) || PageReferenced(page))
+			mss->referenced += PAGE_SIZE;
 		if (page_mapcount(page) >= 2) {
 			if (pte_dirty(ptent))
 				mss->shared_dirty += PAGE_SIZE;
@@ -235,57 +253,99 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			else
 				mss->private_clean += PAGE_SIZE;
 		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 }
 
-static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				 unsigned long addr, unsigned long end,
+				 void *private)
+{
+	pte_t *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/* Clear accessed and referenced bits. */
+		ptep_test_and_clear_young(vma, addr, pte);
+		ClearPageReferenced(page);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+}
+
+static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
+				  unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
 
-	pmd = pmd_offset(pud, addr);
-	do {
+	for (pmd = pmd_offset(pud, addr); addr != end;
+	     pmd++, addr = next) {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		smaps_pte_range(vma, pmd, addr, next, mss);
-	} while (pmd++, addr = next, addr != end);
+		walker->action(walker->vma, pmd, addr, next, walker->private);
+	}
 }
 
-static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
+				  unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
 
-	pud = pud_offset(pgd, addr);
-	do {
+	for (pud = pud_offset(pgd, addr); addr != end;
+	     pud++, addr = next) {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		smaps_pmd_range(vma, pud, addr, next, mss);
-	} while (pud++, addr = next, addr != end);
+		walk_pmd_range(walker, pud, addr, next);
+	}
 }
 
-static inline void smaps_pgd_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end,
-				struct mem_size_stats *mss)
+/*
+ * walk_page_range - walk the page tables of a VMA with a callback
+ * @vma - VMA to walk
+ * @action - callback invoked for every bottom-level (PTE) page table
+ * @private - private data passed to the callback function
+ *
+ * Recursively walk the page table for the memory area in a VMA, calling
+ * a callback for every bottom-level (PTE) page table.
+ */
+static inline void walk_page_range(struct vm_area_struct *vma,
+				   void (*action)(struct vm_area_struct *,
+						  pmd_t *, unsigned long,
+						  unsigned long, void *),
+				   void *private)
 {
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	struct pmd_walker walker = {
+		.vma		= vma,
+		.private	= private,
+		.action		= action,
+	};
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
+	for (pgd = pgd_offset(vma->vm_mm, addr); addr != end;
+	     pgd++, addr = next) {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		smaps_pud_range(vma, pgd, addr, next, mss);
-	} while (pgd++, addr = next, addr != end);
+		walk_pud_range(&walker, pgd, addr, next);
+	}
 }
 
 static int show_smap(struct seq_file *m, void *v)
@@ -295,10 +355,22 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
+		walk_page_range(vma, smaps_pte_range, &mss);
 	return show_map_internal(m, v, &mss);
 }
 
+void clear_refs_smap(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		if (vma->vm_mm && !is_vm_hugetlb_page(vma))
+			walk_page_range(vma, clear_refs_pte_range, NULL);
+	flush_tlb_mm(mm);
+	up_read(&mm->mmap_sem);
+}
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
@@ -444,11 +516,22 @@ const struct file_operations proc_maps_operations = {
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
+static int show_numa_map_checked(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
+
+	if (maps_protect && !ptrace_may_attach(task))
+		return -EACCES;
+
+	return show_numa_map(m, v);
+}
+
 static struct seq_operations proc_pid_numa_maps_op = {
         .start  = m_start,
         .next   = m_next,
         .stop   = m_stop,
-        .show   = show_numa_map
+        .show   = show_numa_map_checked
 };
 
 static int numa_maps_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7cddf6b8635a..d8b8c7183c24 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/file.h>
 #include <linux/mount.h>
+#include <linux/ptrace.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
@@ -143,6 +144,12 @@ out:
 static int show_map(struct seq_file *m, void *_vml)
 {
 	struct vm_list_struct *vml = _vml;
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
+
+	if (maps_protect && !ptrace_may_attach(task))
+		return -EACCES;
+
 	return nommu_vma_show(m, vml->vma);
 }
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index d96050728c43..523e1098ae88 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -514,7 +514,7 @@ static int __init parse_crash_elf64_headers(void)
 	/* Do some basic Verification. */
 	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
 		(ehdr.e_type != ET_CORE) ||
-		!elf_check_arch(&ehdr) ||
+		!vmcore_elf_check_arch(&ehdr) ||
 		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
 		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
 		ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 83bc8e7824cd..8d256eb11813 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -536,9 +536,7 @@ static void init_once(void *foo, struct kmem_cache * cachep,
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int init_inodecache(void)
diff --git a/fs/quota.c b/fs/quota.c
index b9dae76a0b6e..9f237d6182c9 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -11,7 +11,6 @@
 #include <asm/current.h>
 #include <asm/uaccess.h>
 #include <linux/kernel.h>
-#include <linux/smp_lock.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
@@ -158,7 +157,6 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t
 static void quota_sync_sb(struct super_block *sb, int type)
 {
 	int cnt;
-	struct inode *discard[MAXQUOTAS];
 
 	sb->s_qcop->quota_sync(sb, type);
 	/* This is not very clever (and fast) but currently I don't know about
@@ -168,29 +166,21 @@ static void quota_sync_sb(struct super_block *sb, int type)
 		sb->s_op->sync_fs(sb, 1);
 	sync_blockdev(sb->s_bdev);
 
-	/* Now when everything is written we can discard the pagecache so
-	 * that userspace sees the changes. We need i_mutex and so we could
-	 * not do it inside dqonoff_mutex. Moreover we need to be carefull
-	 * about races with quotaoff() (that is the reason why we have own
-	 * reference to inode). */
+	/*
+	 * Now when everything is written we can discard the pagecache so
+	 * that userspace sees the changes.
+	 */
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		discard[cnt] = NULL;
 		if (type != -1 && cnt != type)
 			continue;
 		if (!sb_has_quota_enabled(sb, cnt))
 			continue;
-		discard[cnt] = igrab(sb_dqopt(sb)->files[cnt]);
+		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
+		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
+		mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
 	}
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (discard[cnt]) {
-			mutex_lock(&discard[cnt]->i_mutex);
-			truncate_inode_pages(&discard[cnt]->i_data, 0);
-			mutex_unlock(&discard[cnt]->i_mutex);
-			iput(discard[cnt]);
-		}
-	}
 }
 
 void sync_dquots(struct super_block *sb, int type)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index d3fd7c6732d2..9345a46ffb32 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -16,7 +16,6 @@
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
 #include <linux/quotaops.h>
@@ -180,7 +179,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
 			return ret;
 	}
 
-	ret = vmtruncate(inode, size);
+	ret = vmtruncate(inode, newsize);
 
 	return ret;
 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index ff1f7639707b..d40d22b347b7 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -30,10 +30,9 @@
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
-
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 1f8dc373ede7..4d03008f015b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -37,10 +37,10 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 
 	mutex_lock(&inode->i_mutex);
 	switch (origin) {
-		case 2:
+		case SEEK_END:
 			offset += inode->i_size;
 			break;
-		case 1:
+		case SEEK_CUR:
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -63,10 +63,10 @@ loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 
 	lock_kernel();
 	switch (origin) {
-		case 2:
+		case SEEK_END:
 			offset += i_size_read(file->f_path.dentry->d_inode);
 			break;
-		case 1:
+		case SEEK_CUR:
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -94,10 +94,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 
 	lock_kernel();
 	switch (origin) {
-		case 2:
+		case SEEK_END:
 			offset += i_size_read(file->f_path.dentry->d_inode);
 			break;
-		case 1:
+		case SEEK_CUR:
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -139,7 +139,7 @@ asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
 		goto bad;
 
 	retval = -EINVAL;
-	if (origin <= 2) {
+	if (origin <= SEEK_MAX) {
 		loff_t res = vfs_llseek(file, offset, origin);
 		retval = res;
 		if (res != (loff_t)retval)
@@ -166,7 +166,7 @@ asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
 		goto bad;
 
 	retval = -EINVAL;
-	if (origin > 2)
+	if (origin > SEEK_MAX)
 		goto out_putf;
 
 	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
diff --git a/fs/readdir.c b/fs/readdir.c
index f39f5b313252..efe52e676577 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -4,13 +4,13 @@
  *  Copyright (C) 1995  Linus Torvalds
  */
 
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
 #include <linux/file.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/dirent.h>
 #include <linux/security.h>
@@ -52,7 +52,6 @@ EXPORT_SYMBOL(vfs_readdir);
  * case (the low-level handlers don't need to care about this).
  */
 #define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-#define ROUND_UP(x) (((x)+sizeof(long)-1) & ~(sizeof(long)-1))
 
 #ifdef __ARCH_WANT_OLD_READDIR
 
@@ -147,7 +146,7 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
 	struct linux_dirent __user * dirent;
 	struct getdents_callback * buf = (struct getdents_callback *) __buf;
 	unsigned long d_ino;
-	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
+	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -220,8 +219,6 @@ out:
 	return error;
 }
 
-#define ROUND_UP64(x) (((x)+sizeof(u64)-1) & ~(sizeof(u64)-1))
-
 struct getdents_callback64 {
 	struct linux_dirent64 __user * current_dir;
 	struct linux_dirent64 __user * previous;
@@ -234,7 +231,7 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
 {
 	struct linux_dirent64 __user *dirent;
 	struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
-	int reclen = ROUND_UP64(NAME_OFFSET(dirent) + namlen + 1);
+	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 96a2f8889da3..ffbfc2caaf20 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -7,11 +7,10 @@
 #include <linux/fs.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <asm/uaccess.h>
 
-extern struct reiserfs_key MIN_KEY;
+extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
 static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index abfada2f52db..9e451a68580f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -6,7 +6,6 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
-#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
@@ -1060,20 +1059,12 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 	   maping blocks, since there is none, so we just zero out remaining
 	   parts of first and last pages in write area (if needed) */
 	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
-		if (from != 0) {	/* First page needs to be partially zeroed */
-			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
-			memset(kaddr, 0, from);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(prepared_pages[0]);
-		}
-		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
-			char *kaddr =
-			    kmap_atomic(prepared_pages[num_pages - 1],
-					KM_USER0);
-			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
-			kunmap_atomic(kaddr, KM_USER0);
-			flush_dcache_page(prepared_pages[num_pages - 1]);
-		}
+		if (from != 0)		/* First page needs to be partially zeroed */
+			zero_user_page(prepared_pages[0], 0, from, KM_USER0);
+
+		if (to != PAGE_CACHE_SIZE)	/* Last page needs to be partially zeroed */
+			zero_user_page(prepared_pages[num_pages-1], to,
+					PAGE_CACHE_SIZE - to, KM_USER0);
 
 		/* Since all blocks are new - use already calculated value */
 		return blocks;
@@ -1200,13 +1191,9 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 					ll_rw_block(READ, 1, &bh);
 					*wait_bh++ = bh;
 				} else {	/* Not mapped, zero it */
-					char *kaddr =
-					    kmap_atomic(prepared_pages[0],
-							KM_USER0);
-					memset(kaddr + block_start, 0,
-					       from - block_start);
-					kunmap_atomic(kaddr, KM_USER0);
-					flush_dcache_page(prepared_pages[0]);
+					zero_user_page(prepared_pages[0],
+						       block_start,
+						       from - block_start, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 			}
@@ -1238,13 +1225,8 @@ static int reiserfs_prepare_file_region_for_write(struct inode *inode
 					ll_rw_block(READ, 1, &bh);
 					*wait_bh++ = bh;
 				} else {	/* Not mapped, zero it */
-					char *kaddr =
-					    kmap_atomic(prepared_pages
-							[num_pages - 1],
-							KM_USER0);
-					memset(kaddr + to, 0, block_end - to);
-					kunmap_atomic(kaddr, KM_USER0);
-					flush_dcache_page(prepared_pages[num_pages - 1]);
+					zero_user_page(prepared_pages[num_pages-1],
+							to, block_end - to, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 			}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9fcbfe316977..1272d11399fb 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2148,13 +2148,8 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
 		length = offset & (blocksize - 1);
 		/* if we are not on a block boundary */
 		if (length) {
-			char *kaddr;
-
 			length = blocksize - length;
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + offset, 0, length);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
+			zero_user_page(page, offset, length, KM_USER0);
 			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
 				mark_buffer_dirty(bh);
 			}
@@ -2370,7 +2365,6 @@ static int reiserfs_write_full_page(struct page *page,
 	 ** last byte in the file
 	 */
 	if (page->index >= end_index) {
-		char *kaddr;
 		unsigned last_offset;
 
 		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
@@ -2379,10 +2373,7 @@ static int reiserfs_write_full_page(struct page *page,
 			unlock_page(page);
 			return 0;
 		}
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0);
 	}
 	bh = head;
 	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 7280a23ef344..f25086aeef5f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1110,7 +1110,7 @@ static int flush_commit_list(struct super_block *s,
 	if (!barrier) {
 		/* If there was a write error in the journal - we can't commit
 		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propogating the write error out to
+		 * will just end up propagating the write error out to
 		 * the file system. */
 		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
 			if (buffer_dirty(jl->j_commit_bh))
@@ -1125,7 +1125,7 @@ static int flush_commit_list(struct super_block *s,
 
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
-	 * up propogating the write error out to the filesystem. */
+	 * up propagating the write error out to the filesystem. */
 	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
 		reiserfs_warning(s, "journal-615: buffer write failed");
@@ -2918,7 +2918,7 @@ static void queue_log_writer(struct super_block *s)
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
 		schedule();
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&journal->j_join_wait, &wait);
 }
 
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index a2161840bc7c..b378eea332ca 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -16,7 +16,6 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
-#include <linux/smp_lock.h>
 #include <linux/quotaops.h>
 
 #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index ecc9943202fc..9aa7a06e093f 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -16,11 +16,10 @@
 #include <asm/uaccess.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_fs_sb.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 
-#if defined( REISERFS_PROC_INFO )
+#ifdef CONFIG_REISERFS_PROC_INFO
 
 /*
  * LOCKING:
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 315684793d1d..976cc7887a0d 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -131,6 +131,10 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 			/* don't use read_bitmap_block since it will cache
 			 * the uninitialized bitmap */
 			bh = sb_bread(s, i * s->s_blocksize * 8);
+			if (!bh) {
+				vfree(bitmap);
+				return -EIO;
+			}
 			memset(bh->b_data, 0, sb_blocksize(sb));
 			reiserfs_test_and_set_le_bit(0, bh->b_data);
 			reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index afb21ea45302..b6f12593c39d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -53,7 +53,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/reiserfs_fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/quotaops.h>
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f13a7f164dc6..b4ac9119200e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -18,7 +18,6 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
@@ -433,12 +432,13 @@ int remove_save_link(struct inode *inode, int truncate)
 static void reiserfs_kill_sb(struct super_block *s)
 {
 	if (REISERFS_SB(s)) {
+#ifdef CONFIG_REISERFS_FS_XATTR
 		if (REISERFS_SB(s)->xattr_root) {
 			d_invalidate(REISERFS_SB(s)->xattr_root);
 			dput(REISERFS_SB(s)->xattr_root);
 			REISERFS_SB(s)->xattr_root = NULL;
 		}
-
+#endif
 		if (REISERFS_SB(s)->priv_root) {
 			d_invalidate(REISERFS_SB(s)->priv_root);
 			dput(REISERFS_SB(s)->priv_root);
@@ -511,15 +511,12 @@ static void init_once(void *foo, struct kmem_cache * cachep, unsigned long flags
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		INIT_LIST_HEAD(&ei->i_prealloc_list);
-		inode_init_once(&ei->vfs_inode);
+	INIT_LIST_HEAD(&ei->i_prealloc_list);
+	inode_init_once(&ei->vfs_inode);
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
-		ei->i_acl_access = NULL;
-		ei->i_acl_default = NULL;
+	ei->i_acl_access = NULL;
+	ei->i_acl_default = NULL;
 #endif
-	}
 }
 
 static int init_inodecache(void)
@@ -1563,9 +1560,10 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
 	/* Preallocate by 16 blocks (17-1) at once */
 	REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
+#ifdef CONFIG_REISERFS_FS_XATTR
 	/* Initialize the rwsem for xattr dir */
 	init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
-
+#endif
 	/* setup default block allocator options */
 	reiserfs_init_alloc_options(s);
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index c8178b7b9212..bf6e58214538 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -68,7 +68,7 @@ static struct dentry *get_xa_root(struct super_block *sb, int flags)
 	if (!privroot)
 		return ERR_PTR(-ENODATA);
 
-	mutex_lock(&privroot->d_inode->i_mutex);
+	mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
 	if (REISERFS_SB(sb)->xattr_root) {
 		xaroot = dget(REISERFS_SB(sb)->xattr_root);
 		goto out;
@@ -410,11 +410,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
-
 		if (PageError(page))
 			goto fail;
 	}
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index fd601014813e..2284e03342c6 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -566,13 +566,11 @@ static void romfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
-static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags)
 {
-	struct romfs_inode_info *ei = (struct romfs_inode_info *) foo;
+	struct romfs_inode_info *ei = foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/select.c b/fs/select.c
index fe0893afd931..a974082b0824 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -14,10 +14,10 @@
  *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
  */
 
+#include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/poll.h>
 #include <linux/personality.h> /* for STICKY_TIMEOUTS */
 #include <linux/file.h>
@@ -26,7 +26,6 @@
 
 #include <asm/uaccess.h>
 
-#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
 struct poll_table_page {
@@ -65,7 +64,7 @@ EXPORT_SYMBOL(poll_initwait);
 
 static void free_poll_entry(struct poll_table_entry *entry)
 {
-	remove_wait_queue(entry->wait_address,&entry->wait);
+	remove_wait_queue(entry->wait_address, &entry->wait);
 	fput(entry->filp);
 }
 
@@ -129,7 +128,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	entry->filp = filp;
 	entry->wait_address = wait_address;
 	init_waitqueue_entry(&entry->wait, current);
-	add_wait_queue(wait_address,&entry->wait);
+	add_wait_queue(wait_address, &entry->wait);
 }
 
 #define FDS_IN(fds, n)		(fds->in + n)
@@ -399,7 +398,7 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
 			timeout = -1;	/* infinite */
 		else {
-			timeout = ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
+			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
 			timeout += tv.tv_sec * HZ;
 		}
 	}
@@ -454,7 +453,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
 			timeout = -1;	/* infinite */
 		else {
-			timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
+			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
 			timeout += ts.tv_sec * HZ;
 		}
 	}
@@ -776,7 +775,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
 			timeout = -1;	/* infinite */
 		else {
-			timeout = ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
+			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
 			timeout += ts.tv_sec * HZ;
 		}
 	}
diff --git a/fs/signalfd.c b/fs/signalfd.c
new file mode 100644
index 000000000000..f1da89203a9a
--- /dev/null
+++ b/fs/signalfd.c
@@ -0,0 +1,379 @@
+/*
+ *  fs/signalfd.c
+ *
+ *  Copyright (C) 2003  Linus Torvalds
+ *
+ *  Mon Mar 5, 2007: Davide Libenzi <davidel@xmailserver.org>
+ *      Changed ->read() to return a siginfo strcture instead of signal number.
+ *      Fixed locking in ->poll().
+ *      Added sighand-detach notification.
+ *      Added fd re-use in sys_signalfd() syscall.
+ *      Now using anonymous inode source.
+ *      Thanks to Oleg Nesterov for useful code review and suggestions.
+ *      More comments and suggestions from Arnd Bergmann.
+ * Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
+ *      Retrieve multiple signals with one read() call
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/signalfd.h>
+
+struct signalfd_ctx {
+	struct list_head lnk;
+	wait_queue_head_t wqh;
+	sigset_t sigmask;
+	struct task_struct *tsk;
+};
+
+struct signalfd_lockctx {
+	struct task_struct *tsk;
+	unsigned long flags;
+};
+
+/*
+ * Tries to acquire the sighand lock. We do not increment the sighand
+ * use count, and we do not even pin the task struct, so we need to
+ * do it inside an RCU read lock, and we must be prepared for the
+ * ctx->tsk going to NULL (in signalfd_deliver()), and for the sighand
+ * being detached. We return 0 if the sighand has been detached, or
+ * 1 if we were able to pin the sighand lock.
+ */
+static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
+{
+	struct sighand_struct *sighand = NULL;
+
+	rcu_read_lock();
+	lk->tsk = rcu_dereference(ctx->tsk);
+	if (likely(lk->tsk != NULL))
+		sighand = lock_task_sighand(lk->tsk, &lk->flags);
+	rcu_read_unlock();
+
+	if (sighand && !ctx->tsk) {
+		unlock_task_sighand(lk->tsk, &lk->flags);
+		sighand = NULL;
+	}
+
+	return sighand != NULL;
+}
+
+static void signalfd_unlock(struct signalfd_lockctx *lk)
+{
+	unlock_task_sighand(lk->tsk, &lk->flags);
+}
+
+/*
+ * This must be called with the sighand lock held.
+ */
+void signalfd_deliver(struct task_struct *tsk, int sig)
+{
+	struct sighand_struct *sighand = tsk->sighand;
+	struct signalfd_ctx *ctx, *tmp;
+
+	BUG_ON(!sig);
+	list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) {
+		/*
+		 * We use a negative signal value as a way to broadcast that the
+		 * sighand has been orphaned, so that we can notify all the
+		 * listeners about this. Remember the ctx->sigmask is inverted,
+		 * so if the user is interested in a signal, that corresponding
+		 * bit will be zero.
+		 */
+		if (sig < 0) {
+			if (ctx->tsk == tsk) {
+				ctx->tsk = NULL;
+				list_del_init(&ctx->lnk);
+				wake_up(&ctx->wqh);
+			}
+		} else {
+			if (!sigismember(&ctx->sigmask, sig))
+				wake_up(&ctx->wqh);
+		}
+	}
+}
+
+static void signalfd_cleanup(struct signalfd_ctx *ctx)
+{
+	struct signalfd_lockctx lk;
+
+	/*
+	 * This is tricky. If the sighand is gone, we do not need to remove
+	 * context from the list, the list itself won't be there anymore.
+	 */
+	if (signalfd_lock(ctx, &lk)) {
+		list_del(&ctx->lnk);
+		signalfd_unlock(&lk);
+	}
+	kfree(ctx);
+}
+
+static int signalfd_release(struct inode *inode, struct file *file)
+{
+	signalfd_cleanup(file->private_data);
+	return 0;
+}
+
+static unsigned int signalfd_poll(struct file *file, poll_table *wait)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+	struct signalfd_lockctx lk;
+
+	poll_wait(file, &ctx->wqh, wait);
+
+	/*
+	 * Let the caller get a POLLIN in this case, ala socket recv() when
+	 * the peer disconnects.
+	 */
+	if (signalfd_lock(ctx, &lk)) {
+		if (next_signal(&lk.tsk->pending, &ctx->sigmask) > 0 ||
+		    next_signal(&lk.tsk->signal->shared_pending,
+				&ctx->sigmask) > 0)
+			events |= POLLIN;
+		signalfd_unlock(&lk);
+	} else
+		events |= POLLIN;
+
+	return events;
+}
+
+/*
+ * Copied from copy_siginfo_to_user() in kernel/signal.c
+ */
+static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
+			     siginfo_t const *kinfo)
+{
+	long err;
+
+	BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
+
+	/*
+	 * Unused memebers should be zero ...
+	 */
+	err = __clear_user(uinfo, sizeof(*uinfo));
+
+	/*
+	 * If you change siginfo_t structure, please be sure
+	 * this code is fixed accordingly.
+	 */
+	err |= __put_user(kinfo->si_signo, &uinfo->signo);
+	err |= __put_user(kinfo->si_errno, &uinfo->err);
+	err |= __put_user((short)kinfo->si_code, &uinfo->code);
+	switch (kinfo->si_code & __SI_MASK) {
+	case __SI_KILL:
+		err |= __put_user(kinfo->si_pid, &uinfo->pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->uid);
+		break;
+	case __SI_TIMER:
+		 err |= __put_user(kinfo->si_tid, &uinfo->tid);
+		 err |= __put_user(kinfo->si_overrun, &uinfo->overrun);
+		 err |= __put_user((long)kinfo->si_ptr, &uinfo->svptr);
+		break;
+	case __SI_POLL:
+		err |= __put_user(kinfo->si_band, &uinfo->band);
+		err |= __put_user(kinfo->si_fd, &uinfo->fd);
+		break;
+	case __SI_FAULT:
+		err |= __put_user((long)kinfo->si_addr, &uinfo->addr);
+#ifdef __ARCH_SI_TRAPNO
+		err |= __put_user(kinfo->si_trapno, &uinfo->trapno);
+#endif
+		break;
+	case __SI_CHLD:
+		err |= __put_user(kinfo->si_pid, &uinfo->pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->uid);
+		err |= __put_user(kinfo->si_status, &uinfo->status);
+		err |= __put_user(kinfo->si_utime, &uinfo->utime);
+		err |= __put_user(kinfo->si_stime, &uinfo->stime);
+		break;
+	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
+		err |= __put_user(kinfo->si_pid, &uinfo->pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->uid);
+		err |= __put_user((long)kinfo->si_ptr, &uinfo->svptr);
+		break;
+	default: /* this is just in case for now ... */
+		err |= __put_user(kinfo->si_pid, &uinfo->pid);
+		err |= __put_user(kinfo->si_uid, &uinfo->uid);
+		break;
+	}
+
+	return err ? -EFAULT: sizeof(*uinfo);
+}
+
+static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
+				int nonblock)
+{
+	ssize_t ret;
+	struct signalfd_lockctx lk;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (!signalfd_lock(ctx, &lk))
+		return 0;
+
+	ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
+	switch (ret) {
+	case 0:
+		if (!nonblock)
+			break;
+		ret = -EAGAIN;
+	default:
+		signalfd_unlock(&lk);
+		return ret;
+	}
+
+	add_wait_queue(&ctx->wqh, &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
+		signalfd_unlock(&lk);
+		if (ret != 0)
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		schedule();
+		ret = signalfd_lock(ctx, &lk);
+		if (unlikely(!ret)) {
+			/*
+			 * Let the caller read zero byte, ala socket
+			 * recv() when the peer disconnect. This test
+			 * must be done before doing a dequeue_signal(),
+			 * because if the sighand has been orphaned,
+			 * the dequeue_signal() call is going to crash
+			 * because ->sighand will be long gone.
+			 */
+			 break;
+		}
+	}
+
+	remove_wait_queue(&ctx->wqh, &wait);
+	__set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
+/*
+ * Returns either the size of a "struct signalfd_siginfo", or zero if the
+ * sighand we are attached to, has been orphaned. The "count" parameter
+ * must be at least the size of a "struct signalfd_siginfo".
+ */
+static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	struct signalfd_ctx *ctx = file->private_data;
+	struct signalfd_siginfo __user *siginfo;
+	int nonblock = file->f_flags & O_NONBLOCK;
+	ssize_t ret, total = 0;
+	siginfo_t info;
+
+	count /= sizeof(struct signalfd_siginfo);
+	if (!count)
+		return -EINVAL;
+
+	siginfo = (struct signalfd_siginfo __user *) buf;
+
+	do {
+		ret = signalfd_dequeue(ctx, &info, nonblock);
+		if (unlikely(ret <= 0))
+			break;
+		ret = signalfd_copyinfo(siginfo, &info);
+		if (ret < 0)
+			break;
+		siginfo++;
+		total += ret;
+		nonblock = 1;
+	} while (--count);
+
+	return total ? total : ret;
+}
+
+static const struct file_operations signalfd_fops = {
+	.release	= signalfd_release,
+	.poll		= signalfd_poll,
+	.read		= signalfd_read,
+};
+
+/*
+ * Create a file descriptor that is associated with our signal
+ * state. We can pass it around to others if we want to, but
+ * it will always be _our_ signal state.
+ */
+asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
+{
+	int error;
+	sigset_t sigmask;
+	struct signalfd_ctx *ctx;
+	struct sighand_struct *sighand;
+	struct file *file;
+	struct inode *inode;
+	struct signalfd_lockctx lk;
+
+	if (sizemask != sizeof(sigset_t) ||
+	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
+		return error = -EINVAL;
+	sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(&sigmask);
+
+	if (ufd == -1) {
+		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		init_waitqueue_head(&ctx->wqh);
+		ctx->sigmask = sigmask;
+		ctx->tsk = current;
+
+		sighand = current->sighand;
+		/*
+		 * Add this fd to the list of signal listeners.
+		 */
+		spin_lock_irq(&sighand->siglock);
+		list_add_tail(&ctx->lnk, &sighand->signalfd_list);
+		spin_unlock_irq(&sighand->siglock);
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * anon_inode_getfd() will install the fd.
+		 */
+		error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]",
+					 &signalfd_fops, ctx);
+		if (error)
+			goto err_fdalloc;
+	} else {
+		file = fget(ufd);
+		if (!file)
+			return -EBADF;
+		ctx = file->private_data;
+		if (file->f_op != &signalfd_fops) {
+			fput(file);
+			return -EINVAL;
+		}
+		/*
+		 * We need to be prepared of the fact that the sighand this fd
+		 * is attached to, has been detched. In that case signalfd_lock()
+		 * will return 0, and we'll just skip setting the new mask.
+		 */
+		if (signalfd_lock(ctx, &lk)) {
+			ctx->sigmask = sigmask;
+			signalfd_unlock(&lk);
+		}
+		wake_up(&ctx->wqh);
+		fput(file);
+	}
+
+	return ufd;
+
+err_fdalloc:
+	signalfd_cleanup(ctx);
+	return error;
+}
+
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 50136b1a3eca..48da4fa6b7d4 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -13,6 +13,7 @@
 #include <linux/smp_lock.h>
 #include <linux/ctype.h>
 #include <linux/net.h>
+#include <linux/sched.h>
 
 #include <linux/smb_fs.h>
 #include <linux/smb_mount.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index f161797160c4..aea3f8aa54c0 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -17,6 +17,7 @@
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 5faba4f1c9ab..6724a6cf01ff 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -25,6 +25,7 @@
 #include <linux/net.h>
 #include <linux/vfs.h>
 #include <linux/highuid.h>
+#include <linux/sched.h>
 #include <linux/smb_fs.h>
 #include <linux/smbno.h>
 #include <linux/smb_mount.h>
@@ -69,10 +70,8 @@ static void smb_destroy_inode(struct inode *inode)
 static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
 {
 	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
-	unsigned long flagmask = SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR;
 
-	if ((flags & flagmask) == SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index 723f7c667661..3f54a0f80fae 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -6,10 +6,12 @@
  *  Please add a note about your changes to smbfs in the ChangeLog file.
  */
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/net.h>
+#include <linux/sched.h>
 
 #include <linux/smb_fs.h>
 #include <linux/smbno.h>
@@ -22,8 +24,6 @@
 /* #define SMB_SLAB_DEBUG	(SLAB_RED_ZONE | SLAB_POISON) */
 #define SMB_SLAB_DEBUG	0
 
-#define ROUND_UP(x) (((x)+3) & ~3)
-
 /* cache for request structures */
 static struct kmem_cache *req_cachep;
 
@@ -200,8 +200,8 @@ static int smb_setup_trans2request(struct smb_request *req)
 
 	const int smb_parameters = 15;
 	const int header = SMB_HEADER_LEN + 2 * smb_parameters + 2;
-	const int oparam = ROUND_UP(header + 3);
-	const int odata  = ROUND_UP(oparam + req->rq_lparm);
+	const int oparam = ALIGN(header + 3, sizeof(u32));
+	const int odata  = ALIGN(oparam + req->rq_lparm, sizeof(u32));
 	const int bcc = (req->rq_data ? odata + req->rq_ldata :
 					oparam + req->rq_lparm) - header;
 
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 89eaf31f1d46..67176af8515f 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
-#include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/kthread.h>
@@ -299,8 +298,6 @@ out:
  */
 static int smbiod(void *unused)
 {
-	allow_signal(SIGKILL);
-
 	VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
 
 	for (;;) {
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
index 92ea6b2367d7..e48bd8235a8e 100644
--- a/fs/smbfs/sock.c
+++ b/fs/smbfs/sock.c
@@ -17,7 +17,6 @@
 #include <linux/net.h>
 #include <linux/mm.h>
 #include <linux/netdevice.h>
-#include <linux/smp_lock.h>
 #include <linux/workqueue.h>
 #include <net/scm.h>
 #include <net/tcp_states.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index fea20ceb8a5f..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -13,7 +13,6 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/net.h>
 #include <linux/namei.h>
 
diff --git a/fs/splice.c b/fs/splice.c
index 5428b0ff3b6f..12f28281d2b1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -289,12 +289,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		nr_pages = PIPE_BUFFERS;
 
 	/*
-	 * Initiate read-ahead on this page range. however, don't call into
-	 * read-ahead if this is a non-zero offset (we are likely doing small
-	 * chunk splice and the page is already there) for a single page.
+	 * Don't try to 2nd guess the read-ahead logic, call into
+	 * page_cache_readahead() like the page cache reads would do.
 	 */
-	if (!loff || nr_pages > 1)
-		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
+	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
 
 	/*
 	 * Now fill in the holes:
@@ -378,10 +376,11 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			 * If in nonblock mode then dont block on waiting
 			 * for an in-flight io page
 			 */
-			if (flags & SPLICE_F_NONBLOCK)
-				break;
-
-			lock_page(page);
+			if (flags & SPLICE_F_NONBLOCK) {
+				if (TestSetPageLocked(page))
+					break;
+			} else
+				lock_page(page);
 
 			/*
 			 * page was truncated, stop here. if this isn't the
diff --git a/fs/stat.c b/fs/stat.c
index 38a8cb2a28de..68510068a641 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <linux/file.h>
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
diff --git a/fs/super.c b/fs/super.c
index 8341e4e1d738..5260d620c555 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -107,6 +107,7 @@ out:
 static inline void destroy_super(struct super_block *s)
 {
 	security_sb_free(s);
+	kfree(s->s_subtype);
 	kfree(s);
 }
 
@@ -907,6 +908,29 @@ out:
 
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
+{
+	int err;
+	const char *subtype = strchr(fstype, '.');
+	if (subtype) {
+		subtype++;
+		err = -EINVAL;
+		if (!subtype[0])
+			goto err;
+	} else
+		subtype = "";
+
+	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!mnt->mnt_sb->s_subtype)
+		goto err;
+	return mnt;
+
+ err:
+	mntput(mnt);
+	return ERR_PTR(err);
+}
+
 struct vfsmount *
 do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 {
@@ -915,6 +939,9 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	if (!type)
 		return ERR_PTR(-ENODEV);
 	mnt = vfs_kern_mount(type, flags, name, data);
+	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
+	    !mnt->mnt_sb->s_subtype)
+		mnt = fs_set_subtype(mnt, fstype);
 	put_filesystem(type);
 	return mnt;
 }
diff --git a/fs/sync.c b/fs/sync.c
index 5cb9e7e43383..2f97576355b8 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -229,7 +229,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 			!S_ISLNK(i_mode))
 		goto out_put;
 
-	ret = do_sync_file_range(file, offset, endbyte, flags);
+	ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
 out_put:
 	fput_light(file, fput_needed);
 out:
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 8ea2a51ce883..d3b9f5f07db1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -59,7 +59,7 @@ read(struct file * file, char __user * userbuf, size_t count, loff_t * off)
 	if (copy_to_user(userbuf, buffer, count))
 		return -EFAULT;
 
-	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
+	pr_debug("offs = %lld, *off = %lld, count = %zd\n", offs, *off, count);
 
 	*off = offs + count;
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index db0413a411d6..b502c7197ec0 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,8 +13,7 @@
 
 #include "sysfs.h"
 
-#define to_subsys(k) container_of(k,struct subsystem,kset.kobj)
-#define to_sattr(a) container_of(a,struct subsys_attribute,attr)
+#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
 
 /*
  * Subsystem file operations.
@@ -24,12 +23,12 @@
 static ssize_t 
 subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
 {
-	struct subsystem * s = to_subsys(kobj);
+	struct kset *kset = to_kset(kobj);
 	struct subsys_attribute * sattr = to_sattr(attr);
 	ssize_t ret = -EIO;
 
 	if (sattr->show)
-		ret = sattr->show(s,page);
+		ret = sattr->show(kset, page);
 	return ret;
 }
 
@@ -37,12 +36,12 @@ static ssize_t
 subsys_attr_store(struct kobject * kobj, struct attribute * attr, 
 		  const char * page, size_t count)
 {
-	struct subsystem * s = to_subsys(kobj);
+	struct kset *kset = to_kset(kobj);
 	struct subsys_attribute * sattr = to_sattr(attr);
 	ssize_t ret = -EIO;
 
 	if (sattr->store)
-		ret = sattr->store(s,page,count);
+		ret = sattr->store(kset, page, count);
 	return ret;
 }
 
@@ -112,36 +111,6 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 	return ret;
 }
 
-
-/**
- *	flush_read_buffer - push buffer to userspace.
- *	@buffer:	data buffer for file.
- *	@buf:		user-passed buffer.
- *	@count:		number of bytes requested.
- *	@ppos:		file position.
- *
- *	Copy the buffer we filled in fill_read_buffer() to userspace.
- *	This is done at the reader's leisure, copying and advancing 
- *	the amount they specify each time.
- *	This may be called continuously until the buffer is empty.
- */
-static int flush_read_buffer(struct sysfs_buffer * buffer, char __user * buf,
-			     size_t count, loff_t * ppos)
-{
-	int error;
-
-	if (*ppos > buffer->count)
-		return 0;
-
-	if (count > (buffer->count - *ppos))
-		count = buffer->count - *ppos;
-
-	error = copy_to_user(buf,buffer->page + *ppos,count);
-	if (!error)
-		*ppos += count;
-	return error ? -EFAULT : count;
-}
-
 /**
  *	sysfs_read_file - read an attribute. 
  *	@file:	file pointer.
@@ -178,7 +147,8 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	}
 	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
 		 __FUNCTION__, count, *ppos, buffer->page);
-	retval = flush_read_buffer(buffer,buf,count,ppos);
+	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
+					 buffer->count);
 out:
 	up(&buffer->sem);
 	return retval;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 4de5c6b89918..bdd30e74de6b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -13,6 +13,7 @@
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
+#include <linux/sched.h>
 #include <asm/semaphore.h>
 #include "sysfs.h"
 
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index ebf7007fa161..e566b387fcf9 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -54,17 +54,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
-	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
+	if (!IS_ERR(page))
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
-	}
 	return page;
-
-fail:
-	dir_put_page(page);
-	return ERR_PTR(-EIO);
 }
 
 static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9311cac186fe..564411693394 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -322,9 +322,7 @@ static void init_once(void *p, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-			SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&si->vfs_inode);
+	inode_init_once(&si->vfs_inode);
 }
 
 const struct super_operations sysv_sops = {
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 4e48abbd2b5d..6bd850b7641a 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -13,7 +13,6 @@
  */
 
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include "sysv.h"
 
 static int add_nondir(struct dentry *dentry, struct inode *inode)
diff --git a/fs/timerfd.c b/fs/timerfd.c
new file mode 100644
index 000000000000..af9eca5c0230
--- /dev/null
+++ b/fs/timerfd.c
@@ -0,0 +1,225 @@
+/*
+ *  fs/timerfd.c
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *
+ *
+ *  Thanks to Thomas Gleixner for code reviews and useful comments.
+ *
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/time.h>
+#include <linux/hrtimer.h>
+#include <linux/anon_inodes.h>
+#include <linux/timerfd.h>
+
+struct timerfd_ctx {
+	struct hrtimer tmr;
+	ktime_t tintv;
+	wait_queue_head_t wqh;
+	int expired;
+};
+
+/*
+ * This gets called when the timer event triggers. We set the "expired"
+ * flag, but we do not re-arm the timer (in case it's necessary,
+ * tintv.tv64 != 0) until the timer is read.
+ */
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	ctx->expired = 1;
+	wake_up_locked(&ctx->wqh);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return HRTIMER_NORESTART;
+}
+
+static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags,
+			  const struct itimerspec *ktmr)
+{
+	enum hrtimer_mode htmode;
+	ktime_t texp;
+
+	htmode = (flags & TFD_TIMER_ABSTIME) ?
+		HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
+
+	texp = timespec_to_ktime(ktmr->it_value);
+	ctx->expired = 0;
+	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
+	hrtimer_init(&ctx->tmr, clockid, htmode);
+	ctx->tmr.expires = texp;
+	ctx->tmr.function = timerfd_tmrproc;
+	if (texp.tv64 != 0)
+		hrtimer_start(&ctx->tmr, texp, htmode);
+}
+
+static int timerfd_release(struct inode *inode, struct file *file)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+
+	hrtimer_cancel(&ctx->tmr);
+	kfree(ctx);
+	return 0;
+}
+
+static unsigned int timerfd_poll(struct file *file, poll_table *wait)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	unsigned int events = 0;
+	unsigned long flags;
+
+	poll_wait(file, &ctx->wqh, wait);
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	if (ctx->expired)
+		events |= POLLIN;
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+	return events;
+}
+
+static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct timerfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	u32 ticks = 0;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (count < sizeof(ticks))
+		return -EINVAL;
+	spin_lock_irq(&ctx->wqh.lock);
+	res = -EAGAIN;
+	if (!ctx->expired && !(file->f_flags & O_NONBLOCK)) {
+		__add_wait_queue(&ctx->wqh, &wait);
+		for (res = 0;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (ctx->expired) {
+				res = 0;
+				break;
+			}
+			if (signal_pending(current)) {
+				res = -ERESTARTSYS;
+				break;
+			}
+			spin_unlock_irq(&ctx->wqh.lock);
+			schedule();
+			spin_lock_irq(&ctx->wqh.lock);
+		}
+		__remove_wait_queue(&ctx->wqh, &wait);
+		__set_current_state(TASK_RUNNING);
+	}
+	if (ctx->expired) {
+		ctx->expired = 0;
+		if (ctx->tintv.tv64 != 0) {
+			/*
+			 * If tintv.tv64 != 0, this is a periodic timer that
+			 * needs to be re-armed. We avoid doing it in the timer
+			 * callback to avoid DoS attacks specifying a very
+			 * short timer period.
+			 */
+			ticks = (u32)
+				hrtimer_forward(&ctx->tmr,
+						hrtimer_cb_get_time(&ctx->tmr),
+						ctx->tintv);
+			hrtimer_restart(&ctx->tmr);
+		} else
+			ticks = 1;
+	}
+	spin_unlock_irq(&ctx->wqh.lock);
+	if (ticks)
+		res = put_user(ticks, buf) ? -EFAULT: sizeof(ticks);
+	return res;
+}
+
+static const struct file_operations timerfd_fops = {
+	.release	= timerfd_release,
+	.poll		= timerfd_poll,
+	.read		= timerfd_read,
+};
+
+asmlinkage long sys_timerfd(int ufd, int clockid, int flags,
+			    const struct itimerspec __user *utmr)
+{
+	int error;
+	struct timerfd_ctx *ctx;
+	struct file *file;
+	struct inode *inode;
+	struct itimerspec ktmr;
+
+	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
+		return -EFAULT;
+
+	if (clockid != CLOCK_MONOTONIC &&
+	    clockid != CLOCK_REALTIME)
+		return -EINVAL;
+	if (!timespec_valid(&ktmr.it_value) ||
+	    !timespec_valid(&ktmr.it_interval))
+		return -EINVAL;
+
+	if (ufd == -1) {
+		ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx)
+			return -ENOMEM;
+
+		init_waitqueue_head(&ctx->wqh);
+
+		timerfd_setup(ctx, clockid, flags, &ktmr);
+
+		/*
+		 * When we call this, the initialization must be complete, since
+		 * anon_inode_getfd() will install the fd.
+		 */
+		error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]",
+					 &timerfd_fops, ctx);
+		if (error)
+			goto err_tmrcancel;
+	} else {
+		file = fget(ufd);
+		if (!file)
+			return -EBADF;
+		ctx = file->private_data;
+		if (file->f_op != &timerfd_fops) {
+			fput(file);
+			return -EINVAL;
+		}
+		/*
+		 * We need to stop the existing timer before reprogramming
+		 * it to the new values.
+		 */
+		for (;;) {
+			spin_lock_irq(&ctx->wqh.lock);
+			if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
+				break;
+			spin_unlock_irq(&ctx->wqh.lock);
+			cpu_relax();
+		}
+		/*
+		 * Re-program the timer to the new value ...
+		 */
+		timerfd_setup(ctx, clockid, flags, &ktmr);
+
+		spin_unlock_irq(&ctx->wqh.lock);
+		fput(file);
+	}
+
+	return ufd;
+
+err_tmrcancel:
+	hrtimer_cancel(&ctx->tmr);
+	kfree(ctx);
+	return error;
+}
+
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index ea521f846d97..4cec91015681 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -427,9 +427,9 @@ static void udf_table_free_blocks(struct super_block * sb,
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	uint32_t start, end;
-	uint32_t nextoffset, oextoffset, elen;
-	kernel_lb_addr nbloc, obloc, eloc;
-	struct buffer_head *obh, *nbh;
+	uint32_t elen;
+	kernel_lb_addr eloc;
+	struct extent_position oepos, epos;
 	int8_t etype;
 	int i;
 
@@ -457,14 +457,13 @@ static void udf_table_free_blocks(struct super_block * sb,
 	start = bloc.logicalBlockNum + offset;
 	end = bloc.logicalBlockNum + offset + count - 1;
 
-	oextoffset = nextoffset = sizeof(struct unallocSpaceEntry);
+	epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
 	elen = 0;
-	obloc = nbloc = UDF_I_LOCATION(table);
-
-	obh = nbh = NULL;
+	epos.block = oepos.block = UDF_I_LOCATION(table);
+	epos.bh = oepos.bh = NULL;
 
 	while (count && (etype =
-		udf_next_aext(table, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1)) != -1)
+		udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
 	{
 		if (((eloc.logicalBlockNum + (elen >> sb->s_blocksize_bits)) ==
 			start))
@@ -482,7 +481,7 @@ static void udf_table_free_blocks(struct super_block * sb,
 				start += count;
 				count = 0;
 			}
-			udf_write_aext(table, obloc, &oextoffset, eloc, elen, obh, 1);
+			udf_write_aext(table, &oepos, eloc, elen, 1);
 		}
 		else if (eloc.logicalBlockNum == (end + 1))
 		{
@@ -502,20 +501,20 @@ static void udf_table_free_blocks(struct super_block * sb,
 				end -= count;
 				count = 0;
 			}
-			udf_write_aext(table, obloc, &oextoffset, eloc, elen, obh, 1);
+			udf_write_aext(table, &oepos, eloc, elen, 1);
 		}
 
-		if (nbh != obh)
+		if (epos.bh != oepos.bh)
 		{
 			i = -1;
-			obloc = nbloc;
-			udf_release_data(obh);
-			atomic_inc(&nbh->b_count);
-			obh = nbh;
-			oextoffset = 0;
+			oepos.block = epos.block;
+			brelse(oepos.bh);
+			get_bh(epos.bh);
+			oepos.bh = epos.bh;
+			oepos.offset = 0;
 		}
 		else
-			oextoffset = nextoffset;
+			oepos.offset = epos.offset;
 	}
 
 	if (count)
@@ -547,55 +546,53 @@ static void udf_table_free_blocks(struct super_block * sb,
 			adsize = sizeof(long_ad);
 		else
 		{
-			udf_release_data(obh);
-			udf_release_data(nbh);
+			brelse(oepos.bh);
+			brelse(epos.bh);
 			goto error_return;
 		}
 
-		if (nextoffset + (2 * adsize) > sb->s_blocksize)
+		if (epos.offset + (2 * adsize) > sb->s_blocksize)
 		{
 			char *sptr, *dptr;
 			int loffset;
 	
-			udf_release_data(obh);
-			obh = nbh;
-			obloc = nbloc;
-			oextoffset = nextoffset;
+			brelse(oepos.bh);
+			oepos = epos;
 
 			/* Steal a block from the extent being free'd */
-			nbloc.logicalBlockNum = eloc.logicalBlockNum;
+			epos.block.logicalBlockNum = eloc.logicalBlockNum;
 			eloc.logicalBlockNum ++;
 			elen -= sb->s_blocksize;
 
-			if (!(nbh = udf_tread(sb,
-				udf_get_lb_pblock(sb, nbloc, 0))))
+			if (!(epos.bh = udf_tread(sb,
+				udf_get_lb_pblock(sb, epos.block, 0))))
 			{
-				udf_release_data(obh);
+				brelse(oepos.bh);
 				goto error_return;
 			}
-			aed = (struct allocExtDesc *)(nbh->b_data);
-			aed->previousAllocExtLocation = cpu_to_le32(obloc.logicalBlockNum);
-			if (nextoffset + adsize > sb->s_blocksize)
+			aed = (struct allocExtDesc *)(epos.bh->b_data);
+			aed->previousAllocExtLocation = cpu_to_le32(oepos.block.logicalBlockNum);
+			if (epos.offset + adsize > sb->s_blocksize)
 			{
-				loffset = nextoffset;
+				loffset = epos.offset;
 				aed->lengthAllocDescs = cpu_to_le32(adsize);
-				sptr = UDF_I_DATA(inode) + nextoffset -
+				sptr = UDF_I_DATA(inode) + epos.offset -
 					udf_file_entry_alloc_offset(inode) +
 					UDF_I_LENEATTR(inode) - adsize;
-				dptr = nbh->b_data + sizeof(struct allocExtDesc);
+				dptr = epos.bh->b_data + sizeof(struct allocExtDesc);
 				memcpy(dptr, sptr, adsize);
-				nextoffset = sizeof(struct allocExtDesc) + adsize;
+				epos.offset = sizeof(struct allocExtDesc) + adsize;
 			}
 			else
 			{
-				loffset = nextoffset + adsize;
+				loffset = epos.offset + adsize;
 				aed->lengthAllocDescs = cpu_to_le32(0);
-				sptr = (obh)->b_data + nextoffset;
-				nextoffset = sizeof(struct allocExtDesc);
+				sptr = oepos.bh->b_data + epos.offset;
+				epos.offset = sizeof(struct allocExtDesc);
 
-				if (obh)
+				if (oepos.bh)
 				{
-					aed = (struct allocExtDesc *)(obh)->b_data;
+					aed = (struct allocExtDesc *)oepos.bh->b_data;
 					aed->lengthAllocDescs =
 						cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize);
 				}
@@ -606,11 +603,11 @@ static void udf_table_free_blocks(struct super_block * sb,
 				}
 			}
 			if (UDF_SB_UDFREV(sb) >= 0x0200)
-				udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-					nbloc.logicalBlockNum, sizeof(tag));
+				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 3, 1,
+					epos.block.logicalBlockNum, sizeof(tag));
 			else
-				udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-					nbloc.logicalBlockNum, sizeof(tag));
+				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, 2, 1,
+					epos.block.logicalBlockNum, sizeof(tag));
 			switch (UDF_I_ALLOCTYPE(table))
 			{
 				case ICBTAG_FLAG_AD_SHORT:
@@ -619,7 +616,7 @@ static void udf_table_free_blocks(struct super_block * sb,
 					sad->extLength = cpu_to_le32(
 						EXT_NEXT_EXTENT_ALLOCDECS |
 						sb->s_blocksize);
-					sad->extPosition = cpu_to_le32(nbloc.logicalBlockNum);
+					sad->extPosition = cpu_to_le32(epos.block.logicalBlockNum);
 					break;
 				}
 				case ICBTAG_FLAG_AD_LONG:
@@ -628,14 +625,14 @@ static void udf_table_free_blocks(struct super_block * sb,
 					lad->extLength = cpu_to_le32(
 						EXT_NEXT_EXTENT_ALLOCDECS |
 						sb->s_blocksize);
-					lad->extLocation = cpu_to_lelb(nbloc);
+					lad->extLocation = cpu_to_lelb(epos.block);
 					break;
 				}
 			}
-			if (obh)
+			if (oepos.bh)
 			{
-				udf_update_tag(obh->b_data, loffset);
-				mark_buffer_dirty(obh);
+				udf_update_tag(oepos.bh->b_data, loffset);
+				mark_buffer_dirty(oepos.bh);
 			}
 			else
 				mark_inode_dirty(table);
@@ -643,26 +640,26 @@ static void udf_table_free_blocks(struct super_block * sb,
 
 		if (elen) /* It's possible that stealing the block emptied the extent */
 		{
-			udf_write_aext(table, nbloc, &nextoffset, eloc, elen, nbh, 1);
+			udf_write_aext(table, &epos, eloc, elen, 1);
 
-			if (!nbh)
+			if (!epos.bh)
 			{
 				UDF_I_LENALLOC(table) += adsize;
 				mark_inode_dirty(table);
 			}
 			else
 			{
-				aed = (struct allocExtDesc *)nbh->b_data;
+				aed = (struct allocExtDesc *)epos.bh->b_data;
 				aed->lengthAllocDescs =
 					cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize);
-				udf_update_tag(nbh->b_data, nextoffset);
-				mark_buffer_dirty(nbh);
+				udf_update_tag(epos.bh->b_data, epos.offset);
+				mark_buffer_dirty(epos.bh);
 			}
 		}
 	}
 
-	udf_release_data(nbh);
-	udf_release_data(obh);
+	brelse(epos.bh);
+	brelse(oepos.bh);
 
 error_return:
 	sb->s_dirt = 1;
@@ -677,9 +674,9 @@ static int udf_table_prealloc_blocks(struct super_block * sb,
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int alloc_count = 0;
-	uint32_t extoffset, elen, adsize;
-	kernel_lb_addr bloc, eloc;
-	struct buffer_head *bh;
+	uint32_t elen, adsize;
+	kernel_lb_addr eloc;
+	struct extent_position epos;
 	int8_t etype = -1;
 
 	if (first_block < 0 || first_block >= UDF_SB_PARTLEN(sb, partition))
@@ -693,14 +690,13 @@ static int udf_table_prealloc_blocks(struct super_block * sb,
 		return 0;
 
 	mutex_lock(&sbi->s_alloc_mutex);
-	extoffset = sizeof(struct unallocSpaceEntry);
-	bloc = UDF_I_LOCATION(table);
-
-	bh = NULL;
+	epos.offset = sizeof(struct unallocSpaceEntry);
+	epos.block = UDF_I_LOCATION(table);
+	epos.bh = NULL;
 	eloc.logicalBlockNum = 0xFFFFFFFF;
 
 	while (first_block != eloc.logicalBlockNum && (etype =
-		udf_next_aext(table, &bloc, &extoffset, &eloc, &elen, &bh, 1)) != -1)
+		udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
 	{
 		udf_debug("eloc=%d, elen=%d, first_block=%d\n",
 			eloc.logicalBlockNum, elen, first_block);
@@ -709,7 +705,7 @@ static int udf_table_prealloc_blocks(struct super_block * sb,
 
 	if (first_block == eloc.logicalBlockNum)
 	{
-		extoffset -= adsize;
+		epos.offset -= adsize;
 
 		alloc_count = (elen >> sb->s_blocksize_bits);
 		if (inode && DQUOT_PREALLOC_BLOCK(inode, alloc_count > block_count ? block_count : alloc_count))
@@ -719,15 +715,15 @@ static int udf_table_prealloc_blocks(struct super_block * sb,
 			alloc_count = block_count;
 			eloc.logicalBlockNum += alloc_count;
 			elen -= (alloc_count << sb->s_blocksize_bits);
-			udf_write_aext(table, bloc, &extoffset, eloc, (etype << 30) | elen, bh, 1);
+			udf_write_aext(table, &epos, eloc, (etype << 30) | elen, 1);
 		}
 		else
-			udf_delete_aext(table, bloc, extoffset, eloc, (etype << 30) | elen, bh);
+			udf_delete_aext(table, epos, eloc, (etype << 30) | elen);
 	}
 	else
 		alloc_count = 0;
 
-	udf_release_data(bh);
+	brelse(epos.bh);
 
 	if (alloc_count && UDF_SB_LVIDBH(sb))
 	{
@@ -747,9 +743,9 @@ static int udf_table_new_block(struct super_block * sb,
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
 	uint32_t newblock = 0, adsize;
-	uint32_t extoffset, goal_extoffset, elen, goal_elen = 0;
-	kernel_lb_addr bloc, goal_bloc, eloc, goal_eloc;
-	struct buffer_head *bh, *goal_bh;
+	uint32_t elen, goal_elen = 0;
+	kernel_lb_addr eloc, goal_eloc;
+	struct extent_position epos, goal_epos;
 	int8_t etype;
 
 	*err = -ENOSPC;
@@ -770,14 +766,12 @@ static int udf_table_new_block(struct super_block * sb,
 	   We store the buffer_head, bloc, and extoffset of the current closest
 	   match and use that when we are done.
 	*/
-
-	extoffset = sizeof(struct unallocSpaceEntry);
-	bloc = UDF_I_LOCATION(table);
-
-	goal_bh = bh = NULL;
+	epos.offset = sizeof(struct unallocSpaceEntry);
+	epos.block = UDF_I_LOCATION(table);
+	epos.bh = goal_epos.bh = NULL;
 
 	while (spread && (etype =
-		udf_next_aext(table, &bloc, &extoffset, &eloc, &elen, &bh, 1)) != -1)
+		udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
 	{
 		if (goal >= eloc.logicalBlockNum)
 		{
@@ -793,24 +787,24 @@ static int udf_table_new_block(struct super_block * sb,
 		if (nspread < spread)
 		{
 			spread = nspread;
-			if (goal_bh != bh)
+			if (goal_epos.bh != epos.bh)
 			{
-				udf_release_data(goal_bh);
-				goal_bh = bh;
-				atomic_inc(&goal_bh->b_count);
+				brelse(goal_epos.bh);
+				goal_epos.bh = epos.bh;
+				get_bh(goal_epos.bh);
 			}
-			goal_bloc = bloc;
-			goal_extoffset = extoffset - adsize;
+			goal_epos.block = epos.block;
+			goal_epos.offset = epos.offset - adsize;
 			goal_eloc = eloc;
 			goal_elen = (etype << 30) | elen;
 		}
 	}
 
-	udf_release_data(bh);
+	brelse(epos.bh);
 
 	if (spread == 0xFFFFFFFF)
 	{
-		udf_release_data(goal_bh);
+		brelse(goal_epos.bh);
 		mutex_unlock(&sbi->s_alloc_mutex);
 		return 0;
 	}
@@ -826,17 +820,17 @@ static int udf_table_new_block(struct super_block * sb,
 
 	if (inode && DQUOT_ALLOC_BLOCK(inode, 1))
 	{
-		udf_release_data(goal_bh);
+		brelse(goal_epos.bh);
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
 		return 0;
 	}
 
 	if (goal_elen)
-		udf_write_aext(table, goal_bloc, &goal_extoffset, goal_eloc, goal_elen, goal_bh, 1);
+		udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1);
 	else
-		udf_delete_aext(table, goal_bloc, goal_extoffset, goal_eloc, goal_elen, goal_bh);
-	udf_release_data(goal_bh);
+		udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
+	brelse(goal_epos.bh);
 
 	if (UDF_SB_LVIDBH(sb))
 	{
@@ -921,11 +915,14 @@ inline int udf_new_block(struct super_block * sb,
 	struct inode * inode,
 	uint16_t partition, uint32_t goal, int *err)
 {
+	int ret;
+
 	if (UDF_SB_PARTFLAGS(sb, partition) & UDF_PART_FLAG_UNALLOC_BITMAP)
 	{
-		return udf_bitmap_new_block(sb, inode,
+		ret = udf_bitmap_new_block(sb, inode,
 			UDF_SB_PARTMAPS(sb)[partition].s_uspace.s_bitmap,
 			partition, goal, err);
+		return ret;
 	}
 	else if (UDF_SB_PARTFLAGS(sb, partition) & UDF_PART_FLAG_UNALLOC_TABLE)
 	{
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2391c9150c49..e45f86b5e7b0 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -111,11 +111,13 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d
 	uint16_t liu;
 	uint8_t lfi;
 	loff_t size = (udf_ext0_offset(dir) + dir->i_size) >> 2;
-	struct buffer_head * bh = NULL, * tmp, * bha[16];
-	kernel_lb_addr bloc, eloc;
-	uint32_t extoffset, elen, offset;
+	struct buffer_head *tmp, *bha[16];
+	kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
 	int i, num;
 	unsigned int dt_type;
+	struct extent_position epos = { NULL, 0, {0, 0}};
 
 	if (nf_pos >= size)
 		return 0;
@@ -127,23 +129,22 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d
 	if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_IN_ICB)
 		fibh.sbh = fibh.ebh = NULL;
 	else if (inode_bmap(dir, nf_pos >> (dir->i_sb->s_blocksize_bits - 2),
-		&bloc, &extoffset, &eloc, &elen, &offset, &bh) == (EXT_RECORDED_ALLOCATED >> 30))
+		&epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30))
 	{
-		offset >>= dir->i_sb->s_blocksize_bits;
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen)
 		{
 			if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_SHORT)
-				extoffset -= sizeof(short_ad);
+				epos.offset -= sizeof(short_ad);
 			else if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_LONG)
-				extoffset -= sizeof(long_ad);
+				epos.offset -= sizeof(long_ad);
 		}
 		else
 			offset = 0;
 
 		if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block)))
 		{
-			udf_release_data(bh);
+			brelse(epos.bh);
 			return -EIO;
 		}
 	
@@ -171,7 +172,7 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d
 	}
 	else
 	{
-		udf_release_data(bh);
+		brelse(epos.bh);
 		return -ENOENT;
 	}
 
@@ -179,14 +180,14 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d
 	{
 		filp->f_pos = nf_pos + 1;
 
-		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &bloc, &extoffset, &eloc, &elen, &offset, &bh);
+		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset);
 
 		if (!fi)
 		{
 			if (fibh.sbh != fibh.ebh)
-				udf_release_data(fibh.ebh);
-			udf_release_data(fibh.sbh);
-			udf_release_data(bh);
+				brelse(fibh.ebh);
+			brelse(fibh.sbh);
+			brelse(epos.bh);
 			return 0;
 		}
 
@@ -244,9 +245,9 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d
 			if (filldir(dirent, fname, flen, filp->f_pos, iblock, dt_type) < 0)
 			{
 				if (fibh.sbh != fibh.ebh)
-					udf_release_data(fibh.ebh);
-				udf_release_data(fibh.sbh);
-				udf_release_data(bh);
+					brelse(fibh.ebh);
+				brelse(fibh.sbh);
+				brelse(epos.bh);
 	 			return 0;
 			}
 		}
@@ -255,9 +256,9 @@ do_udf_readdir(struct inode * dir, struct file *filp, filldir_t filldir, void *d
 	filp->f_pos = nf_pos + 1;
 
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
-	udf_release_data(bh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	brelse(epos.bh);
 
 	return 0;
 }
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index fe751a2a0e47..198caa33027a 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -36,14 +36,14 @@ udf_filead_read(struct inode *dir, uint8_t *tmpad, uint8_t ad_size,
 
 	if (!ad)
 	{
-		udf_release_data(*bh);
+		brelse(*bh);
 		*error = 1;
 		return NULL;
 	}
 
 	if (*offset == dir->i_sb->s_blocksize)
 	{
-		udf_release_data(*bh);
+		brelse(*bh);
 		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
 		if (!block)
 			return NULL;
@@ -57,7 +57,7 @@ udf_filead_read(struct inode *dir, uint8_t *tmpad, uint8_t ad_size,
 		remainder = dir->i_sb->s_blocksize - loffset;
 		memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
 
-		udf_release_data(*bh);
+		brelse(*bh);
 		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
 		if (!block)
 			return NULL;
@@ -75,9 +75,9 @@ struct fileIdentDesc *
 udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 	struct udf_fileident_bh *fibh,
 	struct fileIdentDesc *cfi,
-	kernel_lb_addr *bloc, uint32_t *extoffset, 
+	struct extent_position *epos,
 	kernel_lb_addr *eloc, uint32_t *elen,
-	uint32_t *offset, struct buffer_head **bh)
+	sector_t *offset)
 {
 	struct fileIdentDesc *fi;
 	int i, num, block;
@@ -105,13 +105,11 @@ udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 
 	if (fibh->eoffset == dir->i_sb->s_blocksize)
 	{
-		int lextoffset = *extoffset;
+		int lextoffset = epos->offset;
 
-		if (udf_next_aext(dir, bloc, extoffset, eloc, elen, bh, 1) !=
+		if (udf_next_aext(dir, epos, eloc, elen, 1) !=
 			(EXT_RECORDED_ALLOCATED >> 30))
-		{
 			return NULL;
-		}
 
 		block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
 
@@ -120,9 +118,9 @@ udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 		if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen)
 			*offset = 0;
 		else
-			*extoffset = lextoffset;
+			epos->offset = lextoffset;
 
-		udf_release_data(fibh->sbh);
+		brelse(fibh->sbh);
 		if (!(fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block)))
 			return NULL;
 		fibh->soffset = fibh->eoffset = 0;
@@ -151,7 +149,7 @@ udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 	}
 	else if (fibh->sbh != fibh->ebh)
 	{
-		udf_release_data(fibh->sbh);
+		brelse(fibh->sbh);
 		fibh->sbh = fibh->ebh;
 	}
 
@@ -169,13 +167,11 @@ udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 	}
 	else if (fibh->eoffset > dir->i_sb->s_blocksize)
 	{
-		int lextoffset = *extoffset;
+		int lextoffset = epos->offset;
 
-		if (udf_next_aext(dir, bloc, extoffset, eloc, elen, bh, 1) !=
+		if (udf_next_aext(dir, epos, eloc, elen, 1) !=
 			(EXT_RECORDED_ALLOCATED >> 30))
-		{
 			return NULL;
-		}
 
 		block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
 
@@ -184,7 +180,7 @@ udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 		if ((*offset << dir->i_sb->s_blocksize_bits) >= *elen)
 			*offset = 0;
 		else
-			*extoffset = lextoffset;
+			epos->offset = lextoffset;
 
 		fibh->soffset -= dir->i_sb->s_blocksize;
 		fibh->eoffset -= dir->i_sb->s_blocksize;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 40d5047defea..51b5764685e7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -36,6 +36,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
+#include <linux/aio.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
index 5887d78cde43..6ded93e7c44f 100644
--- a/fs/udf/fsync.c
+++ b/fs/udf/fsync.c
@@ -21,7 +21,6 @@
 #include "udfdecl.h"
 
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 
 static int udf_fsync_inode(struct inode *, int);
 
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ae21a0e59e95..1f0129405cf4 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -49,10 +49,10 @@ MODULE_LICENSE("GPL");
 static mode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
 static void udf_fill_inode(struct inode *, struct buffer_head *);
-static struct buffer_head *inode_getblk(struct inode *, long, int *,
+static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
 	long *, int *);
-static int8_t udf_insert_aext(struct inode *, kernel_lb_addr, int,
-	kernel_lb_addr, uint32_t, struct buffer_head *);
+static int8_t udf_insert_aext(struct inode *, struct extent_position,
+	kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
 	kernel_long_ad [EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
@@ -61,7 +61,7 @@ static void udf_merge_extents(struct inode *,
 	 kernel_long_ad [EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
 	kernel_long_ad [EXTENT_MERGE_SIZE], int, int,
-	kernel_lb_addr, uint32_t, struct buffer_head **);
+	struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 /*
@@ -194,10 +194,11 @@ void udf_expand_file_adinicb(struct inode * inode, int newsize, int * err)
 struct buffer_head * udf_expand_dir_adinicb(struct inode *inode, int *block, int *err)
 {
 	int newblock;
-	struct buffer_head *sbh = NULL, *dbh = NULL;
-	kernel_lb_addr bloc, eloc;
-	uint32_t elen, extoffset;
+	struct buffer_head *dbh = NULL;
+	kernel_lb_addr eloc;
+	uint32_t elen;
 	uint8_t alloctype;
+	struct extent_position epos;
 
 	struct udf_fileident_bh sfibh, dfibh;
 	loff_t f_pos = udf_ext0_offset(inode) >> 2;
@@ -237,16 +238,16 @@ struct buffer_head * udf_expand_dir_adinicb(struct inode *inode, int *block, int
 	mark_buffer_dirty_inode(dbh, inode);
 
 	sfibh.soffset = sfibh.eoffset = (f_pos & ((inode->i_sb->s_blocksize - 1) >> 2)) << 2;
-	sbh = sfibh.sbh = sfibh.ebh = NULL;
+	sfibh.sbh = sfibh.ebh = NULL;
 	dfibh.soffset = dfibh.eoffset = 0;
 	dfibh.sbh = dfibh.ebh = dbh;
 	while ( (f_pos < size) )
 	{
 		UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
-		sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, NULL, NULL, NULL, NULL, NULL);
+		sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, NULL, NULL, NULL);
 		if (!sfi)
 		{
-			udf_release_data(dbh);
+			brelse(dbh);
 			return NULL;
 		}
 		UDF_I_ALLOCTYPE(inode) = alloctype;
@@ -258,7 +259,7 @@ struct buffer_head * udf_expand_dir_adinicb(struct inode *inode, int *block, int
 			sfi->fileIdent + le16_to_cpu(sfi->lengthOfImpUse)))
 		{
 			UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
-			udf_release_data(dbh);
+			brelse(dbh);
 			return NULL;
 		}
 	}
@@ -266,16 +267,17 @@ struct buffer_head * udf_expand_dir_adinicb(struct inode *inode, int *block, int
 
 	memset(UDF_I_DATA(inode) + UDF_I_LENEATTR(inode), 0, UDF_I_LENALLOC(inode));
 	UDF_I_LENALLOC(inode) = 0;
-	bloc = UDF_I_LOCATION(inode);
 	eloc.logicalBlockNum = *block;
 	eloc.partitionReferenceNum = UDF_I_LOCATION(inode).partitionReferenceNum;
 	elen = inode->i_size;
 	UDF_I_LENEXTENTS(inode) = elen;
-	extoffset = udf_file_entry_alloc_offset(inode);
-	udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &sbh, 0);
+	epos.bh = NULL;
+	epos.block = UDF_I_LOCATION(inode);
+	epos.offset = udf_file_entry_alloc_offset(inode);
+	udf_add_aext(inode, &epos, eloc, elen, 0);
 	/* UniqueID stuff */
 
-	udf_release_data(sbh);
+	brelse(epos.bh);
 	mark_inode_dirty(inode);
 	return dbh;
 }
@@ -354,53 +356,153 @@ udf_getblk(struct inode *inode, long block, int create, int *err)
 	return NULL;
 }
 
-static struct buffer_head * inode_getblk(struct inode * inode, long block,
+/* Extend the file by 'blocks' blocks, return the number of extents added */
+int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
+	kernel_long_ad *last_ext, sector_t blocks)
+{
+	sector_t add;
+	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
+	struct super_block *sb = inode->i_sb;
+	kernel_lb_addr prealloc_loc = {0, 0};
+	int prealloc_len = 0;
+
+	/* The previous extent is fake and we should not extend by anything
+	 * - there's nothing to do... */
+	if (!blocks && fake)
+		return 0;
+	/* Round the last extent up to a multiple of block size */
+	if (last_ext->extLength & (sb->s_blocksize - 1)) {
+		last_ext->extLength =
+			(last_ext->extLength & UDF_EXTENT_FLAG_MASK) |
+			(((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) +
+				sb->s_blocksize - 1) & ~(sb->s_blocksize - 1));
+		UDF_I_LENEXTENTS(inode) =
+			(UDF_I_LENEXTENTS(inode) + sb->s_blocksize - 1) &
+				~(sb->s_blocksize - 1);
+	}
+	/* Last extent are just preallocated blocks? */
+	if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_ALLOCATED) {
+		/* Save the extent so that we can reattach it to the end */
+		prealloc_loc = last_ext->extLocation;
+		prealloc_len = last_ext->extLength;
+		/* Mark the extent as a hole */
+		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+			(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
+		last_ext->extLocation.logicalBlockNum = 0;
+       		last_ext->extLocation.partitionReferenceNum = 0;
+	}
+	/* Can we merge with the previous extent? */
+	if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) {
+		add = ((1<<30) - sb->s_blocksize - (last_ext->extLength &
+			UDF_EXTENT_LENGTH_MASK)) >> sb->s_blocksize_bits;
+		if (add > blocks)
+			add = blocks;
+		blocks -= add;
+		last_ext->extLength += add << sb->s_blocksize_bits;
+	}
+
+	if (fake) {
+		udf_add_aext(inode, last_pos, last_ext->extLocation,
+			last_ext->extLength, 1);
+		count++;
+	}
+	else
+		udf_write_aext(inode, last_pos, last_ext->extLocation, last_ext->extLength, 1);
+	/* Managed to do everything necessary? */
+	if (!blocks)
+		goto out;
+
+	/* All further extents will be NOT_RECORDED_NOT_ALLOCATED */
+	last_ext->extLocation.logicalBlockNum = 0;
+       	last_ext->extLocation.partitionReferenceNum = 0;
+	add = (1 << (30-sb->s_blocksize_bits)) - 1;
+	last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (add << sb->s_blocksize_bits);
+	/* Create enough extents to cover the whole hole */
+	while (blocks > add) {
+		blocks -= add;
+		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+			last_ext->extLength, 1) == -1)
+			return -1;
+		count++;
+	}
+	if (blocks) {
+		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+			(blocks << sb->s_blocksize_bits);
+		if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+			last_ext->extLength, 1) == -1)
+			return -1;
+		count++;
+	}
+out:
+	/* Do we have some preallocated blocks saved? */
+	if (prealloc_len) {
+		if (udf_add_aext(inode, last_pos, prealloc_loc, prealloc_len, 1) == -1)
+			return -1;
+		last_ext->extLocation = prealloc_loc;
+		last_ext->extLength = prealloc_len;
+		count++;
+	}
+	/* last_pos should point to the last written extent... */
+	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_SHORT)
+		last_pos->offset -= sizeof(short_ad);
+	else if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_LONG)
+		last_pos->offset -= sizeof(long_ad);
+	else
+		return -1;
+	return count;
+}
+
+static struct buffer_head * inode_getblk(struct inode * inode, sector_t block,
 	int *err, long *phys, int *new)
 {
-	struct buffer_head *pbh = NULL, *cbh = NULL, *nbh = NULL, *result = NULL;
+	static sector_t last_block;
+	struct buffer_head *result = NULL;
 	kernel_long_ad laarr[EXTENT_MERGE_SIZE];
-	uint32_t pextoffset = 0, cextoffset = 0, nextoffset = 0;
+	struct extent_position prev_epos, cur_epos, next_epos;
 	int count = 0, startnum = 0, endnum = 0;
-	uint32_t elen = 0;
-	kernel_lb_addr eloc, pbloc, cbloc, nbloc;
+	uint32_t elen = 0, tmpelen;
+	kernel_lb_addr eloc, tmpeloc;
 	int c = 1;
-	uint64_t lbcount = 0, b_off = 0;
-	uint32_t newblocknum, newblock, offset = 0;
+	loff_t lbcount = 0, b_off = 0;
+	uint32_t newblocknum, newblock;
+	sector_t offset = 0;
 	int8_t etype;
 	int goal = 0, pgoal = UDF_I_LOCATION(inode).logicalBlockNum;
-	char lastblock = 0;
+	int lastblock = 0;
 
-	pextoffset = cextoffset = nextoffset = udf_file_entry_alloc_offset(inode);
-	b_off = (uint64_t)block << inode->i_sb->s_blocksize_bits;
-	pbloc = cbloc = nbloc = UDF_I_LOCATION(inode);
+	prev_epos.offset = udf_file_entry_alloc_offset(inode);
+	prev_epos.block = UDF_I_LOCATION(inode);
+	prev_epos.bh = NULL;
+	cur_epos = next_epos = prev_epos;
+	b_off = (loff_t)block << inode->i_sb->s_blocksize_bits;
 
 	/* find the extent which contains the block we are looking for.
        alternate between laarr[0] and laarr[1] for locations of the
        current extent, and the previous extent */
 	do
 	{
-		if (pbh != cbh)
+		if (prev_epos.bh != cur_epos.bh)
 		{
-			udf_release_data(pbh);
-			atomic_inc(&cbh->b_count);
-			pbh = cbh;
+			brelse(prev_epos.bh);
+			get_bh(cur_epos.bh);
+			prev_epos.bh = cur_epos.bh;
 		}
-		if (cbh != nbh)
+		if (cur_epos.bh != next_epos.bh)
 		{
-			udf_release_data(cbh);
-			atomic_inc(&nbh->b_count);
-			cbh = nbh;
+			brelse(cur_epos.bh);
+			get_bh(next_epos.bh);
+			cur_epos.bh = next_epos.bh;
 		}
 
 		lbcount += elen;
 
-		pbloc = cbloc;
-		cbloc = nbloc;
+		prev_epos.block = cur_epos.block;
+		cur_epos.block = next_epos.block;
 
-		pextoffset = cextoffset;
-		cextoffset = nextoffset;
+		prev_epos.offset = cur_epos.offset;
+		cur_epos.offset = next_epos.offset;
 
-		if ((etype = udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1)) == -1)
+		if ((etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1)) == -1)
 			break;
 
 		c = !c;
@@ -418,6 +520,12 @@ static struct buffer_head * inode_getblk(struct inode * inode, long block,
 
 	b_off -= lbcount;
 	offset = b_off >> inode->i_sb->s_blocksize_bits;
+	/*
+	 * Move prev_epos and cur_epos into indirect extent if we are at
+	 * the pointer to it
+	 */
+	udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0);
+	udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0);
 
 	/* if the extent is allocated and recorded, return the block
        if the extent is not a multiple of the blocksize, round up */
@@ -429,54 +537,77 @@ static struct buffer_head * inode_getblk(struct inode * inode, long block,
 			elen = EXT_RECORDED_ALLOCATED |
 				((elen + inode->i_sb->s_blocksize - 1) &
 				~(inode->i_sb->s_blocksize - 1));
-			etype = udf_write_aext(inode, nbloc, &cextoffset, eloc, elen, nbh, 1);
+			etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
 		}
-		udf_release_data(pbh);
-		udf_release_data(cbh);
-		udf_release_data(nbh);
+		brelse(prev_epos.bh);
+		brelse(cur_epos.bh);
+		brelse(next_epos.bh);
 		newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
 		*phys = newblock;
 		return NULL;
 	}
 
+	last_block = block;
+	/* Are we beyond EOF? */
 	if (etype == -1)
 	{
-		endnum = startnum = ((count > 1) ? 1 : count);
-		if (laarr[c].extLength & (inode->i_sb->s_blocksize - 1))
-		{
-			laarr[c].extLength =
-				(laarr[c].extLength & UDF_EXTENT_FLAG_MASK) |
-				(((laarr[c].extLength & UDF_EXTENT_LENGTH_MASK) +
-					inode->i_sb->s_blocksize - 1) &
-				~(inode->i_sb->s_blocksize - 1));
-			UDF_I_LENEXTENTS(inode) =
-				(UDF_I_LENEXTENTS(inode) + inode->i_sb->s_blocksize - 1) &
-					~(inode->i_sb->s_blocksize - 1);
+		int ret;
+
+		if (count) {
+			if (c)
+				laarr[0] = laarr[1];
+			startnum = 1;
+		}
+		else {
+			/* Create a fake extent when there's not one */
+			memset(&laarr[0].extLocation, 0x00, sizeof(kernel_lb_addr));
+			laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
+			/* Will udf_extend_file() create real extent from a fake one? */
+			startnum = (offset > 0);
+		}
+		/* Create extents for the hole between EOF and offset */
+		ret = udf_extend_file(inode, &prev_epos, laarr, offset);
+		if (ret == -1) {
+			brelse(prev_epos.bh);
+			brelse(cur_epos.bh);
+			brelse(next_epos.bh);
+			/* We don't really know the error here so we just make
+			 * something up */
+			*err = -ENOSPC;
+			return NULL;
 		}
-		c = !c;
-		laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
-			((offset + 1) << inode->i_sb->s_blocksize_bits);
-		memset(&laarr[c].extLocation, 0x00, sizeof(kernel_lb_addr));
-		count ++;
-		endnum ++;
+		c = 0;
+		offset = 0;
+		count += ret;
+		/* We are not covered by a preallocated extent? */
+		if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != EXT_NOT_RECORDED_ALLOCATED) {
+			/* Is there any real extent? - otherwise we overwrite
+			 * the fake one... */
+			if (count)
+				c = !c;
+			laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+				inode->i_sb->s_blocksize;
+			memset(&laarr[c].extLocation, 0x00, sizeof(kernel_lb_addr));
+			count ++;
+			endnum ++;
+		}
+		endnum = c+1;
 		lastblock = 1;
 	}
-	else
+	else {
 		endnum = startnum = ((count > 2) ? 2 : count);
 
-	/* if the current extent is in position 0, swap it with the previous */
-	if (!c && count != 1)
-	{
-		laarr[2] = laarr[0];
-		laarr[0] = laarr[1];
-		laarr[1] = laarr[2];
-		c = 1;
-	}
+		/* if the current extent is in position 0, swap it with the previous */
+		if (!c && count != 1)
+		{
+			laarr[2] = laarr[0];
+			laarr[0] = laarr[1];
+			laarr[1] = laarr[2];
+			c = 1;
+		}
 
-	/* if the current block is located in a extent, read the next extent */
-	if (etype != -1)
-	{
-		if ((etype = udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 0)) != -1)
+		/* if the current block is located in an extent, read the next extent */
+		if ((etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0)) != -1)
 		{
 			laarr[c+1].extLength = (etype << 30) | elen;
 			laarr[c+1].extLocation = eloc;
@@ -484,11 +615,10 @@ static struct buffer_head * inode_getblk(struct inode * inode, long block,
 			startnum ++;
 			endnum ++;
 		}
-		else
+		else {
 			lastblock = 1;
+		}
 	}
-	udf_release_data(cbh);
-	udf_release_data(nbh);
 
 	/* if the current extent is not recorded but allocated, get the
 		block in the extent corresponding to the requested block */
@@ -508,7 +638,7 @@ static struct buffer_head * inode_getblk(struct inode * inode, long block,
 		if (!(newblocknum = udf_new_block(inode->i_sb, inode,
 			UDF_I_LOCATION(inode).partitionReferenceNum, goal, err)))
 		{
-			udf_release_data(pbh);
+			brelse(prev_epos.bh);
 			*err = -ENOSPC;
 			return NULL;
 		}
@@ -529,11 +659,11 @@ static struct buffer_head * inode_getblk(struct inode * inode, long block,
 	udf_merge_extents(inode, laarr, &endnum);
 
 	/* write back the new extents, inserting new extents if the new number
-       of extents is greater than the old number, and deleting extents if
-       the new number of extents is less than the old number */
-	udf_update_extents(inode, laarr, startnum, endnum, pbloc, pextoffset, &pbh);
+	of extents is greater than the old number, and deleting extents if
+	the new number of extents is less than the old number */
+	udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
 
-	udf_release_data(pbh);
+	brelse(prev_epos.bh);
 
 	if (!(newblock = udf_get_pblock(inode->i_sb, newblocknum,
 		UDF_I_LOCATION(inode).partitionReferenceNum, 0)))
@@ -795,7 +925,7 @@ static void udf_merge_extents(struct inode *inode,
 
 static void udf_update_extents(struct inode *inode,
 	kernel_long_ad laarr[EXTENT_MERGE_SIZE], int startnum, int endnum,
-	kernel_lb_addr pbloc, uint32_t pextoffset, struct buffer_head **pbh)
+	struct extent_position *epos)
 {
 	int start = 0, i;
 	kernel_lb_addr tmploc;
@@ -804,28 +934,26 @@ static void udf_update_extents(struct inode *inode,
 	if (startnum > endnum)
 	{
 		for (i=0; i<(startnum-endnum); i++)
-		{
-			udf_delete_aext(inode, pbloc, pextoffset, laarr[i].extLocation,
-				laarr[i].extLength, *pbh);
-		}
+			udf_delete_aext(inode, *epos, laarr[i].extLocation,
+				laarr[i].extLength);
 	}
 	else if (startnum < endnum)
 	{
 		for (i=0; i<(endnum-startnum); i++)
 		{
-			udf_insert_aext(inode, pbloc, pextoffset, laarr[i].extLocation,
-				laarr[i].extLength, *pbh);
-			udf_next_aext(inode, &pbloc, &pextoffset, &laarr[i].extLocation,
-				&laarr[i].extLength, pbh, 1);
+			udf_insert_aext(inode, *epos, laarr[i].extLocation,
+				laarr[i].extLength);
+			udf_next_aext(inode, epos, &laarr[i].extLocation,
+				&laarr[i].extLength, 1);
 			start ++;
 		}
 	}
 
 	for (i=start; i<endnum; i++)
 	{
-		udf_next_aext(inode, &pbloc, &pextoffset, &tmploc, &tmplen, pbh, 0);
-		udf_write_aext(inode, pbloc, &pextoffset, laarr[i].extLocation,
-			laarr[i].extLength, *pbh, 1);
+		udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
+		udf_write_aext(inode, epos, laarr[i].extLocation,
+			laarr[i].extLength, 1);
 	}
 }
 
@@ -931,7 +1059,7 @@ __udf_read_inode(struct inode *inode)
 	{
 		printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed ident=%d\n",
 			inode->i_ino, ident);
-		udf_release_data(bh);
+		brelse(bh);
 		make_bad_inode(inode);
 		return;
 	}
@@ -960,35 +1088,36 @@ __udf_read_inode(struct inode *inode)
 						ident == TAG_IDENT_EFE)
 					{
 						memcpy(&UDF_I_LOCATION(inode), &loc, sizeof(kernel_lb_addr));
-						udf_release_data(bh);
-						udf_release_data(ibh);
-						udf_release_data(nbh);
+						brelse(bh);
+						brelse(ibh);
+						brelse(nbh);
 						__udf_read_inode(inode);
 						return;
 					}
 					else
 					{
-						udf_release_data(nbh);
-						udf_release_data(ibh);
+						brelse(nbh);
+						brelse(ibh);
 					}
 				}
 				else
-					udf_release_data(ibh);
+					brelse(ibh);
 			}
 		}
 		else
-			udf_release_data(ibh);
+			brelse(ibh);
 	}
 	else if (le16_to_cpu(fe->icbTag.strategyType) != 4)
 	{
 		printk(KERN_ERR "udf: unsupported strategy type: %d\n",
 			le16_to_cpu(fe->icbTag.strategyType));
-		udf_release_data(bh);
+		brelse(bh);
 		make_bad_inode(inode);
 		return;
 	}
 	udf_fill_inode(inode, bh);
-	udf_release_data(bh);
+
+	brelse(bh);
 }
 
 static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
@@ -1331,7 +1460,7 @@ udf_update_inode(struct inode *inode, int do_sync)
 				use->descTag.tagChecksum += ((uint8_t *)&(use->descTag))[i];
 
 		mark_buffer_dirty(bh);
-		udf_release_data(bh);
+		brelse(bh);
 		return err;
 	}
 
@@ -1520,7 +1649,7 @@ udf_update_inode(struct inode *inode, int do_sync)
 			err = -EIO;
 		}
 	}
-	udf_release_data(bh);
+	brelse(bh);
 	return err;
 }
 
@@ -1556,8 +1685,8 @@ udf_iget(struct super_block *sb, kernel_lb_addr ino)
 	return NULL;
 }
 
-int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
-	kernel_lb_addr eloc, uint32_t elen, struct buffer_head **bh, int inc)
+int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
+	kernel_lb_addr eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	short_ad *sad = NULL;
@@ -1566,10 +1695,10 @@ int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 	int8_t etype;
 	uint8_t *ptr;
 
-	if (!*bh)
-		ptr = UDF_I_DATA(inode) + *extoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode);
+	if (!epos->bh)
+		ptr = UDF_I_DATA(inode) + epos->offset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode);
 	else
-		ptr = (*bh)->b_data + *extoffset;
+		ptr = epos->bh->b_data + epos->offset;
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_SHORT)
 		adsize = sizeof(short_ad);
@@ -1578,20 +1707,20 @@ int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 	else
 		return -1;
 
-	if (*extoffset + (2 * adsize) > inode->i_sb->s_blocksize)
+	if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize)
 	{
 		char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
-		kernel_lb_addr obloc = *bloc;
+		kernel_lb_addr obloc = epos->block;
 
-		if (!(bloc->logicalBlockNum = udf_new_block(inode->i_sb, NULL,
+		if (!(epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
 			obloc.partitionReferenceNum, obloc.logicalBlockNum, &err)))
 		{
 			return -1;
 		}
 		if (!(nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-			*bloc, 0))))
+			epos->block, 0))))
 		{
 			return -1;
 		}
@@ -1604,25 +1733,25 @@ int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 		aed = (struct allocExtDesc *)(nbh->b_data);
 		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
 			aed->previousAllocExtLocation = cpu_to_le32(obloc.logicalBlockNum);
-		if (*extoffset + adsize > inode->i_sb->s_blocksize)
+		if (epos->offset + adsize > inode->i_sb->s_blocksize)
 		{
-			loffset = *extoffset;
+			loffset = epos->offset;
 			aed->lengthAllocDescs = cpu_to_le32(adsize);
 			sptr = ptr - adsize;
 			dptr = nbh->b_data + sizeof(struct allocExtDesc);
 			memcpy(dptr, sptr, adsize);
-			*extoffset = sizeof(struct allocExtDesc) + adsize;
+			epos->offset = sizeof(struct allocExtDesc) + adsize;
 		}
 		else
 		{
-			loffset = *extoffset + adsize;
+			loffset = epos->offset + adsize;
 			aed->lengthAllocDescs = cpu_to_le32(0);
 			sptr = ptr;
-			*extoffset = sizeof(struct allocExtDesc);
+			epos->offset = sizeof(struct allocExtDesc);
 
-			if (*bh)
+			if (epos->bh)
 			{
-				aed = (struct allocExtDesc *)(*bh)->b_data;
+				aed = (struct allocExtDesc *)epos->bh->b_data;
 				aed->lengthAllocDescs =
 					cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize);
 			}
@@ -1634,10 +1763,10 @@ int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 		}
 		if (UDF_SB_UDFREV(inode->i_sb) >= 0x0200)
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-				bloc->logicalBlockNum, sizeof(tag));
+				epos->block.logicalBlockNum, sizeof(tag));
 		else
 			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-				bloc->logicalBlockNum, sizeof(tag));
+				epos->block.logicalBlockNum, sizeof(tag));
 		switch (UDF_I_ALLOCTYPE(inode))
 		{
 			case ICBTAG_FLAG_AD_SHORT:
@@ -1646,7 +1775,7 @@ int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 				sad->extLength = cpu_to_le32(
 					EXT_NEXT_EXTENT_ALLOCDECS |
 					inode->i_sb->s_blocksize);
-				sad->extPosition = cpu_to_le32(bloc->logicalBlockNum);
+				sad->extPosition = cpu_to_le32(epos->block.logicalBlockNum);
 				break;
 			}
 			case ICBTAG_FLAG_AD_LONG:
@@ -1655,60 +1784,57 @@ int8_t udf_add_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 				lad->extLength = cpu_to_le32(
 					EXT_NEXT_EXTENT_ALLOCDECS |
 					inode->i_sb->s_blocksize);
-				lad->extLocation = cpu_to_lelb(*bloc);
+				lad->extLocation = cpu_to_lelb(epos->block);
 				memset(lad->impUse, 0x00, sizeof(lad->impUse));
 				break;
 			}
 		}
-		if (*bh)
+		if (epos->bh)
 		{
 			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-				udf_update_tag((*bh)->b_data, loffset);
+				udf_update_tag(epos->bh->b_data, loffset);
 			else
-				udf_update_tag((*bh)->b_data, sizeof(struct allocExtDesc));
-			mark_buffer_dirty_inode(*bh, inode);
-			udf_release_data(*bh);
+				udf_update_tag(epos->bh->b_data, sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(epos->bh, inode);
+			brelse(epos->bh);
 		}
 		else
 			mark_inode_dirty(inode);
-		*bh = nbh;
+		epos->bh = nbh;
 	}
 
-	etype = udf_write_aext(inode, *bloc, extoffset, eloc, elen, *bh, inc);
+	etype = udf_write_aext(inode, epos, eloc, elen, inc);
 
-	if (!*bh)
+	if (!epos->bh)
 	{
 		UDF_I_LENALLOC(inode) += adsize;
 		mark_inode_dirty(inode);
 	}
 	else
 	{
-		aed = (struct allocExtDesc *)(*bh)->b_data;
+		aed = (struct allocExtDesc *)epos->bh->b_data;
 		aed->lengthAllocDescs =
 			cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize);
 		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-			udf_update_tag((*bh)->b_data, *extoffset + (inc ? 0 : adsize));
+			udf_update_tag(epos->bh->b_data, epos->offset + (inc ? 0 : adsize));
 		else
-			udf_update_tag((*bh)->b_data, sizeof(struct allocExtDesc));
-		mark_buffer_dirty_inode(*bh, inode);
+			udf_update_tag(epos->bh->b_data, sizeof(struct allocExtDesc));
+		mark_buffer_dirty_inode(epos->bh, inode);
 	}
 
 	return etype;
 }
 
-int8_t udf_write_aext(struct inode *inode, kernel_lb_addr bloc, int *extoffset,
-    kernel_lb_addr eloc, uint32_t elen, struct buffer_head *bh, int inc)
+int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
+    kernel_lb_addr eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
 
-	if (!bh)
-		ptr = UDF_I_DATA(inode) + *extoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode);
+	if (!epos->bh)
+		ptr = UDF_I_DATA(inode) + epos->offset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode);
 	else
-	{
-		ptr = bh->b_data + *extoffset;
-		atomic_inc(&bh->b_count);
-	}
+		ptr = epos->bh->b_data + epos->offset;
 
 	switch (UDF_I_ALLOCTYPE(inode))
 	{
@@ -1733,40 +1859,39 @@ int8_t udf_write_aext(struct inode *inode, kernel_lb_addr bloc, int *extoffset,
 			return -1;
 	}
 
-	if (bh)
+	if (epos->bh)
 	{
 		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
 		{
-			struct allocExtDesc *aed = (struct allocExtDesc *)(bh)->b_data;
-			udf_update_tag((bh)->b_data,
+			struct allocExtDesc *aed = (struct allocExtDesc *)epos->bh->b_data;
+			udf_update_tag(epos->bh->b_data,
 				le32_to_cpu(aed->lengthAllocDescs) + sizeof(struct allocExtDesc));
 		}
-		mark_buffer_dirty_inode(bh, inode);
-		udf_release_data(bh);
+		mark_buffer_dirty_inode(epos->bh, inode);
 	}
 	else
 		mark_inode_dirty(inode);
 
 	if (inc)
-		*extoffset += adsize;
+		epos->offset += adsize;
 	return (elen >> 30);
 }
 
-int8_t udf_next_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
-	kernel_lb_addr *eloc, uint32_t *elen, struct buffer_head **bh, int inc)
+int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
+	kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int8_t etype;
 
-	while ((etype = udf_current_aext(inode, bloc, extoffset, eloc, elen, bh, inc)) ==
+	while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
 		(EXT_NEXT_EXTENT_ALLOCDECS >> 30))
 	{
-		*bloc = *eloc;
-		*extoffset = sizeof(struct allocExtDesc);
-		udf_release_data(*bh);
-		if (!(*bh = udf_tread(inode->i_sb, udf_get_lb_pblock(inode->i_sb, *bloc, 0))))
+		epos->block = *eloc;
+		epos->offset = sizeof(struct allocExtDesc);
+		brelse(epos->bh);
+		if (!(epos->bh = udf_tread(inode->i_sb, udf_get_lb_pblock(inode->i_sb, epos->block, 0))))
 		{
 			udf_debug("reading block %d failed!\n",
-				udf_get_lb_pblock(inode->i_sb, *bloc, 0));
+				udf_get_lb_pblock(inode->i_sb, epos->block, 0));
 			return -1;
 		}
 	}
@@ -1774,26 +1899,26 @@ int8_t udf_next_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
 	return etype;
 }
 
-int8_t udf_current_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffset,
-	kernel_lb_addr *eloc, uint32_t *elen, struct buffer_head **bh, int inc)
+int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
+	kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
 	int alen;
 	int8_t etype;
 	uint8_t *ptr;
 
-	if (!*bh)
+	if (!epos->bh)
 	{
-		if (!(*extoffset))
-			*extoffset = udf_file_entry_alloc_offset(inode);
-		ptr = UDF_I_DATA(inode) + *extoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode);
+		if (!epos->offset)
+			epos->offset = udf_file_entry_alloc_offset(inode);
+		ptr = UDF_I_DATA(inode) + epos->offset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode);
 		alen = udf_file_entry_alloc_offset(inode) + UDF_I_LENALLOC(inode);
 	}
 	else
 	{
-		if (!(*extoffset))
-			*extoffset = sizeof(struct allocExtDesc);
-		ptr = (*bh)->b_data + *extoffset;
-		alen = sizeof(struct allocExtDesc) + le32_to_cpu(((struct allocExtDesc *)(*bh)->b_data)->lengthAllocDescs);
+		if (!epos->offset)
+			epos->offset = sizeof(struct allocExtDesc);
+		ptr = epos->bh->b_data + epos->offset;
+		alen = sizeof(struct allocExtDesc) + le32_to_cpu(((struct allocExtDesc *)epos->bh->b_data)->lengthAllocDescs);
 	}
 
 	switch (UDF_I_ALLOCTYPE(inode))
@@ -1802,7 +1927,7 @@ int8_t udf_current_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffse
 		{
 			short_ad *sad;
 
-			if (!(sad = udf_get_fileshortad(ptr, alen, extoffset, inc)))
+			if (!(sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc)))
 				return -1;
 
 			etype = le32_to_cpu(sad->extLength) >> 30;
@@ -1815,7 +1940,7 @@ int8_t udf_current_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffse
 		{
 			long_ad *lad;
 
-			if (!(lad = udf_get_filelongad(ptr, alen, extoffset, inc)))
+			if (!(lad = udf_get_filelongad(ptr, alen, &epos->offset, inc)))
 				return -1;
 
 			etype = le32_to_cpu(lad->extLength) >> 30;
@@ -1834,41 +1959,40 @@ int8_t udf_current_aext(struct inode *inode, kernel_lb_addr *bloc, int *extoffse
 }
 
 static int8_t
-udf_insert_aext(struct inode *inode, kernel_lb_addr bloc, int extoffset,
-		kernel_lb_addr neloc, uint32_t nelen, struct buffer_head *bh)
+udf_insert_aext(struct inode *inode, struct extent_position epos,
+		kernel_lb_addr neloc, uint32_t nelen)
 {
 	kernel_lb_addr oeloc;
 	uint32_t oelen;
 	int8_t etype;
 
-	if (bh)
-		atomic_inc(&bh->b_count);
+	if (epos.bh)
+		get_bh(epos.bh);
 
-	while ((etype = udf_next_aext(inode, &bloc, &extoffset, &oeloc, &oelen, &bh, 0)) != -1)
+	while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1)
 	{
-		udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1);
+		udf_write_aext(inode, &epos, neloc, nelen, 1);
 
 		neloc = oeloc;
 		nelen = (etype << 30) | oelen;
 	}
-	udf_add_aext(inode, &bloc, &extoffset, neloc, nelen, &bh, 1);
-	udf_release_data(bh);
+	udf_add_aext(inode, &epos, neloc, nelen, 1);
+	brelse(epos.bh);
 	return (nelen >> 30);
 }
 
-int8_t udf_delete_aext(struct inode *inode, kernel_lb_addr nbloc, int nextoffset,
-	kernel_lb_addr eloc, uint32_t elen, struct buffer_head *nbh)
+int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
+	kernel_lb_addr eloc, uint32_t elen)
 {
-	struct buffer_head *obh;
-	kernel_lb_addr obloc;
-	int oextoffset, adsize;
+	struct extent_position oepos;
+	int adsize;
 	int8_t etype;
 	struct allocExtDesc *aed;
 
-	if (nbh)
+	if (epos.bh)
 	{
-		atomic_inc(&nbh->b_count);
-		atomic_inc(&nbh->b_count);
+		get_bh(epos.bh);
+		get_bh(epos.bh);
 	}
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_SHORT)
@@ -1878,80 +2002,77 @@ int8_t udf_delete_aext(struct inode *inode, kernel_lb_addr nbloc, int nextoffset
 	else
 		adsize = 0;
 
-	obh = nbh;
-	obloc = nbloc;
-	oextoffset = nextoffset;
-
-	if (udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1) == -1)
+	oepos = epos;
+	if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1)
 		return -1;
 
-	while ((etype = udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1)) != -1)
+	while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1)
 	{
-		udf_write_aext(inode, obloc, &oextoffset, eloc, (etype << 30) | elen, obh, 1);
-		if (obh != nbh)
+		udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+		if (oepos.bh != epos.bh)
 		{
-			obloc = nbloc;
-			udf_release_data(obh);
-			atomic_inc(&nbh->b_count);
-			obh = nbh;
-			oextoffset = nextoffset - adsize;
+			oepos.block = epos.block;
+			brelse(oepos.bh);
+			get_bh(epos.bh);
+			oepos.bh = epos.bh;
+			oepos.offset = epos.offset - adsize;
 		}
 	}
 	memset(&eloc, 0x00, sizeof(kernel_lb_addr));
 	elen = 0;
 
-	if (nbh != obh)
+	if (epos.bh != oepos.bh)
 	{
-		udf_free_blocks(inode->i_sb, inode, nbloc, 0, 1);
-		udf_write_aext(inode, obloc, &oextoffset, eloc, elen, obh, 1);
-		udf_write_aext(inode, obloc, &oextoffset, eloc, elen, obh, 1);
-		if (!obh)
+		udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
+		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		if (!oepos.bh)
 		{
 			UDF_I_LENALLOC(inode) -= (adsize * 2);
 			mark_inode_dirty(inode);
 		}
 		else
 		{
-			aed = (struct allocExtDesc *)(obh)->b_data;
+			aed = (struct allocExtDesc *)oepos.bh->b_data;
 			aed->lengthAllocDescs =
 				cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) - (2*adsize));
 			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-				udf_update_tag((obh)->b_data, oextoffset - (2*adsize));
+				udf_update_tag(oepos.bh->b_data, oepos.offset - (2*adsize));
 			else
-				udf_update_tag((obh)->b_data, sizeof(struct allocExtDesc));
-			mark_buffer_dirty_inode(obh, inode);
+				udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	}
 	else
 	{
-		udf_write_aext(inode, obloc, &oextoffset, eloc, elen, obh, 1);
-		if (!obh)
+		udf_write_aext(inode, &oepos, eloc, elen, 1);
+		if (!oepos.bh)
 		{
 			UDF_I_LENALLOC(inode) -= adsize;
 			mark_inode_dirty(inode);
 		}
 		else
 		{
-			aed = (struct allocExtDesc *)(obh)->b_data;
+			aed = (struct allocExtDesc *)oepos.bh->b_data;
 			aed->lengthAllocDescs =
 				cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) - adsize);
 			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-				udf_update_tag((obh)->b_data, oextoffset - adsize);
+				udf_update_tag(oepos.bh->b_data, epos.offset - adsize);
 			else
-				udf_update_tag((obh)->b_data, sizeof(struct allocExtDesc));
-			mark_buffer_dirty_inode(obh, inode);
+				udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(oepos.bh, inode);
 		}
 	}
 	
-	udf_release_data(nbh);
-	udf_release_data(obh);
+	brelse(epos.bh);
+	brelse(oepos.bh);
 	return (elen >> 30);
 }
 
-int8_t inode_bmap(struct inode *inode, int block, kernel_lb_addr *bloc, uint32_t *extoffset,
-	kernel_lb_addr *eloc, uint32_t *elen, uint32_t *offset, struct buffer_head **bh)
+int8_t inode_bmap(struct inode *inode, sector_t block, struct extent_position *pos,
+	kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset)
 {
-	uint64_t lbcount = 0, bcount = (uint64_t)block << inode->i_sb->s_blocksize_bits;
+	loff_t lbcount = 0, bcount = (loff_t)block << inode->i_sb->s_blocksize_bits;
 	int8_t etype;
 
 	if (block < 0)
@@ -1960,42 +2081,44 @@ int8_t inode_bmap(struct inode *inode, int block, kernel_lb_addr *bloc, uint32_t
 		return -1;
 	}
 
-	*extoffset = 0;
+	pos->offset = 0;
+	pos->block = UDF_I_LOCATION(inode);
+	pos->bh = NULL;
 	*elen = 0;
-	*bloc = UDF_I_LOCATION(inode);
 
 	do
 	{
-		if ((etype = udf_next_aext(inode, bloc, extoffset, eloc, elen, bh, 1)) == -1)
+		if ((etype = udf_next_aext(inode, pos, eloc, elen, 1)) == -1)
 		{
-			*offset = bcount - lbcount;
+			*offset = (bcount - lbcount) >> inode->i_sb->s_blocksize_bits;
 			UDF_I_LENEXTENTS(inode) = lbcount;
 			return -1;
 		}
 		lbcount += *elen;
 	} while (lbcount <= bcount);
 
-	*offset = bcount + *elen - lbcount;
+	*offset = (bcount + *elen - lbcount) >> inode->i_sb->s_blocksize_bits;
 
 	return etype;
 }
 
-long udf_block_map(struct inode *inode, long block)
+long udf_block_map(struct inode *inode, sector_t block)
 {
-	kernel_lb_addr eloc, bloc;
-	uint32_t offset, extoffset, elen;
-	struct buffer_head *bh = NULL;
+	kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = { NULL, 0, { 0, 0}};
 	int ret;
 
 	lock_kernel();
 
-	if (inode_bmap(inode, block, &bloc, &extoffset, &eloc, &elen, &offset, &bh) == (EXT_RECORDED_ALLOCATED >> 30))
-		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset >> inode->i_sb->s_blocksize_bits);
+	if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30))
+		ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
 	else
 		ret = 0;
 
 	unlock_kernel();
-	udf_release_data(bh);
+	brelse(epos.bh);
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV))
 		return udf_fixed_to_variable(ret);
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index cc8ca3254db1..a2b2a98ce78a 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -274,12 +274,6 @@ udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc, uint32_t offset, ui
 		loc.logicalBlockNum + offset, ident);
 }
 
-void udf_release_data(struct buffer_head *bh)
-{
-	if (bh)
-		brelse(bh);
-}
-
 void udf_update_tag(char *data, int length)
 {
 	tag *tptr = (tag *)data;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index fe361cd19a98..51fe307dc0ec 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -30,6 +30,7 @@
 #include <linux/quotaops.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/sched.h>
 
 static inline int udf_match(int len1, const char *name1, int len2, const char *name2)
 {
@@ -155,9 +156,10 @@ udf_find_entry(struct inode *dir, struct dentry *dentry,
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
-	kernel_lb_addr bloc, eloc;
-	uint32_t extoffset, elen, offset;
-	struct buffer_head *bh = NULL;
+	kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = { NULL, 0, { 0, 0}};
 
 	size = (udf_ext0_offset(dir) + dir->i_size) >> 2;
 	f_pos = (udf_ext0_offset(dir) >> 2);
@@ -166,42 +168,41 @@ udf_find_entry(struct inode *dir, struct dentry *dentry,
 	if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_IN_ICB)
 		fibh->sbh = fibh->ebh = NULL;
 	else if (inode_bmap(dir, f_pos >> (dir->i_sb->s_blocksize_bits - 2),
-		&bloc, &extoffset, &eloc, &elen, &offset, &bh) == (EXT_RECORDED_ALLOCATED >> 30))
+		&epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30))
 	{
-		offset >>= dir->i_sb->s_blocksize_bits;
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen)
 		{
 			if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_SHORT)
-				extoffset -= sizeof(short_ad);
+				epos.offset -= sizeof(short_ad);
 			else if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_LONG)
-				extoffset -= sizeof(long_ad);
+				epos.offset -= sizeof(long_ad);
 		}
 		else
 			offset = 0;
 
 		if (!(fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block)))
 		{
-			udf_release_data(bh);
+			brelse(epos.bh);
 			return NULL;
 		}
 	}
 	else
 	{
-		udf_release_data(bh);
+		brelse(epos.bh);
 		return NULL;
 	}
 
 	while ( (f_pos < size) )
 	{
-		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &bloc, &extoffset, &eloc, &elen, &offset, &bh);
+		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, &elen, &offset);
 
 		if (!fi)
 		{
 			if (fibh->sbh != fibh->ebh)
-				udf_release_data(fibh->ebh);
-			udf_release_data(fibh->sbh);
-			udf_release_data(bh);
+				brelse(fibh->ebh);
+			brelse(fibh->sbh);
+			brelse(epos.bh);
 			return NULL;
 		}
 
@@ -247,15 +248,15 @@ udf_find_entry(struct inode *dir, struct dentry *dentry,
 		{
 			if (udf_match(flen, fname, dentry->d_name.len, dentry->d_name.name))
 			{
-				udf_release_data(bh);
+				brelse(epos.bh);
 				return fi;
 			}
 		}
 	}
 	if (fibh->sbh != fibh->ebh)
-		udf_release_data(fibh->ebh);
-	udf_release_data(fibh->sbh);
-	udf_release_data(bh);
+		brelse(fibh->ebh);
+	brelse(fibh->sbh);
+	brelse(epos.bh);
 	return NULL;
 }
 
@@ -321,8 +322,8 @@ udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	if (udf_find_entry(dir, dentry, &fibh, &cfi))
 	{
 		if (fibh.sbh != fibh.ebh)
-			udf_release_data(fibh.ebh);
-		udf_release_data(fibh.sbh);
+			brelse(fibh.ebh);
+		brelse(fibh.sbh);
 
 		inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
 		if ( !inode )
@@ -353,9 +354,10 @@ udf_add_entry(struct inode *dir, struct dentry *dentry,
 	uint8_t lfi;
 	uint16_t liu;
 	int block;
-	kernel_lb_addr bloc, eloc;
-	uint32_t extoffset, elen, offset;
-	struct buffer_head *bh = NULL;
+	kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = { NULL, 0, { 0, 0 }};
 
 	sb = dir->i_sb;
 
@@ -384,23 +386,22 @@ udf_add_entry(struct inode *dir, struct dentry *dentry,
 	if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_IN_ICB)
 		fibh->sbh = fibh->ebh = NULL;
 	else if (inode_bmap(dir, f_pos >> (dir->i_sb->s_blocksize_bits - 2),
-		&bloc, &extoffset, &eloc, &elen, &offset, &bh) == (EXT_RECORDED_ALLOCATED >> 30))
+		&epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30))
 	{
-		offset >>= dir->i_sb->s_blocksize_bits;
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen)
 		{
 			if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_SHORT)
-				extoffset -= sizeof(short_ad);
+				epos.offset -= sizeof(short_ad);
 			else if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_LONG)
-				extoffset -= sizeof(long_ad);
+				epos.offset -= sizeof(long_ad);
 		}
 		else
 			offset = 0;
 
 		if (!(fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block)))
 		{
-			udf_release_data(bh);
+			brelse(epos.bh);
 			*err = -EIO;
 			return NULL;
 		}
@@ -418,14 +419,14 @@ udf_add_entry(struct inode *dir, struct dentry *dentry,
 
 	while ( (f_pos < size) )
 	{
-		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &bloc, &extoffset, &eloc, &elen, &offset, &bh);
+		fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, &elen, &offset);
 
 		if (!fi)
 		{
 			if (fibh->sbh != fibh->ebh)
-				udf_release_data(fibh->ebh);
-			udf_release_data(fibh->sbh);
-			udf_release_data(bh);
+				brelse(fibh->ebh);
+			brelse(fibh->sbh);
+			brelse(epos.bh);
 			*err = -EIO;
 			return NULL;
 		}
@@ -455,7 +456,7 @@ udf_add_entry(struct inode *dir, struct dentry *dentry,
 		{
 			if (((sizeof(struct fileIdentDesc) + liu + lfi + 3) & ~3) == nfidlen)
 			{
-				udf_release_data(bh);
+				brelse(epos.bh);
 				cfi->descTag.tagSerialNum = cpu_to_le16(1);
 				cfi->fileVersionNum = cpu_to_le16(1);
 				cfi->fileCharacteristics = 0;
@@ -478,9 +479,9 @@ udf_add_entry(struct inode *dir, struct dentry *dentry,
 			udf_match(flen, fname, dentry->d_name.len, dentry->d_name.name))
 		{
 			if (fibh->sbh != fibh->ebh)
-				udf_release_data(fibh->ebh);
-			udf_release_data(fibh->sbh);
-			udf_release_data(bh);
+				brelse(fibh->ebh);
+			brelse(fibh->sbh);
+			brelse(epos.bh);
 			*err = -EEXIST;
 			return NULL;
 		}
@@ -492,25 +493,25 @@ add:
 	if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_IN_ICB &&
 		sb->s_blocksize - fibh->eoffset < nfidlen)
 	{
-		udf_release_data(bh);
-		bh = NULL;
+		brelse(epos.bh);
+		epos.bh = NULL;
 		fibh->soffset -= udf_ext0_offset(dir);
 		fibh->eoffset -= udf_ext0_offset(dir);
 		f_pos -= (udf_ext0_offset(dir) >> 2);
 		if (fibh->sbh != fibh->ebh)
-			udf_release_data(fibh->ebh);
-		udf_release_data(fibh->sbh);
+			brelse(fibh->ebh);
+		brelse(fibh->sbh);
 		if (!(fibh->sbh = fibh->ebh = udf_expand_dir_adinicb(dir, &block, err)))
 			return NULL;
-		bloc = UDF_I_LOCATION(dir);
+		epos.block = UDF_I_LOCATION(dir);
 		eloc.logicalBlockNum = block;
 		eloc.partitionReferenceNum = UDF_I_LOCATION(dir).partitionReferenceNum;
 		elen = dir->i_sb->s_blocksize;
-		extoffset = udf_file_entry_alloc_offset(dir);
+		epos.offset = udf_file_entry_alloc_offset(dir);
 		if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_SHORT)
-			extoffset += sizeof(short_ad);
+			epos.offset += sizeof(short_ad);
 		else if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_LONG)
-			extoffset += sizeof(long_ad);
+			epos.offset += sizeof(long_ad);
 	}
 
 	if (sb->s_blocksize - fibh->eoffset >= nfidlen)
@@ -519,7 +520,7 @@ add:
 		fibh->eoffset += nfidlen;
 		if (fibh->sbh != fibh->ebh)
 		{
-			udf_release_data(fibh->sbh);
+			brelse(fibh->sbh);
 			fibh->sbh = fibh->ebh;
 		}
 
@@ -541,7 +542,7 @@ add:
 		fibh->eoffset += nfidlen - sb->s_blocksize;
 		if (fibh->sbh != fibh->ebh)
 		{
-			udf_release_data(fibh->sbh);
+			brelse(fibh->sbh);
 			fibh->sbh = fibh->ebh;
 		}
 
@@ -550,14 +551,14 @@ add:
 
 		if (!(fibh->ebh = udf_bread(dir, f_pos >> (dir->i_sb->s_blocksize_bits - 2), 1, err)))
 		{
-			udf_release_data(bh);
-			udf_release_data(fibh->sbh);
+			brelse(epos.bh);
+			brelse(fibh->sbh);
 			return NULL;
 		}
 
 		if (!(fibh->soffset))
 		{
-			if (udf_next_aext(dir, &bloc, &extoffset, &eloc, &elen, &bh, 1) ==
+			if (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
 				(EXT_RECORDED_ALLOCATED >> 30))
 			{
 				block = eloc.logicalBlockNum + ((elen - 1) >>
@@ -566,7 +567,7 @@ add:
 			else
 				block ++;
 
-			udf_release_data(fibh->sbh);
+			brelse(fibh->sbh);
 			fibh->sbh = fibh->ebh;
 			fi = (struct fileIdentDesc *)(fibh->sbh->b_data);
 		}
@@ -587,7 +588,7 @@ add:
 	cfi->lengthOfImpUse = cpu_to_le16(0);
 	if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name))
 	{
-		udf_release_data(bh);
+		brelse(epos.bh);
 		dir->i_size += nfidlen;
 		if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_IN_ICB)
 			UDF_I_LENALLOC(dir) += nfidlen;
@@ -596,10 +597,10 @@ add:
 	}
 	else
 	{
-		udf_release_data(bh);
+		brelse(epos.bh);
 		if (fibh->sbh != fibh->ebh)
-			udf_release_data(fibh->ebh);
-		udf_release_data(fibh->sbh);
+			brelse(fibh->ebh);
+		brelse(fibh->sbh);
 		*err = -EIO;
 		return NULL;
 	}
@@ -656,8 +657,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, struct
 		mark_inode_dirty(dir);
 	}
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 	unlock_kernel();
 	d_instantiate(dentry, inode);
 	return 0;
@@ -701,8 +702,8 @@ static int udf_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_t
 	mark_inode_dirty(inode);
 
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 	d_instantiate(dentry, inode);
 	err = 0;
 out:
@@ -743,7 +744,7 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		cpu_to_le32(UDF_I_UNIQUE(dir) & 0x00000000FFFFFFFFUL);
 	cfi.fileCharacteristics = FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
 	udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
-	udf_release_data(fibh.sbh);
+	brelse(fibh.sbh);
 	inode->i_mode = S_IFDIR | mode;
 	if (dir->i_mode & S_ISGID)
 		inode->i_mode |= S_ISGID;
@@ -766,8 +767,8 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	mark_inode_dirty(dir);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 	err = 0;
 out:
 	unlock_kernel();
@@ -781,9 +782,10 @@ static int empty_dir(struct inode *dir)
 	loff_t f_pos;
 	loff_t size = (udf_ext0_offset(dir) + dir->i_size) >> 2;
 	int block;
-	kernel_lb_addr bloc, eloc;
-	uint32_t extoffset, elen, offset;
-	struct buffer_head *bh = NULL;
+	kernel_lb_addr eloc;
+	uint32_t elen;
+	sector_t offset;
+	struct extent_position epos = { NULL, 0, { 0, 0}};
 
 	f_pos = (udf_ext0_offset(dir) >> 2);
 
@@ -792,59 +794,58 @@ static int empty_dir(struct inode *dir)
 	if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_IN_ICB)
 		fibh.sbh = fibh.ebh = NULL;
 	else if (inode_bmap(dir, f_pos >> (dir->i_sb->s_blocksize_bits - 2),
-		&bloc, &extoffset, &eloc, &elen, &offset, &bh) == (EXT_RECORDED_ALLOCATED >> 30))
+		&epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30))
 	{
-		offset >>= dir->i_sb->s_blocksize_bits;
 		block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
 		if ((++offset << dir->i_sb->s_blocksize_bits) < elen)
 		{
 			if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_SHORT)
-				extoffset -= sizeof(short_ad);
+				epos.offset -= sizeof(short_ad);
 			else if (UDF_I_ALLOCTYPE(dir) == ICBTAG_FLAG_AD_LONG)
-				extoffset -= sizeof(long_ad);
+				epos.offset -= sizeof(long_ad);
 		}
 		else
 			offset = 0;
 
 		if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block)))
 		{
-			udf_release_data(bh);
+			brelse(epos.bh);
 			return 0;
 		}
 	}
 	else
 	{
-		udf_release_data(bh);
+		brelse(epos.bh);
 		return 0;
 	}
 
 
 	while ( (f_pos < size) )
 	{
-		fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &bloc, &extoffset, &eloc, &elen, &offset, &bh);
+		fi = udf_fileident_read(dir, &f_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset);
 
 		if (!fi)
 		{
 			if (fibh.sbh != fibh.ebh)
-				udf_release_data(fibh.ebh);
-			udf_release_data(fibh.sbh);
-			udf_release_data(bh);
+				brelse(fibh.ebh);
+			brelse(fibh.sbh);
+			brelse(epos.bh);
 			return 0;
 		}
 
 		if (cfi.lengthFileIdent && (cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) == 0)
 		{
 			if (fibh.sbh != fibh.ebh)
-				udf_release_data(fibh.ebh);
-			udf_release_data(fibh.sbh);
-			udf_release_data(bh);
+				brelse(fibh.ebh);
+			brelse(fibh.sbh);
+			brelse(epos.bh);
 			return 0;
 		}
 	}
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
-	udf_release_data(bh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
+	brelse(epos.bh);
 	return 1;
 }
 
@@ -878,14 +879,14 @@ static int udf_rmdir(struct inode * dir, struct dentry * dentry)
 			inode->i_nlink);
 	clear_nlink(inode);
 	inode->i_size = 0;
-	inode_dec_link_count(inode);
+	inode_dec_link_count(dir);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
 	mark_inode_dirty(dir);
 
 end_rmdir:
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 out:
 	unlock_kernel();
 	return retval;
@@ -928,8 +929,8 @@ static int udf_unlink(struct inode * dir, struct dentry * dentry)
 
 end_unlink:
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 out:
 	unlock_kernel();
 	return retval;
@@ -941,7 +942,7 @@ static int udf_symlink(struct inode * dir, struct dentry * dentry, const char *
 	struct pathComponent *pc;
 	char *compstart;
 	struct udf_fileident_bh fibh;
-	struct buffer_head *bh = NULL;
+	struct extent_position epos = { NULL,  0, {0, 0}};
 	int eoffset, elen = 0;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
@@ -961,33 +962,33 @@ static int udf_symlink(struct inode * dir, struct dentry * dentry, const char *
 
 	if (UDF_I_ALLOCTYPE(inode) != ICBTAG_FLAG_AD_IN_ICB)
 	{
-		struct buffer_head *bh = NULL;
-		kernel_lb_addr bloc, eloc;
-		uint32_t elen, extoffset;
+		kernel_lb_addr eloc;
+		uint32_t elen;
 
 		block = udf_new_block(inode->i_sb, inode,
 			UDF_I_LOCATION(inode).partitionReferenceNum,
 			UDF_I_LOCATION(inode).logicalBlockNum, &err);
 		if (!block)
 			goto out_no_entry;
-		bloc = UDF_I_LOCATION(inode);
+		epos.block = UDF_I_LOCATION(inode);
+		epos.offset = udf_file_entry_alloc_offset(inode);
+		epos.bh = NULL;
 		eloc.logicalBlockNum = block;
 		eloc.partitionReferenceNum = UDF_I_LOCATION(inode).partitionReferenceNum;
 		elen = inode->i_sb->s_blocksize;
 		UDF_I_LENEXTENTS(inode) = elen;
-		extoffset = udf_file_entry_alloc_offset(inode);
-		udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 0);
-		udf_release_data(bh);
+		udf_add_aext(inode, &epos, eloc, elen, 0);
+		brelse(epos.bh);
 
 		block = udf_get_pblock(inode->i_sb, block,
 			UDF_I_LOCATION(inode).partitionReferenceNum, 0);
-		bh = udf_tread(inode->i_sb, block);
-		lock_buffer(bh);
-		memset(bh->b_data, 0x00, inode->i_sb->s_blocksize);
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		mark_buffer_dirty_inode(bh, inode);
-		ea = bh->b_data + udf_ext0_offset(inode);
+		epos.bh = udf_tread(inode->i_sb, block);
+		lock_buffer(epos.bh);
+		memset(epos.bh->b_data, 0x00, inode->i_sb->s_blocksize);
+		set_buffer_uptodate(epos.bh);
+		unlock_buffer(epos.bh);
+		mark_buffer_dirty_inode(epos.bh, inode);
+		ea = epos.bh->b_data + udf_ext0_offset(inode);
 	}
 	else
 		ea = UDF_I_DATA(inode) + UDF_I_LENEATTR(inode);
@@ -1060,7 +1061,7 @@ static int udf_symlink(struct inode * dir, struct dentry * dentry, const char *
 		}
 	}
 
-	udf_release_data(bh);
+	brelse(epos.bh);
 	inode->i_size = elen;
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_IN_ICB)
 		UDF_I_LENALLOC(inode) = inode->i_size;
@@ -1089,8 +1090,8 @@ static int udf_symlink(struct inode * dir, struct dentry * dentry, const char *
 		mark_inode_dirty(dir);
 	}
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 	d_instantiate(dentry, inode);
 	err = 0;
 
@@ -1145,8 +1146,8 @@ static int udf_link(struct dentry * old_dentry, struct inode * dir,
 		mark_inode_dirty(dir);
 	}
 	if (fibh.sbh != fibh.ebh)
-		udf_release_data(fibh.ebh);
-	udf_release_data(fibh.sbh);
+		brelse(fibh.ebh);
+	brelse(fibh.sbh);
 	inc_nlink(inode);
 	inode->i_ctime = current_fs_time(inode->i_sb);
 	mark_inode_dirty(inode);
@@ -1174,8 +1175,8 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 	if ((ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi)))
 	{
 		if (ofibh.sbh != ofibh.ebh)
-			udf_release_data(ofibh.ebh);
-		udf_release_data(ofibh.sbh);
+			brelse(ofibh.ebh);
+		brelse(ofibh.sbh);
 	}
 	tloc = lelb_to_cpu(ocfi.icb.extLocation);
 	if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0)
@@ -1188,8 +1189,8 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 		if (!new_inode)
 		{
 			if (nfibh.sbh != nfibh.ebh)
-				udf_release_data(nfibh.ebh);
-			udf_release_data(nfibh.sbh);
+				brelse(nfibh.ebh);
+			brelse(nfibh.sbh);
 			nfi = NULL;
 		}
 	}
@@ -1290,19 +1291,19 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 	if (ofi)
 	{
 		if (ofibh.sbh != ofibh.ebh)
-			udf_release_data(ofibh.ebh);
-		udf_release_data(ofibh.sbh);
+			brelse(ofibh.ebh);
+		brelse(ofibh.sbh);
 	}
 
 	retval = 0;
 
 end_rename:
-	udf_release_data(dir_bh);
+	brelse(dir_bh);
 	if (nfi)
 	{
 		if (nfibh.sbh != nfibh.ebh)
-			udf_release_data(nfibh.ebh);
-		udf_release_data(nfibh.sbh);
+			brelse(nfibh.ebh);
+		brelse(nfibh.sbh);
 	}
 	unlock_kernel();
 	return retval;
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index dabf2b841db8..467a26171cd9 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -81,7 +81,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block, uint16_t
 
 	loc = le32_to_cpu(((__le32 *)bh->b_data)[index]);
 
-	udf_release_data(bh);
+	brelse(bh);
 
 	if (UDF_I_LOCATION(UDF_SB_VAT(sb)).partitionReferenceNum == partition)
 	{
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 8672b88f7ff2..6658afb41cc7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -134,12 +134,8 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-	{
-		ei->i_ext.i_data = NULL;
-		inode_init_once(&ei->vfs_inode);
-	}
+	ei->i_ext.i_data = NULL;
+	inode_init_once(&ei->vfs_inode);
 }
 
 static int init_inodecache(void)
@@ -565,7 +561,7 @@ udf_vrs(struct super_block *sb, int silent)
 
 		if (vsd->stdIdent[0] == 0)
 		{
-			udf_release_data(bh);
+			brelse(bh);
 			break;
 		}
 		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN))
@@ -598,7 +594,7 @@ udf_vrs(struct super_block *sb, int silent)
 		}
 		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_TEA01, VSD_STD_ID_LEN))
 		{
-			udf_release_data(bh);
+			brelse(bh);
 			break;
 		}
 		else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR02, VSD_STD_ID_LEN))
@@ -609,7 +605,7 @@ udf_vrs(struct super_block *sb, int silent)
 		{
 			nsr03 = sector;
 		}
-		udf_release_data(bh);
+		brelse(bh);
 	}
 
 	if (nsr03)
@@ -675,7 +671,7 @@ udf_find_anchor(struct super_block *sb)
 			{
 				ident = le16_to_cpu(((tag *)bh->b_data)->tagIdent);
 				location = le32_to_cpu(((tag *)bh->b_data)->tagLocation);
-				udf_release_data(bh);
+				brelse(bh);
 			}
 
 			if (ident == TAG_IDENT_AVDP)
@@ -710,7 +706,7 @@ udf_find_anchor(struct super_block *sb)
 				{
 					ident = le16_to_cpu(((tag *)bh->b_data)->tagIdent);
 					location = le32_to_cpu(((tag *)bh->b_data)->tagLocation);
-					udf_release_data(bh);
+					brelse(bh);
 				}
 	
 				if (ident == TAG_IDENT_AVDP &&
@@ -729,7 +725,7 @@ udf_find_anchor(struct super_block *sb)
 					{
 						ident = le16_to_cpu(((tag *)bh->b_data)->tagIdent);
 						location = le32_to_cpu(((tag *)bh->b_data)->tagLocation);
-						udf_release_data(bh);
+						brelse(bh);
 					}
 	
 					if (ident == TAG_IDENT_AVDP &&
@@ -751,7 +747,7 @@ udf_find_anchor(struct super_block *sb)
 		{
 			ident = le16_to_cpu(((tag *)bh->b_data)->tagIdent);
 			location = le32_to_cpu(((tag *)bh->b_data)->tagLocation);
-			udf_release_data(bh);
+			brelse(bh);
 
 			if (ident == TAG_IDENT_AVDP && location == 256)
 				UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
@@ -768,7 +764,7 @@ udf_find_anchor(struct super_block *sb)
 			}
 			else
 			{
-				udf_release_data(bh);
+				brelse(bh);
 				if ((ident != TAG_IDENT_AVDP) && (i ||
 					(ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE)))
 				{
@@ -797,7 +793,7 @@ udf_find_fileset(struct super_block *sb, kernel_lb_addr *fileset, kernel_lb_addr
 			return 1;
 		else if (ident != TAG_IDENT_FSD)
 		{
-			udf_release_data(bh);
+			brelse(bh);
 			return 1;
 		}
 			
@@ -836,7 +832,7 @@ udf_find_fileset(struct super_block *sb, kernel_lb_addr *fileset, kernel_lb_addr
 						newfileset.logicalBlockNum += 1 +
 							((le32_to_cpu(sp->numOfBytes) + sizeof(struct spaceBitmapDesc) - 1)
 								>> sb->s_blocksize_bits);
-						udf_release_data(bh);
+						brelse(bh);
 						break;
 					}
 					case TAG_IDENT_FSD:
@@ -847,7 +843,7 @@ udf_find_fileset(struct super_block *sb, kernel_lb_addr *fileset, kernel_lb_addr
 					default:
 					{
 						newfileset.logicalBlockNum ++;
-						udf_release_data(bh);
+						brelse(bh);
 						bh = NULL;
 						break;
 					}
@@ -867,7 +863,7 @@ udf_find_fileset(struct super_block *sb, kernel_lb_addr *fileset, kernel_lb_addr
 
 		UDF_SB_PARTITION(sb) = fileset->partitionReferenceNum;
 		udf_load_fileset(sb, bh, root);
-		udf_release_data(bh);
+		brelse(bh);
 		return 0;
 	}
 	return 1;
@@ -1085,7 +1081,7 @@ udf_load_logicalvol(struct super_block *sb, struct buffer_head * bh, kernel_lb_a
 						if (ident != 0 ||
 							strncmp(st->sparingIdent.ident, UDF_ID_SPARING, strlen(UDF_ID_SPARING)))
 						{
-							udf_release_data(UDF_SB_TYPESPAR(sb,i).s_spar_map[j]);
+							brelse(UDF_SB_TYPESPAR(sb,i).s_spar_map[j]);
 							UDF_SB_TYPESPAR(sb,i).s_spar_map[j] = NULL;
 						}
 					}
@@ -1139,12 +1135,12 @@ udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
 			udf_load_logicalvolint(sb, leea_to_cpu(UDF_SB_LVID(sb)->nextIntegrityExt));
 		
 		if (UDF_SB_LVIDBH(sb) != bh)
-			udf_release_data(bh);
+			brelse(bh);
 		loc.extLength -= sb->s_blocksize;
 		loc.extLocation ++;
 	}
 	if (UDF_SB_LVIDBH(sb) != bh)
-		udf_release_data(bh);
+		brelse(bh);
 }
 
 /*
@@ -1247,7 +1243,7 @@ udf_process_sequence(struct super_block *sb, long block, long lastblock, kernel_
 					done = 1;
 				break;
 		}
-		udf_release_data(bh);
+		brelse(bh);
 	}
 	for (i=0; i<VDS_POS_LENGTH; i++)
 	{
@@ -1269,10 +1265,10 @@ udf_process_sequence(struct super_block *sb, long block, long lastblock, kernel_
 					gd = (struct generic_desc *)bh2->b_data;
 					if (ident == TAG_IDENT_PD)
 						udf_load_partdesc(sb, bh2);
-					udf_release_data(bh2);
+					brelse(bh2);
 				}
 			}
-			udf_release_data(bh);
+			brelse(bh);
 		}
 	}
 
@@ -1335,7 +1331,7 @@ udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
 			reserve_e = reserve_e >> sb->s_blocksize_bits;
 			reserve_e += reserve_s;
 
-			udf_release_data(bh);
+			brelse(bh);
 
 			/* Process the main & reserve sequences */
 			/* responsible for finding the PartitionDesc(s) */
@@ -1355,7 +1351,7 @@ udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
 
 	for (i=0; i<UDF_SB_NUMPARTS(sb); i++)
 	{
-		switch UDF_SB_PARTTYPE(sb, i)
+		switch (UDF_SB_PARTTYPE(sb, i))
 		{
 			case UDF_VIRTUAL_MAP15:
 			case UDF_VIRTUAL_MAP20:
@@ -1405,12 +1401,14 @@ udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
 
 					pos = udf_block_map(UDF_SB_VAT(sb), 0);
 					bh = sb_bread(sb, pos);
+					if (!bh)
+						return 1;
 					UDF_SB_TYPEVIRT(sb,i).s_start_offset =
 						le16_to_cpu(((struct virtualAllocationTable20 *)bh->b_data + udf_ext0_offset(UDF_SB_VAT(sb)))->lengthHeader) +
 							udf_ext0_offset(UDF_SB_VAT(sb));
 					UDF_SB_TYPEVIRT(sb,i).s_num_entries = (UDF_SB_VAT(sb)->i_size -
 						UDF_SB_TYPEVIRT(sb,i).s_start_offset) >> 2;
-					udf_release_data(bh);
+					brelse(bh);
 				}
 				UDF_SB_PARTROOT(sb,i) = udf_get_pblock(sb, 0, i, 0);
 				UDF_SB_PARTLEN(sb,i) = UDF_SB_PARTLEN(sb,ino.partitionReferenceNum);
@@ -1663,7 +1661,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		iput(inode);
 		goto error_out;
 	}
-	sb->s_maxbytes = 1<<30;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	return 0;
 
 error_out:
@@ -1682,7 +1680,7 @@ error_out:
 		if (UDF_SB_PARTTYPE(sb, UDF_SB_PARTITION(sb)) == UDF_SPARABLE_MAP15)
 		{
 			for (i=0; i<4; i++)
-				udf_release_data(UDF_SB_TYPESPAR(sb, UDF_SB_PARTITION(sb)).s_spar_map[i]);
+				brelse(UDF_SB_TYPESPAR(sb, UDF_SB_PARTITION(sb)).s_spar_map[i]);
 		}
 	}
 #ifdef CONFIG_UDF_NLS
@@ -1691,7 +1689,7 @@ error_out:
 #endif
 	if (!(sb->s_flags & MS_RDONLY))
 		udf_close_lvid(sb);
-	udf_release_data(UDF_SB_LVIDBH(sb));
+	brelse(UDF_SB_LVIDBH(sb));
 	UDF_SB_FREE(sb);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -1760,7 +1758,7 @@ udf_put_super(struct super_block *sb)
 		if (UDF_SB_PARTTYPE(sb, UDF_SB_PARTITION(sb)) == UDF_SPARABLE_MAP15)
 		{
 			for (i=0; i<4; i++)
-				udf_release_data(UDF_SB_TYPESPAR(sb, UDF_SB_PARTITION(sb)).s_spar_map[i]);
+				brelse(UDF_SB_TYPESPAR(sb, UDF_SB_PARTITION(sb)).s_spar_map[i]);
 		}
 	}
 #ifdef CONFIG_UDF_NLS
@@ -1769,7 +1767,7 @@ udf_put_super(struct super_block *sb)
 #endif
 	if (!(sb->s_flags & MS_RDONLY))
 		udf_close_lvid(sb);
-	udf_release_data(UDF_SB_LVIDBH(sb));
+	brelse(UDF_SB_LVIDBH(sb));
 	UDF_SB_FREE(sb);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
@@ -1839,7 +1837,7 @@ udf_count_free_bitmap(struct super_block *sb, struct udf_bitmap *bitmap)
 	}
 	else if (ident != TAG_IDENT_SBD)
 	{
-		udf_release_data(bh);
+		brelse(bh);
 		printk(KERN_ERR "udf: udf_count_free failed\n");
 		goto out;
 	}
@@ -1861,7 +1859,7 @@ udf_count_free_bitmap(struct super_block *sb, struct udf_bitmap *bitmap)
 		}
 		if ( bytes )
 		{
-			udf_release_data(bh);
+			brelse(bh);
 			newblock = udf_get_lb_pblock(sb, loc, ++block);
 			bh = udf_tread(sb, newblock);
 			if (!bh)
@@ -1873,7 +1871,7 @@ udf_count_free_bitmap(struct super_block *sb, struct udf_bitmap *bitmap)
 			ptr = (uint8_t *)bh->b_data;
 		}
 	}
-	udf_release_data(bh);
+	brelse(bh);
 
 out:
 	unlock_kernel();
@@ -1885,21 +1883,20 @@ static unsigned int
 udf_count_free_table(struct super_block *sb, struct inode * table)
 {
 	unsigned int accum = 0;
-	uint32_t extoffset, elen;
-	kernel_lb_addr bloc, eloc;
+	uint32_t elen;
+	kernel_lb_addr eloc;
 	int8_t etype;
-	struct buffer_head *bh = NULL;
+	struct extent_position epos;
 
 	lock_kernel();
 
-	bloc = UDF_I_LOCATION(table);
-	extoffset = sizeof(struct unallocSpaceEntry);
+	epos.block = UDF_I_LOCATION(table);
+	epos.offset = sizeof(struct unallocSpaceEntry);
+	epos.bh = NULL;
 
-	while ((etype = udf_next_aext(table, &bloc, &extoffset, &eloc, &elen, &bh, 1)) != -1)
-	{
+	while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
 		accum += (elen >> table->i_sb->s_blocksize_bits);
-	}
-	udf_release_data(bh);
+	brelse(epos.bh);
 
 	unlock_kernel();
 
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index ba068a786563..12613b680cc4 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -95,7 +95,7 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	}
 
 	udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
-	udf_release_data(bh);
+	brelse(bh);
 
 	unlock_kernel();
 	SetPageUptodate(page);
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 0abd66ce36ea..77975ae291a5 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -28,8 +28,8 @@
 #include "udf_i.h"
 #include "udf_sb.h"
 
-static void extent_trunc(struct inode * inode, kernel_lb_addr bloc, int extoffset,
-	kernel_lb_addr eloc, int8_t etype, uint32_t elen, struct buffer_head *bh, uint32_t nelen)
+static void extent_trunc(struct inode * inode, struct extent_position *epos,
+	kernel_lb_addr eloc, int8_t etype, uint32_t elen, uint32_t nelen)
 {
 	kernel_lb_addr neloc = { 0, 0 };
 	int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits;
@@ -49,7 +49,7 @@ static void extent_trunc(struct inode * inode, kernel_lb_addr bloc, int extoffse
 
 	if (elen != nelen)
 	{
-		udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 0);
+		udf_write_aext(inode, epos, neloc, nelen, 0);
 		if (last_block - first_block > 0)
 		{
 			if (etype == (EXT_RECORDED_ALLOCATED >> 30))
@@ -63,18 +63,16 @@ static void extent_trunc(struct inode * inode, kernel_lb_addr bloc, int extoffse
 
 void udf_discard_prealloc(struct inode * inode)
 {
-	kernel_lb_addr bloc, eloc;
-	uint32_t extoffset = 0, elen, nelen;
+	struct extent_position epos = { NULL, 0, {0, 0}};
+	kernel_lb_addr eloc;
+	uint32_t elen, nelen;
 	uint64_t lbcount = 0;
 	int8_t etype = -1, netype;
-	struct buffer_head *bh = NULL;
 	int adsize;
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_IN_ICB ||
 		inode->i_size == UDF_I_LENEXTENTS(inode))
-	{
 		return;
-	}
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_SHORT)
 		adsize = sizeof(short_ad);
@@ -83,52 +81,58 @@ void udf_discard_prealloc(struct inode * inode)
 	else
 		adsize = 0;
 
-	bloc = UDF_I_LOCATION(inode);
+	epos.block = UDF_I_LOCATION(inode);
 
-	while ((netype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1)) != -1)
+	/* Find the last extent in the file */
+	while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1)
 	{
 		etype = netype;
 		lbcount += elen;
-		if (lbcount > inode->i_size && lbcount - inode->i_size < inode->i_sb->s_blocksize)
+		if (lbcount > inode->i_size && lbcount - elen < inode->i_size)
 		{
+			WARN_ON(lbcount - inode->i_size >= inode->i_sb->s_blocksize);
 			nelen = elen - (lbcount - inode->i_size);
-			extent_trunc(inode, bloc, extoffset-adsize, eloc, etype, elen, bh, nelen);
+			epos.offset -= adsize;
+			extent_trunc(inode, &epos, eloc, etype, elen, nelen);
+			epos.offset += adsize;
 			lbcount = inode->i_size;
 		}
 	}
-	if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30))
-	{
-		extoffset -= adsize;
+	if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
+		epos.offset -= adsize;
 		lbcount -= elen;
-		extent_trunc(inode, bloc, extoffset, eloc, etype, elen, bh, 0);
-		if (!bh)
+		extent_trunc(inode, &epos, eloc, etype, elen, 0);
+		if (!epos.bh)
 		{
-			UDF_I_LENALLOC(inode) = extoffset - udf_file_entry_alloc_offset(inode);
+			UDF_I_LENALLOC(inode) = epos.offset - udf_file_entry_alloc_offset(inode);
 			mark_inode_dirty(inode);
 		}
 		else
 		{
-			struct allocExtDesc *aed = (struct allocExtDesc *)(bh->b_data);
-			aed->lengthAllocDescs = cpu_to_le32(extoffset - sizeof(struct allocExtDesc));
+			struct allocExtDesc *aed = (struct allocExtDesc *)(epos.bh->b_data);
+			aed->lengthAllocDescs = cpu_to_le32(epos.offset - sizeof(struct allocExtDesc));
 			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-				udf_update_tag(bh->b_data, extoffset);
+				udf_update_tag(epos.bh->b_data, epos.offset);
 			else
-				udf_update_tag(bh->b_data, sizeof(struct allocExtDesc));
-			mark_buffer_dirty_inode(bh, inode);
+				udf_update_tag(epos.bh->b_data, sizeof(struct allocExtDesc));
+			mark_buffer_dirty_inode(epos.bh, inode);
 		}
 	}
 	UDF_I_LENEXTENTS(inode) = lbcount;
 
-	udf_release_data(bh);
+	WARN_ON(lbcount != inode->i_size);
+	brelse(epos.bh);
 }
 
 void udf_truncate_extents(struct inode * inode)
 {
-	kernel_lb_addr bloc, eloc, neloc = { 0, 0 };
-	uint32_t extoffset, elen, offset, nelen = 0, lelen = 0, lenalloc;
+	struct extent_position epos;
+	kernel_lb_addr eloc, neloc = { 0, 0 };
+	uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
 	int8_t etype;
-	int first_block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-	struct buffer_head *bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset;
+	loff_t byte_offset;
 	int adsize;
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_SHORT)
@@ -136,158 +140,130 @@ void udf_truncate_extents(struct inode * inode)
 	else if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_LONG)
 		adsize = sizeof(long_ad);
 	else
-		adsize = 0;
+		BUG();
 
-	etype = inode_bmap(inode, first_block, &bloc, &extoffset, &eloc, &elen, &offset, &bh);
-	offset += (inode->i_size & (inode->i_sb->s_blocksize - 1));
+	etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+	byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize-1));
 	if (etype != -1)
 	{
-		extoffset -= adsize;
-		extent_trunc(inode, bloc, extoffset, eloc, etype, elen, bh, offset);
-		extoffset += adsize;
-
-		if (offset)
-			lenalloc = extoffset;
+		epos.offset -= adsize;
+		extent_trunc(inode, &epos, eloc, etype, elen, byte_offset);
+		epos.offset += adsize;
+		if (byte_offset)
+			lenalloc = epos.offset;
 		else
-			lenalloc = extoffset - adsize;
+			lenalloc = epos.offset - adsize;
 
-		if (!bh)
+		if (!epos.bh)
 			lenalloc -= udf_file_entry_alloc_offset(inode);
 		else
 			lenalloc -= sizeof(struct allocExtDesc);
 
-		while ((etype = udf_current_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 0)) != -1)
+		while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1)
 		{
 			if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30))
 			{
-				udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 0);
-				extoffset = 0;
-				if (lelen)
+				udf_write_aext(inode, &epos, neloc, nelen, 0);
+				if (indirect_ext_len)
 				{
-					if (!bh)
+					/* We managed to free all extents in the
+					 * indirect extent - free it too */
+					if (!epos.bh)
 						BUG();
-					else
-						memset(bh->b_data, 0x00, sizeof(struct allocExtDesc));
-					udf_free_blocks(inode->i_sb, inode, bloc, 0, lelen);
+					udf_free_blocks(sb, inode, epos.block, 0, indirect_ext_len);
 				}
 				else
 				{
-					if (!bh)
+					if (!epos.bh)
 					{
 						UDF_I_LENALLOC(inode) = lenalloc;
 						mark_inode_dirty(inode);
 					}
 					else
 					{
-						struct allocExtDesc *aed = (struct allocExtDesc *)(bh->b_data);
+						struct allocExtDesc *aed = (struct allocExtDesc *)(epos.bh->b_data);
 						aed->lengthAllocDescs = cpu_to_le32(lenalloc);
-						if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-							udf_update_tag(bh->b_data, lenalloc +
+						if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(sb) >= 0x0201)
+							udf_update_tag(epos.bh->b_data, lenalloc +
 								sizeof(struct allocExtDesc));
 						else
-							udf_update_tag(bh->b_data, sizeof(struct allocExtDesc));
-						mark_buffer_dirty_inode(bh, inode);
+							udf_update_tag(epos.bh->b_data, sizeof(struct allocExtDesc));
+						mark_buffer_dirty_inode(epos.bh, inode);
 					}
 				}
-
-				udf_release_data(bh);
-				extoffset = sizeof(struct allocExtDesc);
-				bloc = eloc;
-				bh = udf_tread(inode->i_sb, udf_get_lb_pblock(inode->i_sb, bloc, 0));
+				brelse(epos.bh);
+				epos.offset = sizeof(struct allocExtDesc);
+				epos.block = eloc;
+				epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, eloc, 0));
 				if (elen)
-					lelen = (elen + inode->i_sb->s_blocksize - 1) >>
-						inode->i_sb->s_blocksize_bits;
+					indirect_ext_len = (elen +
+						sb->s_blocksize - 1) >>
+						sb->s_blocksize_bits;
 				else
-					lelen = 1;
+					indirect_ext_len = 1;
 			}
 			else
 			{
-				extent_trunc(inode, bloc, extoffset, eloc, etype, elen, bh, 0);
-				extoffset += adsize;
+				extent_trunc(inode, &epos, eloc, etype, elen, 0);
+				epos.offset += adsize;
 			}
 		}
 
-		if (lelen)
+		if (indirect_ext_len)
 		{
-			if (!bh)
+			if (!epos.bh)
 				BUG();
-			else
-				memset(bh->b_data, 0x00, sizeof(struct allocExtDesc));
-			udf_free_blocks(inode->i_sb, inode, bloc, 0, lelen);
+			udf_free_blocks(sb, inode, epos.block, 0, indirect_ext_len);
 		}
 		else
 		{
-			if (!bh)
+			if (!epos.bh)
 			{
 				UDF_I_LENALLOC(inode) = lenalloc;
 				mark_inode_dirty(inode);
 			}
 			else
 			{
-				struct allocExtDesc *aed = (struct allocExtDesc *)(bh->b_data);
+				struct allocExtDesc *aed = (struct allocExtDesc *)(epos.bh->b_data);
 				aed->lengthAllocDescs = cpu_to_le32(lenalloc);
-				if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201)
-					udf_update_tag(bh->b_data, lenalloc +
+				if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(sb) >= 0x0201)
+					udf_update_tag(epos.bh->b_data, lenalloc +
 						sizeof(struct allocExtDesc));
 				else
-					udf_update_tag(bh->b_data, sizeof(struct allocExtDesc));
-				mark_buffer_dirty_inode(bh, inode);
+					udf_update_tag(epos.bh->b_data, sizeof(struct allocExtDesc));
+				mark_buffer_dirty_inode(epos.bh, inode);
 			}
 		}
 	}
 	else if (inode->i_size)
 	{
-		if (offset)
+		if (byte_offset)
 		{
+			kernel_long_ad extent;
+
 			/*
 			 *  OK, there is not extent covering inode->i_size and
 			 *  no extent above inode->i_size => truncate is
-			 *  extending the file by 'offset'.
+			 *  extending the file by 'offset' blocks.
 			 */
-			if ((!bh && extoffset == udf_file_entry_alloc_offset(inode)) ||
-			    (bh && extoffset == sizeof(struct allocExtDesc))) {
-				/* File has no extents at all! */
-				memset(&eloc, 0x00, sizeof(kernel_lb_addr));
-				elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset;
-				udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1);
+			if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
+			    (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
+				/* File has no extents at all or has empty last
+				 * indirect extent! Create a fake extent... */
+				extent.extLocation.logicalBlockNum = 0;
+				extent.extLocation.partitionReferenceNum = 0;
+				extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
 			}
 			else {
-				extoffset -= adsize;
-				etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1);
-				if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
-				{
-					extoffset -= adsize;
-					elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset);
-					udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0);
-				}
-				else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30))
-				{
-					kernel_lb_addr neloc = { 0, 0 };
-					extoffset -= adsize;
-					nelen = EXT_NOT_RECORDED_NOT_ALLOCATED |
-						((elen + offset + inode->i_sb->s_blocksize - 1) &
-						~(inode->i_sb->s_blocksize - 1));
-					udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1);
-					udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1);
-				}
-				else
-				{
-					if (elen & (inode->i_sb->s_blocksize - 1))
-					{
-						extoffset -= adsize;
-						elen = EXT_RECORDED_ALLOCATED |
-							((elen + inode->i_sb->s_blocksize - 1) &
-							~(inode->i_sb->s_blocksize - 1));
-						udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1);
-					}
-					memset(&eloc, 0x00, sizeof(kernel_lb_addr));
-					elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset;
-					udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1);
-				}
+				epos.offset -= adsize;
+				etype = udf_next_aext(inode, &epos,
+					&extent.extLocation, &extent.extLength, 0);
+				extent.extLength |= etype << 30;
 			}
+			udf_extend_file(inode, &epos, &extent, offset+((inode->i_size & (sb->s_blocksize-1)) != 0));
 		}
 	}
 	UDF_I_LENEXTENTS(inode) = inode->i_size;
 
-	udf_release_data(bh);
+	brelse(epos.bh);
 }
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 110f8d62616f..3b2e6c8cb151 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -93,7 +93,7 @@ static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
 	for (i=0; i<nr_groups; i++)\
 	{\
 		if (UDF_SB_BITMAP(X,Y,Z,i))\
-			udf_release_data(UDF_SB_BITMAP(X,Y,Z,i));\
+			brelse(UDF_SB_BITMAP(X,Y,Z,i));\
 	}\
 	if (size <= PAGE_SIZE)\
 		kfree(UDF_SB_PARTMAPS(X)[Y].Z.s_bitmap);\
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index ee1dece1f6f5..67ded289497c 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -77,6 +77,13 @@ struct ustr
 	uint8_t u_len;
 };
 
+struct extent_position {
+	struct buffer_head *bh;
+	uint32_t offset;
+	kernel_lb_addr block;
+};
+
+
 /* super.c */
 extern void udf_error(struct super_block *, const char *, const char *, ...);
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
@@ -98,13 +105,14 @@ extern void udf_read_inode(struct inode *);
 extern void udf_delete_inode(struct inode *);
 extern void udf_clear_inode(struct inode *);
 extern int udf_write_inode(struct inode *, int);
-extern long udf_block_map(struct inode *, long);
-extern int8_t inode_bmap(struct inode *, int, kernel_lb_addr *, uint32_t *, kernel_lb_addr *, uint32_t *, uint32_t *, struct buffer_head **);
-extern int8_t udf_add_aext(struct inode *, kernel_lb_addr *, int *, kernel_lb_addr, uint32_t, struct buffer_head **, int);
-extern int8_t udf_write_aext(struct inode *, kernel_lb_addr, int *, kernel_lb_addr, uint32_t, struct buffer_head *, int);
-extern int8_t udf_delete_aext(struct inode *, kernel_lb_addr, int, kernel_lb_addr, uint32_t, struct buffer_head *);
-extern int8_t udf_next_aext(struct inode *, kernel_lb_addr *, int *, kernel_lb_addr *, uint32_t *, struct buffer_head **, int);
-extern int8_t udf_current_aext(struct inode *, kernel_lb_addr *, int *, kernel_lb_addr *, uint32_t *, struct buffer_head **, int);
+extern long udf_block_map(struct inode *, sector_t);
+extern int udf_extend_file(struct inode *, struct extent_position *, kernel_long_ad *, sector_t);
+extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, kernel_lb_addr *, uint32_t *, sector_t *);
+extern int8_t udf_add_aext(struct inode *, struct extent_position *, kernel_lb_addr, uint32_t, int);
+extern int8_t udf_write_aext(struct inode *, struct extent_position *, kernel_lb_addr, uint32_t, int);
+extern int8_t udf_delete_aext(struct inode *, struct extent_position, kernel_lb_addr, uint32_t);
+extern int8_t udf_next_aext(struct inode *, struct extent_position *, kernel_lb_addr *, uint32_t *, int);
+extern int8_t udf_current_aext(struct inode *, struct extent_position *, kernel_lb_addr *, uint32_t *, int);
 
 /* misc.c */
 extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -113,7 +121,6 @@ extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, uint
 extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, uint8_t);
 extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, uint32_t, uint16_t *);
 extern struct buffer_head *udf_read_ptagged(struct super_block *, kernel_lb_addr, uint32_t, uint16_t *);
-extern void udf_release_data(struct buffer_head *);
 extern void udf_update_tag(char *, int);
 extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
 
@@ -151,7 +158,7 @@ extern int udf_new_block(struct super_block *, struct inode *, uint16_t, uint32_
 extern int udf_fsync_file(struct file *, struct dentry *, int);
 
 /* directory.c */
-extern struct fileIdentDesc * udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, struct fileIdentDesc *, kernel_lb_addr *, uint32_t *, kernel_lb_addr *, uint32_t *, uint32_t *, struct buffer_head **);
+extern struct fileIdentDesc * udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, struct fileIdentDesc *, struct extent_position *, kernel_lb_addr *, uint32_t *, sector_t *);
 extern struct fileIdentDesc * udf_get_fileident(void * buffer, int bufsize, int * offset);
 extern long_ad * udf_get_filelongad(uint8_t *, int, int *, int);
 extern short_ad * udf_get_fileshortad(uint8_t *, int, int *, int);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 4890ddf1518e..154452172f43 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -19,7 +19,6 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
-#include <linux/smp_lock.h>
 
 #include "swab.h"
 #include "util.h"
@@ -180,13 +179,9 @@ fail:
 static struct page *ufs_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			ufs_check_page(page);
 		if (PageError(page))
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b5a6461ec66b..22ff6ed55ce9 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1237,9 +1237,7 @@ static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flag
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
+	inode_init_once(&ei->vfs_inode);
 }
  
 static int init_inodecache(void)
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 17437574f79c..84357f1ff0ec 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -251,13 +251,11 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
 
 	page = find_lock_page(mapping, index);
 	if (!page) {
-		page = read_cache_page(mapping, index,
-				       (filler_t*)mapping->a_ops->readpage,
-				       NULL);
+		page = read_mapping_page(mapping, index, NULL);
 
 		if (IS_ERR(page)) {
 			printk(KERN_ERR "ufs_change_blocknr: "
-			       "read_cache_page error: ino %lu, index: %lu\n",
+			       "read_mapping_page error: ino %lu, index: %lu\n",
 			       mapping->host->i_ino, index);
 			goto out;
 		}
diff --git a/fs/utimes.c b/fs/utimes.c
index 99cf2cb11fec..480f7c8c29da 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -1,8 +1,10 @@
 #include <linux/compiler.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/linkage.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
+#include <linux/stat.h>
 #include <linux/utime.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -20,54 +22,18 @@
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
+asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
 {
-	int error;
-	struct nameidata nd;
-	struct inode * inode;
-	struct iattr newattrs;
+	struct timespec tv[2];
 
-	error = user_path_walk(filename, &nd);
-	if (error)
-		goto out;
-	inode = nd.dentry->d_inode;
-
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto dput_and_out;
-
-	/* Don't worry, the checks are done in inode_change_ok() */
-	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
 	if (times) {
-		error = -EPERM;
-		if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-			goto dput_and_out;
-
-		error = get_user(newattrs.ia_atime.tv_sec, &times->actime);
-		newattrs.ia_atime.tv_nsec = 0;
-		if (!error)
-			error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime);
-		newattrs.ia_mtime.tv_nsec = 0;
-		if (error)
-			goto dput_and_out;
-
-		newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-	} else {
-                error = -EACCES;
-                if (IS_IMMUTABLE(inode))
-                        goto dput_and_out;
-
-		if (current->fsuid != inode->i_uid &&
-		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
-			goto dput_and_out;
+		if (get_user(tv[0].tv_sec, &times->actime) ||
+		    get_user(tv[1].tv_sec, &times->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
 	}
-	mutex_lock(&inode->i_mutex);
-	error = notify_change(nd.dentry, &newattrs);
-	mutex_unlock(&inode->i_mutex);
-dput_and_out:
-	path_release(&nd);
-out:
-	return error;
+	return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0);
 }
 
 #endif
@@ -76,18 +42,38 @@ out:
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-long do_utimes(int dfd, char __user *filename, struct timeval *times)
+long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
 {
 	int error;
 	struct nameidata nd;
-	struct inode * inode;
+	struct dentry *dentry;
+	struct inode *inode;
 	struct iattr newattrs;
+	struct file *f = NULL;
 
-	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
-
-	if (error)
+	error = -EINVAL;
+	if (flags & ~AT_SYMLINK_NOFOLLOW)
 		goto out;
-	inode = nd.dentry->d_inode;
+
+	if (filename == NULL && dfd != AT_FDCWD) {
+		error = -EINVAL;
+		if (flags & AT_SYMLINK_NOFOLLOW)
+			goto out;
+
+		error = -EBADF;
+		f = fget(dfd);
+		if (!f)
+			goto out;
+		dentry = f->f_path.dentry;
+	} else {
+		error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
+		if (error)
+			goto out;
+
+		dentry = nd.dentry;
+	}
+
+	inode = dentry->d_inode;
 
 	error = -EROFS;
 	if (IS_RDONLY(inode))
@@ -100,11 +86,21 @@ long do_utimes(int dfd, char __user *filename, struct timeval *times)
                 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                         goto dput_and_out;
 
-		newattrs.ia_atime.tv_sec = times[0].tv_sec;
-		newattrs.ia_atime.tv_nsec = times[0].tv_usec * 1000;
-		newattrs.ia_mtime.tv_sec = times[1].tv_sec;
-		newattrs.ia_mtime.tv_nsec = times[1].tv_usec * 1000;
-		newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+		if (times[0].tv_nsec == UTIME_OMIT)
+			newattrs.ia_valid &= ~ATTR_ATIME;
+		else if (times[0].tv_nsec != UTIME_NOW) {
+			newattrs.ia_atime.tv_sec = times[0].tv_sec;
+			newattrs.ia_atime.tv_nsec = times[0].tv_nsec;
+			newattrs.ia_valid |= ATTR_ATIME_SET;
+		}
+
+		if (times[1].tv_nsec == UTIME_OMIT)
+			newattrs.ia_valid &= ~ATTR_MTIME;
+		else if (times[1].tv_nsec != UTIME_NOW) {
+			newattrs.ia_mtime.tv_sec = times[1].tv_sec;
+			newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
+			newattrs.ia_valid |= ATTR_MTIME_SET;
+		}
 	} else {
 		error = -EACCES;
                 if (IS_IMMUTABLE(inode))
@@ -115,21 +111,67 @@ long do_utimes(int dfd, char __user *filename, struct timeval *times)
 			goto dput_and_out;
 	}
 	mutex_lock(&inode->i_mutex);
-	error = notify_change(nd.dentry, &newattrs);
+	error = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
 dput_and_out:
-	path_release(&nd);
+	if (f)
+		fput(f);
+	else
+		path_release(&nd);
 out:
 	return error;
 }
 
+asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags)
+{
+	struct timespec tstimes[2];
+
+	if (utimes) {
+		if (copy_from_user(&tstimes, utimes, sizeof(tstimes)))
+			return -EFAULT;
+		if ((tstimes[0].tv_nsec == UTIME_OMIT ||
+		     tstimes[0].tv_nsec == UTIME_NOW) &&
+		    tstimes[0].tv_sec != 0)
+			return -EINVAL;
+		if ((tstimes[1].tv_nsec == UTIME_OMIT ||
+		     tstimes[1].tv_nsec == UTIME_NOW) &&
+		    tstimes[1].tv_sec != 0)
+			return -EINVAL;
+
+		/* Nothing to do, we must not even check the path.  */
+		if (tstimes[0].tv_nsec == UTIME_OMIT &&
+		    tstimes[1].tv_nsec == UTIME_OMIT)
+			return 0;
+	}
+
+	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
+}
+
 asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
 {
 	struct timeval times[2];
+	struct timespec tstimes[2];
+
+	if (utimes) {
+		if (copy_from_user(&times, utimes, sizeof(times)))
+			return -EFAULT;
+
+		/* This test is needed to catch all invalid values.  If we
+		   would test only in do_utimes we would miss those invalid
+		   values truncated by the multiplication with 1000.  Note
+		   that we also catch UTIME_{NOW,OMIT} here which are only
+		   valid for utimensat.  */
+		if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 ||
+		    times[1].tv_usec >= 1000000 || times[1].tv_usec < 0)
+			return -EINVAL;
+
+		tstimes[0].tv_sec = times[0].tv_sec;
+		tstimes[0].tv_nsec = 1000 * times[0].tv_usec;
+		tstimes[1].tv_sec = times[1].tv_sec;
+		tstimes[1].tv_nsec = 1000 * times[1].tv_usec;
+	}
 
-	if (utimes && copy_from_user(&times, utimes, sizeof(times)))
-		return -EFAULT;
-	return do_utimes(dfd, filename, utimes ? times : NULL);
+	return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
 }
 
 asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
diff --git a/fs/xattr.c b/fs/xattr.c
index 38646132ab0e..4523aca79659 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -9,7 +9,6 @@
  */
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/xattr.h>
 #include <linux/namei.h>
@@ -351,6 +350,7 @@ sys_fgetxattr(int fd, char __user *name, void __user *value, size_t size)
 	f = fget(fd);
 	if (!f)
 		return error;
+	audit_inode(NULL, f->f_path.dentry->d_inode);
 	error = getxattr(f->f_path.dentry, name, value, size);
 	fput(f);
 	return error;
@@ -423,6 +423,7 @@ sys_flistxattr(int fd, char __user *list, size_t size)
 	f = fget(fd);
 	if (!f)
 		return error;
+	audit_inode(NULL, f->f_path.dentry->d_inode);
 	error = listxattr(f->f_path.dentry, list, size);
 	fput(f);
 	return error;
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
index af168a1a98c1..c110bb002665 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -43,6 +43,18 @@ static inline void mrupdate(mrlock_t *mrp)
 	mrp->mr_writer = 1;
 }
 
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
+{
+	down_read_nested(&mrp->mr_lock, subclass);
+}
+
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
+{
+	down_write_nested(&mrp->mr_lock, subclass);
+	mrp->mr_writer = 1;
+}
+
+
 static inline int mrtryaccess(mrlock_t *mrp)
 {
 	return down_read_trylock(&mrp->mr_lock);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 143ffc851c9d..7361861e3aac 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -141,9 +141,46 @@ xfs_destroy_ioend(
 }
 
 /*
+ * Update on-disk file size now that data has been written to disk.
+ * The current in-memory file size is i_size.  If a write is beyond
+ * eof io_new_size will be the intended file size until i_size is
+ * updated.  If this write does not extend all the way to the valid
+ * file size then restrict this update to the end of the write.
+ */
+STATIC void
+xfs_setfilesize(
+	xfs_ioend_t		*ioend)
+{
+	xfs_inode_t		*ip;
+	xfs_fsize_t		isize;
+	xfs_fsize_t		bsize;
+
+	ip = xfs_vtoi(ioend->io_vnode);
+
+	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
+	ASSERT(ioend->io_type != IOMAP_READ);
+
+	if (unlikely(ioend->io_error))
+		return;
+
+	bsize = ioend->io_offset + ioend->io_size;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	isize = MAX(ip->i_size, ip->i_iocore.io_new_size);
+	isize = MIN(isize, bsize);
+
+	if (ip->i_d.di_size < isize) {
+		ip->i_d.di_size = isize;
+		ip->i_update_core = 1;
+		ip->i_update_size = 1;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+/*
  * Buffered IO write completion for delayed allocate extents.
- * TODO: Update ondisk isize now that we know the file data
- * has been flushed (i.e. the notorious "NULL file" problem).
  */
 STATIC void
 xfs_end_bio_delalloc(
@@ -152,6 +189,7 @@ xfs_end_bio_delalloc(
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 
+	xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -165,6 +203,7 @@ xfs_end_bio_written(
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 
+	xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -184,8 +223,23 @@ xfs_end_bio_unwritten(
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
 
-	if (likely(!ioend->io_error))
+	if (likely(!ioend->io_error)) {
 		bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
+		xfs_setfilesize(ioend);
+	}
+	xfs_destroy_ioend(ioend);
+}
+
+/*
+ * IO read completion for regular, written extents.
+ */
+STATIC void
+xfs_end_bio_read(
+	struct work_struct	*work)
+{
+	xfs_ioend_t		*ioend =
+		container_of(work, xfs_ioend_t, io_work);
+
 	xfs_destroy_ioend(ioend);
 }
 
@@ -224,6 +278,8 @@ xfs_alloc_ioend(
 		INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
 	else if (type == IOMAP_DELAY)
 		INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
+	else if (type == IOMAP_READ)
+		INIT_WORK(&ioend->io_work, xfs_end_bio_read);
 	else
 		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
 
@@ -645,7 +701,7 @@ xfs_is_delayed_page(
 			else if (buffer_delay(bh))
 				acceptable = (type == IOMAP_DELAY);
 			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable = (type == 0);
+				acceptable = (type == IOMAP_NEW);
 			else
 				break;
 		} while ((bh = bh->b_this_page) != head);
@@ -754,7 +810,7 @@ xfs_convert_page(
 			page_dirty--;
 			count++;
 		} else {
-			type = 0;
+			type = IOMAP_NEW;
 			if (buffer_mapped(bh) && all_bh && startio) {
 				lock_buffer(bh);
 				xfs_add_to_ioend(inode, bh, offset,
@@ -912,8 +968,8 @@ xfs_page_state_convert(
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
-	flags = -1;
-	type = 0;
+	flags = BMAPI_READ;
+	type = IOMAP_NEW;
 
 	/* TODO: cleanup count and page_dirty */
 
@@ -943,14 +999,14 @@ xfs_page_state_convert(
 		 *
 		 * Third case, an unmapped buffer was found, and we are
 		 * in a path where we need to write the whole page out.
- 		 */
+		 */
 		if (buffer_unwritten(bh) || buffer_delay(bh) ||
 		    ((buffer_uptodate(bh) || PageUptodate(page)) &&
 		     !buffer_mapped(bh) && (unmapped || startio))) {
-		     	/*
+			/*
 			 * Make sure we don't use a read-only iomap
 			 */
-		     	if (flags == BMAPI_READ)
+			if (flags == BMAPI_READ)
 				iomap_valid = 0;
 
 			if (buffer_unwritten(bh)) {
@@ -999,7 +1055,7 @@ xfs_page_state_convert(
 			 * That means it must already have extents allocated
 			 * underneath it. Map the extent by reading it.
 			 */
-			if (!iomap_valid || type != 0) {
+			if (!iomap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
 				size = xfs_probe_cluster(inode, page, bh,
 								head, 1);
@@ -1010,7 +1066,15 @@ xfs_page_state_convert(
 				iomap_valid = xfs_iomap_valid(&iomap, offset);
 			}
 
-			type = 0;
+			/*
+			 * We set the type to IOMAP_NEW in case we are doing a
+			 * small write at EOF that is extending the file but
+			 * without needing an allocation. We need to update the
+			 * file size on I/O completion in this case so it is
+			 * the same case as having just allocated a new extent
+			 * that we are writing into for the first time.
+			 */
+			type = IOMAP_NEW;
 			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
 				ASSERT(buffer_mapped(bh));
 				if (iomap_valid)
@@ -1356,12 +1420,21 @@ xfs_end_io_direct(
 	 * completion handler in the future, in which case all this can
 	 * go away.
 	 */
-	if (private && size > 0) {
-		ioend->io_offset = offset;
-		ioend->io_size = size;
+	ioend->io_offset = offset;
+	ioend->io_size = size;
+	if (ioend->io_type == IOMAP_READ) {
+		xfs_finish_ioend(ioend);
+	} else if (private && size > 0) {
 		xfs_finish_ioend(ioend);
 	} else {
-		xfs_destroy_ioend(ioend);
+		/*
+		 * A direct I/O write ioend starts it's life in unwritten
+		 * state in case they map an unwritten extent.  This write
+		 * didn't map an unwritten extent so switch it's completion
+		 * handler.
+		 */
+		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
+		xfs_finish_ioend(ioend);
 	}
 
 	/*
@@ -1392,15 +1465,15 @@ xfs_vm_direct_IO(
 	if (error)
 		return -error;
 
-	iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-
 	if (rw == WRITE) {
+		iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
 		ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
 			iomap.iomap_target->bt_bdev,
 			iov, offset, nr_segs,
 			xfs_get_blocks_direct,
 			xfs_end_io_direct);
 	} else {
+		iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
 		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 			iomap.iomap_target->bt_bdev,
 			iov, offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 69e9e80735d2..fe4f66a5af14 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1426,7 +1426,7 @@ xfs_free_bufhash(
 /*
  *	buftarg list for delwrite queue processing
  */
-LIST_HEAD(xfs_buftarg_list);
+static LIST_HEAD(xfs_buftarg_list);
 static DEFINE_SPINLOCK(xfs_buftarg_lock);
 
 STATIC void
@@ -1867,3 +1867,11 @@ xfs_buf_terminate(void)
 	ktrace_free(xfs_buf_trace_buf);
 #endif
 }
+
+#ifdef CONFIG_KDB_MODULES
+struct list_head *
+xfs_get_buftarg_list(void)
+{
+	return &xfs_buftarg_list;
+}
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 9e8ef8fef39f..b6241f6201a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -411,6 +411,9 @@ extern void xfs_free_buftarg(xfs_buftarg_t *, int);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+#ifdef CONFIG_KDB_MODULES
+extern struct list_head *xfs_get_buftarg_list(void);
+#endif
 
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index dc0562828e76..2eb87cd082af 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -35,7 +35,7 @@ fs_tosspages(
 		truncate_inode_pages(ip->i_mapping, first);
 }
 
-void
+int
 fs_flushinval_pages(
 	bhv_desc_t	*bdp,
 	xfs_off_t	first,
@@ -44,13 +44,16 @@ fs_flushinval_pages(
 {
 	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
+	int		ret = 0;
 
 	if (VN_CACHED(vp)) {
 		if (VN_TRUNC(vp))
 			VUNTRUNCATE(vp);
-		filemap_write_and_wait(ip->i_mapping);
-		truncate_inode_pages(ip->i_mapping, first);
+		ret = filemap_write_and_wait(ip->i_mapping);
+		if (!ret)
+			truncate_inode_pages(ip->i_mapping, first);
 	}
+	return ret;
 }
 
 int
@@ -63,14 +66,18 @@ fs_flush_pages(
 {
 	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
+	int		ret = 0;
+	int		ret2;
 
 	if (VN_DIRTY(vp)) {
 		if (VN_TRUNC(vp))
 			VUNTRUNCATE(vp);
-		filemap_fdatawrite(ip->i_mapping);
+		ret = filemap_fdatawrite(ip->i_mapping);
 		if (flags & XFS_B_ASYNC)
-			return 0;
-		filemap_fdatawait(ip->i_mapping);
+			return ret;
+		ret2 = filemap_fdatawait(ip->i_mapping);
+		if (!ret)
+			ret = ret2;
 	}
-	return 0;
+	return ret;
 }
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
index aee9ccdd18f7..c1b53118a303 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.h
@@ -23,7 +23,7 @@ extern int  fs_noerr(void);
 extern int  fs_nosys(void);
 extern void fs_noval(void);
 extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern int  fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 extern int  fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
 
 #endif	/* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index ff8d64eba9f8..86fb671a8bcc 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -191,7 +191,7 @@ xfs_read(
 	struct file		*file = iocb->ki_filp;
 	struct inode		*inode = file->f_mapping->host;
 	size_t			size = 0;
-	ssize_t			ret;
+	ssize_t			ret = 0;
 	xfs_fsize_t		n;
 	xfs_inode_t		*ip;
 	xfs_mount_t		*mp;
@@ -224,7 +224,7 @@ xfs_read(
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 		if ((*offset & target->bt_smask) ||
 		    (size & target->bt_smask)) {
-			if (*offset == ip->i_d.di_size) {
+			if (*offset == ip->i_size) {
 				return (0);
 			}
 			return -XFS_ERROR(EINVAL);
@@ -263,9 +263,13 @@ xfs_read(
 
 	if (unlikely(ioflags & IO_ISDIRECT)) {
 		if (VN_CACHED(vp))
-			bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
+			ret = bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
 						 -1, FI_REMAPF_LOCKED);
 		mutex_unlock(&inode->i_mutex);
+		if (ret) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return ret;
+		}
 	}
 
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
@@ -383,9 +387,10 @@ xfs_splice_write(
 {
 	xfs_inode_t		*ip = XFS_BHVTOI(bdp);
 	xfs_mount_t		*mp = ip->i_mount;
+	xfs_iocore_t		*io = &ip->i_iocore;
 	ssize_t			ret;
 	struct inode		*inode = outfilp->f_mapping->host;
-	xfs_fsize_t		isize;
+	xfs_fsize_t		isize, new_size;
 
 	XFS_STATS_INC(xs_write_calls);
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -406,6 +411,14 @@ xfs_splice_write(
 			return -error;
 		}
 	}
+
+	new_size = *ppos + count;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (new_size > ip->i_size)
+		io->io_new_size = new_size;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
 	xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
 			   pipe, count, *ppos, ioflags);
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
@@ -416,14 +429,18 @@ xfs_splice_write(
 	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
 		*ppos = isize;
 
-	if (*ppos > ip->i_d.di_size) {
+	if (*ppos > ip->i_size) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (*ppos > ip->i_d.di_size) {
-			ip->i_d.di_size = *ppos;
-			i_size_write(inode, *ppos);
-			ip->i_update_core = 1;
-			ip->i_update_size = 1;
-		}
+		if (*ppos > ip->i_size)
+			ip->i_size = *ppos;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	if (io->io_new_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		io->io_new_size = 0;
+		if (ip->i_d.di_size > ip->i_size)
+			ip->i_d.di_size = ip->i_size;
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -639,37 +656,21 @@ xfs_write(
 	xfs_fsize_t		isize, new_size;
 	xfs_iocore_t		*io;
 	bhv_vnode_t		*vp;
-	unsigned long		seg;
 	int			iolock;
 	int			eventsent = 0;
 	bhv_vrwlock_t		locktype;
 	size_t			ocount = 0, count;
 	loff_t			pos;
-	int			need_i_mutex = 1, need_flush = 0;
+	int			need_i_mutex;
 
 	XFS_STATS_INC(xs_write_calls);
 
 	vp = BHV_TO_VNODE(bdp);
 	xip = XFS_BHVTOI(bdp);
 
-	for (seg = 0; seg < segs; seg++) {
-		const struct iovec *iv = &iovp[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		ocount += iv->iov_len;
-		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		segs = seg;
-		ocount -= iv->iov_len;  /* This segment is no good */
-		break;
-	}
+	error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
+	if (error)
+		return error;
 
 	count = ocount;
 	pos = *offset;
@@ -685,39 +686,20 @@ xfs_write(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if (ioflags & IO_ISDIRECT) {
-		xfs_buftarg_t	*target =
-			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		if ((pos & target->bt_smask) || (count & target->bt_smask))
-			return XFS_ERROR(-EINVAL);
-
-		if (!VN_CACHED(vp) && pos < i_size_read(inode))
-			need_i_mutex = 0;
-
-		if (VN_CACHED(vp))
-			need_flush = 1;
-	}
-
 relock:
-	if (need_i_mutex) {
+	if (ioflags & IO_ISDIRECT) {
+		iolock = XFS_IOLOCK_SHARED;
+		locktype = VRWLOCK_WRITE_DIRECT;
+		need_i_mutex = 0;
+	} else {
 		iolock = XFS_IOLOCK_EXCL;
 		locktype = VRWLOCK_WRITE;
-
+		need_i_mutex = 1;
 		mutex_lock(&inode->i_mutex);
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-		locktype = VRWLOCK_WRITE_DIRECT;
 	}
 
 	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 
-	isize = i_size_read(inode);
-
-	if (file->f_flags & O_APPEND)
-		*offset = isize;
-
 start:
 	error = -generic_write_checks(file, &pos, &count,
 					S_ISBLK(inode->i_mode));
@@ -726,13 +708,8 @@ start:
 		goto out_unlock_mutex;
 	}
 
-	new_size = pos + count;
-	if (new_size > isize)
-		io->io_new_size = new_size;
-
 	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 	    !(ioflags & IO_INVIS) && !eventsent)) {
-		loff_t		savedsize = pos;
 		int		dmflags = FILP_DELAY_FLAG(file);
 
 		if (need_i_mutex)
@@ -743,8 +720,7 @@ start:
 				      pos, count,
 				      dmflags, &locktype);
 		if (error) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_mutex;
+			goto out_unlock_internal;
 		}
 		xfs_ilock(xip, XFS_ILOCK_EXCL);
 		eventsent = 1;
@@ -756,12 +732,35 @@ start:
 		 * event prevents another call to XFS_SEND_DATA, which is
 		 * what allows the size to change in the first place.
 		 */
-		if ((file->f_flags & O_APPEND) && savedsize != isize) {
-			pos = isize = xip->i_d.di_size;
+		if ((file->f_flags & O_APPEND) && pos != xip->i_size)
+			goto start;
+	}
+
+	if (ioflags & IO_ISDIRECT) {
+		xfs_buftarg_t	*target =
+			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			return XFS_ERROR(-EINVAL);
+		}
+
+		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			iolock = XFS_IOLOCK_EXCL;
+			locktype = VRWLOCK_WRITE;
+			need_i_mutex = 1;
+			mutex_lock(&inode->i_mutex);
+			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 			goto start;
 		}
 	}
 
+	new_size = pos + count;
+	if (new_size > xip->i_size)
+		io->io_new_size = new_size;
+
 	if (likely(!(ioflags & IO_INVIS))) {
 		file_update_time(file);
 		xfs_ichgtime_fast(xip, inode,
@@ -777,11 +776,11 @@ start:
 	 * to zero it out up to the new size.
 	 */
 
-	if (pos > isize) {
-		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize);
+	if (pos > xip->i_size) {
+		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size);
 		if (error) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			goto out_unlock_mutex;
+			xfs_iunlock(xip, XFS_ILOCK_EXCL);
+			goto out_unlock_internal;
 		}
 	}
 	xfs_iunlock(xip, XFS_ILOCK_EXCL);
@@ -801,8 +800,7 @@ start:
 		if (likely(!error))
 			error = -remove_suid(file->f_path.dentry);
 		if (unlikely(error)) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_mutex;
+			goto out_unlock_internal;
 		}
 	}
 
@@ -811,11 +809,14 @@ retry:
 	current->backing_dev_info = mapping->backing_dev_info;
 
 	if ((ioflags & IO_ISDIRECT)) {
-		if (need_flush) {
+		if (VN_CACHED(vp)) {
+			WARN_ON(need_i_mutex == 0);
 			xfs_inval_cached_trace(io, pos, -1,
 					ctooff(offtoct(pos)), -1);
-			bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
+			error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
 					-1, FI_REMAPF_LOCKED);
+			if (error)
+				goto out_unlock_internal;
 		}
 
 		if (need_i_mutex) {
@@ -843,7 +844,6 @@ retry:
 			pos += ret;
 			count -= ret;
 
-			need_i_mutex = 1;
 			ioflags &= ~IO_ISDIRECT;
 			xfs_iunlock(xip, iolock);
 			goto relock;
@@ -870,12 +870,12 @@ retry:
 		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 				0, 0, 0); /* Delay flag intentionally  unused */
-		if (error)
-			goto out_nounlocks;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_rwlock(bdp, locktype);
-		pos = xip->i_d.di_size;
+		if (error)
+			goto out_unlock_internal;
+		pos = xip->i_size;
 		ret = 0;
 		goto retry;
 	}
@@ -884,14 +884,10 @@ retry:
 	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
 		*offset = isize;
 
-	if (*offset > xip->i_d.di_size) {
+	if (*offset > xip->i_size) {
 		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		if (*offset > xip->i_d.di_size) {
-			xip->i_d.di_size = *offset;
-			i_size_write(inode, *offset);
-			xip->i_update_core = 1;
-			xip->i_update_size = 1;
-		}
+		if (*offset > xip->i_size)
+			xip->i_size = *offset;
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 	}
 
@@ -913,16 +909,31 @@ retry:
 
 		error = sync_page_range(inode, mapping, pos, ret);
 		if (!error)
-			error = ret;
-		return error;
+			error = -ret;
+		if (need_i_mutex)
+			mutex_lock(&inode->i_mutex);
+		xfs_rwlock(bdp, locktype);
 	}
 
  out_unlock_internal:
+	if (io->io_new_size) {
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		io->io_new_size = 0;
+		/*
+		 * If this was a direct or synchronous I/O that failed (such
+		 * as ENOSPC) then part of the I/O may have been written to
+		 * disk before the error occured.  In this case the on-disk
+		 * file size may have been adjusted beyond the in-memory file
+		 * size and now needs to be truncated back.
+		 */
+		if (xip->i_d.di_size > xip->i_size)
+			xip->i_d.di_size = xip->i_size;
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+	}
 	xfs_rwunlock(bdp, locktype);
  out_unlock_mutex:
 	if (need_i_mutex)
 		mutex_unlock(&inode->i_mutex);
- out_nounlocks:
 	return -error;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2f2c40db562e..bf9a9d5909be 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -360,9 +360,7 @@ xfs_fs_inode_init_once(
 	kmem_zone_t		*zonep,
 	unsigned long		flags)
 {
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-		      SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
+	inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b76118cf4897..d1b2d01843d1 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -194,7 +194,7 @@ typedef	int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
 typedef void	(*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int);
 typedef void	(*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t);
 typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+typedef int	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
 				uint64_t, int);
 typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 4adaf13aac6f..cfdd35ee9f7a 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -753,8 +753,7 @@ xfs_qm_idtodq(
 		goto error0;
 	}
 	if (tp) {
-		if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
-					     NULL)))
+		if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES)))
 			goto error1;
 	}
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 1de2acdc7f70..3e4a8ad8a34c 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -388,6 +388,17 @@ xfs_qm_mount_quotas(
 			return XFS_ERROR(error);
 		}
 	}
+	/* 
+	 * If one type of quotas is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	if (!XFS_IS_UQUOTA_ON(mp)) {
+		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+	}
+	if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) {
+		mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+	}
 
  write_changes:
 	/*
@@ -1453,8 +1464,7 @@ xfs_qm_qino_alloc(
 	XFS_SB_UNLOCK(mp, s);
 	xfs_mod_sb(tp, sbfields);
 
-	if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
-				     NULL))) {
+	if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
 		xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
 		return error;
 	}
@@ -2405,7 +2415,7 @@ xfs_qm_write_sb_changes(
 	}
 
 	xfs_mod_sb(tp, flags);
-	(void) xfs_trans_commit(tp, 0, NULL);
+	(void) xfs_trans_commit(tp, 0);
 
 	return 0;
 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 716f562aa8b2..2df67fd913e5 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -456,9 +456,7 @@ xfs_qm_scall_quotaon(
 	    ||
 	    ((flags & XFS_PQUOTA_ACCT) == 0 &&
 	    (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-	    (flags & XFS_OQUOTA_ENFD))
-	    ||
-	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
+	    (flags & XFS_GQUOTA_ACCT) == 0 &&
 	    (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
 	    (flags & XFS_OQUOTA_ENFD))) {
 		qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
@@ -735,7 +733,7 @@ xfs_qm_scall_setqlim(
 	xfs_trans_log_dquot(tp, dqp);
 
 	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
-	xfs_trans_commit(tp, 0, NULL);
+	xfs_trans_commit(tp, 0);
 	xfs_qm_dqprint(dqp);
 	xfs_qm_dqrele(dqp);
 	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
@@ -809,7 +807,7 @@ xfs_qm_log_quotaoff_end(
 	 * We don't care about quotoff's performance.
 	 */
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 	return (error);
 }
 
@@ -852,7 +850,7 @@ xfs_qm_log_quotaoff(
 	 * We don't care about quotoff's performance.
 	 */
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 
 error0:
 	if (error) {
@@ -911,14 +909,19 @@ xfs_qm_export_dquot(
 	 * gets turned off. No need to confuse the user level code,
 	 * so return zeroes in that case.
 	 */
-	if (! XFS_IS_QUOTA_ENFORCED(mp)) {
+	if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
+	    (!XFS_IS_OQUOTA_ENFORCED(mp) &&
+			(src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
 		dst->d_btimer = 0;
 		dst->d_itimer = 0;
 		dst->d_rtbtimer = 0;
 	}
 
 #ifdef DEBUG
-	if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
+	if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) ||
+	     (XFS_IS_OQUOTA_ENFORCED(mp) &&
+			(dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) &&
+	    dst->d_id != 0) {
 		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
 		    (dst->d_blk_softlimit > 0)) {
 			ASSERT(dst->d_btimer != 0);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index d7491e7b1f3b..7de6874bf1b8 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -656,7 +656,9 @@ xfs_trans_dqresv(
 
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 	    dqp->q_core.d_id &&
-	    XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) {
+	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
+	     (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
+	      (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
 #ifdef QUOTADEBUG
 		cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld"
 			  " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit);
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 08bbd3cb87ae..f45a49ffd3a3 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -81,20 +81,3 @@ assfail(char *expr, char *file, int line)
 	printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
 	BUG();
 }
-
-#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
-unsigned long random(void)
-{
-	static unsigned long	RandomValue = 1;
-	/* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
-	register long	rv = RandomValue;
-	register long	lo;
-	register long	hi;
-
-	hi = rv / 127773;
-	lo = rv % 127773;
-	rv = 16807 * lo - 2836 * hi;
-	if (rv <= 0) rv += 2147483647;
-	return RandomValue = rv;
-}
-#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 2a70cc605ae3..a27a7c8c0526 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -50,7 +50,7 @@ extern void assfail(char *expr, char *f, int l);
 #else /* DEBUG */
 
 # define ASSERT(expr)	ASSERT_ALWAYS(expr)
-extern unsigned long random(void);
+# include <linux/random.h>
 
 #ifndef STATIC
 # define STATIC noinline
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index e80dda3437d1..8e9a40aa0cd3 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -764,7 +764,7 @@ xfs_alloc_ag_vextent_near(
 	 */
 	int		dofirst;	/* set to do first algorithm */
 
-	dofirst = random() & 1;
+	dofirst = random32() & 1;
 #endif
 	/*
 	 * Get a cursor for the by-size btree.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 9d358ffce4e5..7ce44a7b88a2 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -328,8 +328,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
 				xfs_trans_set_sync(args.trans);
 			}
 			err2 = xfs_trans_commit(args.trans,
-						 XFS_TRANS_RELEASE_LOG_RES,
-						 NULL);
+						 XFS_TRANS_RELEASE_LOG_RES);
 			xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 			/*
@@ -397,8 +396,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
 	 * Commit the last in the sequence of transactions.
 	 */
 	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
-				 NULL);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 	/*
@@ -544,8 +542,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags)
 	 * Commit the last in the sequence of transactions.
 	 */
 	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
-				 NULL);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 	/*
@@ -859,8 +856,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	 * Commit the last in the sequence of transactions.
 	 */
 	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES,
-				 NULL);
+	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
 	return(error);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 8eab73e8340a..81f45dae1c57 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -3053,7 +3053,7 @@ xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
 	 * is in progress. The caller takes the responsibility to cancel
 	 * the duplicate transaction that gets returned.
 	 */
-	if ((error = xfs_trans_commit(trans, 0, NULL)))
+	if ((error = xfs_trans_commit(trans, 0)))
 		return (error);
 
 	trans = *transp;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 87795188cedf..b1ea26e40aaf 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -130,7 +130,6 @@ STATIC int				/* error */
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
 	xfs_extnum_t		idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
 	xfs_extdelta_t		*delta, /* Change made to incore extents */
@@ -399,7 +398,6 @@ xfs_bmap_count_leaves(
 
 STATIC int
 xfs_bmap_disk_count_leaves(
-	xfs_ifork_t		*ifp,
 	xfs_extnum_t		idx,
 	xfs_bmbt_block_t	*block,
 	int			numrecs,
@@ -580,7 +578,7 @@ xfs_bmap_add_extent(
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
-		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
+		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
 				&logflags, delta, rsvd)))
 			goto done;
 	}
@@ -1841,7 +1839,6 @@ STATIC int				/* error */
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
 	xfs_extnum_t		idx,	/* extent number to update/insert */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
 	xfs_extdelta_t		*delta, /* Change made to incore extents */
@@ -4071,7 +4068,7 @@ xfs_bmap_add_attrfork(
 	}
 	if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
 		goto error2;
-	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES);
 	ASSERT(ip->i_df.if_ext_max ==
 	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 	return error;
@@ -4227,7 +4224,7 @@ xfs_bmap_finish(
 	logres = ntp->t_log_res;
 	logcount = ntp->t_log_count;
 	ntp = xfs_trans_dup(*tp);
-	error = xfs_trans_commit(*tp, 0, NULL);
+	error = xfs_trans_commit(*tp, 0);
 	*tp = ntp;
 	*committed = 1;
 	/*
@@ -4447,8 +4444,11 @@ xfs_bmap_one_block(
 	xfs_bmbt_irec_t	s;		/* internal version of extent */
 
 #ifndef DEBUG
-	if (whichfork == XFS_DATA_FORK)
-		return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
+	if (whichfork == XFS_DATA_FORK) {
+		return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ?
+			(ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
+			(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
+	}
 #endif	/* !DEBUG */
 	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
 		return 0;
@@ -4460,7 +4460,7 @@ xfs_bmap_one_block(
 	xfs_bmbt_get_all(ep, &s);
 	rval = s.br_startoff == 0 && s.br_blockcount == 1;
 	if (rval && whichfork == XFS_DATA_FORK)
-		ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
+		ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
 	return rval;
 }
 
@@ -5820,7 +5820,7 @@ xfs_getbmap(
 			fixlen = XFS_MAXIOFFSET(mp);
 		} else {
 			prealloced = 0;
-			fixlen = ip->i_d.di_size;
+			fixlen = ip->i_size;
 		}
 	} else {
 		prealloced = 0;
@@ -5844,7 +5844,8 @@ xfs_getbmap(
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
+	if (whichfork == XFS_DATA_FORK &&
+		(ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
 		error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
 	}
@@ -6425,8 +6426,8 @@ xfs_bmap_count_tree(
 		for (;;) {
 			nextbno = be64_to_cpu(block->bb_rightsib);
 			numrecs = be16_to_cpu(block->bb_numrecs);
-			if (unlikely(xfs_bmap_disk_count_leaves(ifp,
-					0, block, numrecs, count) < 0)) {
+			if (unlikely(xfs_bmap_disk_count_leaves(0,
+					block, numrecs, count) < 0)) {
 				xfs_trans_brelse(tp, bp);
 				XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
 						 XFS_ERRLEVEL_LOW, mp);
@@ -6472,7 +6473,6 @@ xfs_bmap_count_leaves(
  */
 int
 xfs_bmap_disk_count_leaves(
-	xfs_ifork_t		*ifp,
 	xfs_extnum_t		idx,
 	xfs_bmbt_block_t	*block,
 	int			numrecs,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b847e6a7a3f0..de35d18cc002 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -199,7 +199,9 @@ xfs_swap_extents(
 
 	if (VN_CACHED(tvp) != 0) {
 		xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1);
-		bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED);
+		error = bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED);
+		if (error)
+			goto error0;
 	}
 
 	/* Verify O_DIRECT for ftmp */
@@ -382,7 +384,7 @@ xfs_swap_extents(
 		xfs_trans_set_sync(tp);
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
 	locked = 0;
 
  error0:
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9d7438bba30d..3accc1dcd6c9 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -282,8 +282,7 @@ xfs_dir2_block_addname(
 		 * This needs to happen before the next call to use_free.
 		 */
 		if (needscan) {
-			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
-				&needlog, NULL);
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
 			needscan = 0;
 		}
 	}
@@ -333,7 +332,7 @@ xfs_dir2_block_addname(
 		 */
 		if (needscan) {
 			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
-				&needlog, NULL);
+				&needlog);
 			needscan = 0;
 		}
 		/*
@@ -418,8 +417,7 @@ xfs_dir2_block_addname(
 	 * Clean up the bestfree array and log the header, tail, and entry.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
-			NULL);
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, bp);
 	xfs_dir2_block_log_tail(tp, bp);
@@ -798,8 +796,7 @@ xfs_dir2_block_removename(
 	 * Fix up bestfree, log the header if necessary.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
-			NULL);
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, bp);
 	xfs_dir2_data_check(dp, bp);
@@ -996,8 +993,7 @@ xfs_dir2_leaf_to_block(
 	 * Scan the bestfree if we need it and log the data block header.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
-			NULL);
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
 	/*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index f7c799217072..c211c37ef67c 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -324,8 +324,7 @@ void
 xfs_dir2_data_freescan(
 	xfs_mount_t		*mp,		/* filesystem mount point */
 	xfs_dir2_data_t		*d,		/* data block pointer */
-	int			*loghead,	/* out: log data header */
-	char			*aendp)		/* in: caller's endp */
+	int			*loghead)	/* out: log data header */
 {
 	xfs_dir2_block_tail_t	*btp;		/* block tail */
 	xfs_dir2_data_entry_t	*dep;		/* active data entry */
@@ -346,9 +345,7 @@ xfs_dir2_data_freescan(
 	 * Set up pointers.
 	 */
 	p = (char *)d->u;
-	if (aendp)
-		endp = aendp;
-	else if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
+	if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
 		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
 		endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
 	} else
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index a6ae2d21c40a..c94c9099cfb1 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -166,7 +166,7 @@ extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d,
 extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
 				xfs_dir2_data_unused_t *dup, int *loghead);
 extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
-				int *loghead, char *aendp);
+				int *loghead);
 extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
 				struct xfs_dabuf **bpp);
 extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index b1cf1fbf423d..db14ea71459f 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -133,8 +133,7 @@ xfs_dir2_block_to_leaf(
 	 */
 	block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
 	if (needscan)
-		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
-			NULL);
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
 	/*
 	 * Set up leaf tail and bests table.
 	 */
@@ -414,7 +413,7 @@ xfs_dir2_leaf_addname(
 	 * Need to scan fix up the bestfree table.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+		xfs_dir2_data_freescan(mp, data, &needlog);
 	/*
 	 * Need to log the data block's header.
 	 */
@@ -1496,7 +1495,7 @@ xfs_dir2_leaf_removename(
 	 * log the data block header if necessary.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+		xfs_dir2_data_freescan(mp, data, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
 	/*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 9ca71719b683..d083c3819934 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -904,7 +904,7 @@ xfs_dir2_leafn_remove(
 	 * Log the data block header if needed.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+		xfs_dir2_data_freescan(mp, data, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
 	xfs_dir2_data_check(dp, dbp);
@@ -1705,7 +1705,7 @@ xfs_dir2_node_addname_int(
 	 * Rescan the block for bestfree if needed.
 	 */
 	if (needscan)
-		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+		xfs_dir2_data_freescan(mp, data, &needlog);
 	/*
 	 * Log the data block header if needed.
 	 */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b1af54464f00..8c4331631337 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -80,7 +80,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
 	int i;
 	int64_t fsid;
 
-	if (random() % randfactor)
+	if (random32() % randfactor)
 		return 0;
 
 	memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 32c37c1c47ab..b599e6be9ec1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -346,7 +346,7 @@ xfs_growfs_data_private(
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 	if (error) {
 		return error;
 	}
@@ -605,7 +605,7 @@ xfs_fs_log_dummy(
 	xfs_trans_ihold(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_set_sync(tp);
-	xfs_trans_commit(tp, 0, NULL);
+	xfs_trans_commit(tp, 0);
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index c1c89dac19cc..114433a22baa 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -879,17 +879,17 @@ xfs_ilock(xfs_inode_t	*ip,
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL) {
-		mrupdate(&ip->i_iolock);
+		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		mraccess(&ip->i_iolock);
+		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 	}
 	if (lock_flags & XFS_ILOCK_EXCL) {
-		mrupdate(&ip->i_lock);
+		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	} else if (lock_flags & XFS_ILOCK_SHARED) {
-		mraccess(&ip->i_lock);
+		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	}
 	xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address);
 }
@@ -923,7 +923,7 @@ xfs_ilock_nowait(xfs_inode_t	*ip,
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 
 	iolocked = 0;
 	if (lock_flags & XFS_IOLOCK_EXCL) {
@@ -983,7 +983,8 @@ xfs_iunlock(xfs_inode_t	*ip,
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0);
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
+			XFS_LOCK_DEP_MASK)) == 0);
 	ASSERT(lock_flags != 0);
 
 	if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3da9829c19d5..3ca5d43b8345 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -442,6 +442,7 @@ xfs_iformat(
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 		ip->i_d.di_size = 0;
+		ip->i_size = 0;
 		ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
 		break;
 
@@ -980,6 +981,7 @@ xfs_iread(
 	}
 
 	ip->i_delayed_blks = 0;
+	ip->i_size = ip->i_d.di_size;
 
 	/*
 	 * Mark the buffer containing the inode as something to keep
@@ -1170,6 +1172,7 @@ xfs_ialloc(
 	}
 
 	ip->i_d.di_size = 0;
+	ip->i_size = 0;
 	ip->i_d.di_nextents = 0;
 	ASSERT(ip->i_d.di_nblocks == 0);
 	xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
@@ -1340,7 +1343,7 @@ xfs_file_last_byte(
 	} else {
 		last_block = 0;
 	}
-	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
+	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
 	last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
 
 	last_byte = XFS_FSB_TO_B(mp, last_block);
@@ -1421,7 +1424,7 @@ xfs_itrunc_trace(
  * must be called again with all the same restrictions as the initial
  * call.
  */
-void
+int
 xfs_itruncate_start(
 	xfs_inode_t	*ip,
 	uint		flags,
@@ -1431,9 +1434,10 @@ xfs_itruncate_start(
 	xfs_off_t	toss_start;
 	xfs_mount_t	*mp;
 	bhv_vnode_t	*vp;
+	int		error = 0;
 
 	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
-	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
+	ASSERT((new_size == 0) || (new_size <= ip->i_size));
 	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
 	       (flags == XFS_ITRUNC_MAYBE));
 
@@ -1468,7 +1472,7 @@ xfs_itruncate_start(
 		 * file size, so there is no way that the data extended
 		 * out there.
 		 */
-		return;
+		return 0;
 	}
 	last_byte = xfs_file_last_byte(ip);
 	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
@@ -1477,7 +1481,7 @@ xfs_itruncate_start(
 		if (flags & XFS_ITRUNC_DEFINITE) {
 			bhv_vop_toss_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
 		} else {
-			bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
+			error = bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
 		}
 	}
 
@@ -1486,6 +1490,7 @@ xfs_itruncate_start(
 		ASSERT(VN_CACHED(vp) == 0);
 	}
 #endif
+	return error;
 }
 
 /*
@@ -1556,7 +1561,7 @@ xfs_itruncate_finish(
 
 	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
-	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
+	ASSERT((new_size == 0) || (new_size <= ip->i_size));
 	ASSERT(*tp != NULL);
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(ip->i_transp == *tp);
@@ -1630,8 +1635,20 @@ xfs_itruncate_finish(
 	 */
 	if (fork == XFS_DATA_FORK) {
 		if (ip->i_d.di_nextents > 0) {
-			ip->i_d.di_size = new_size;
-			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+			/*
+			 * If we are not changing the file size then do
+			 * not update the on-disk file size - we may be
+			 * called from xfs_inactive_free_eofblocks().  If we
+			 * update the on-disk file size and then the system
+			 * crashes before the contents of the file are
+			 * flushed to disk then the files may be full of
+			 * holes (ie NULL files bug).
+			 */
+			if (ip->i_size != new_size) {
+				ip->i_d.di_size = new_size;
+				ip->i_size = new_size;
+				xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+			}
 		}
 	} else if (sync) {
 		ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
@@ -1746,7 +1763,7 @@ xfs_itruncate_finish(
 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
 		}
 		ntp = xfs_trans_dup(ntp);
-		(void) xfs_trans_commit(*tp, 0, NULL);
+		(void) xfs_trans_commit(*tp, 0);
 		*tp = ntp;
 		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 					  XFS_TRANS_PERM_LOG_RES,
@@ -1767,7 +1784,19 @@ xfs_itruncate_finish(
 	 */
 	if (fork == XFS_DATA_FORK) {
 		xfs_isize_check(mp, ip, new_size);
-		ip->i_d.di_size = new_size;
+		/*
+		 * If we are not changing the file size then do
+		 * not update the on-disk file size - we may be
+		 * called from xfs_inactive_free_eofblocks().  If we
+		 * update the on-disk file size and then the system
+		 * crashes before the contents of the file are
+		 * flushed to disk then the files may be full of
+		 * holes (ie NULL files bug).
+		 */
+		if (ip->i_size != new_size) {
+			ip->i_d.di_size = new_size;
+			ip->i_size = new_size;
+		}
 	}
 	xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
 	ASSERT((new_size != 0) ||
@@ -1800,7 +1829,7 @@ xfs_igrow_start(
 
 	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
 	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
-	ASSERT(new_size > ip->i_d.di_size);
+	ASSERT(new_size > ip->i_size);
 
 	/*
 	 * Zero any pages that may have been created by
@@ -1808,7 +1837,7 @@ xfs_igrow_start(
 	 * and any blocks between the old and new file sizes.
 	 */
 	error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size,
-			     ip->i_d.di_size);
+			     ip->i_size);
 	return error;
 }
 
@@ -1832,13 +1861,14 @@ xfs_igrow_finish(
 	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
 	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
 	ASSERT(ip->i_transp == tp);
-	ASSERT(new_size > ip->i_d.di_size);
+	ASSERT(new_size > ip->i_size);
 
 	/*
 	 * Update the file size.  Update the inode change timestamp
 	 * if change_flag set.
 	 */
 	ip->i_d.di_size = new_size;
+	ip->i_size = new_size;
 	if (change_flag)
 		xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -2321,7 +2351,7 @@ xfs_ifree(
 	ASSERT(ip->i_d.di_nlink == 0);
 	ASSERT(ip->i_d.di_nextents == 0);
 	ASSERT(ip->i_d.di_anextents == 0);
-	ASSERT((ip->i_d.di_size == 0) ||
+	ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
 	       ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
 	ASSERT(ip->i_d.di_nblocks == 0);
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index bc823720d88f..f75afecef8e7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -287,6 +287,7 @@ typedef struct xfs_inode {
 	struct xfs_inode	*i_cnext;	/* cluster hash link forward */
 	struct xfs_inode	*i_cprev;	/* cluster hash link backward */
 
+	xfs_fsize_t		i_size;		/* in-memory size */
 	/* Trace buffers per inode. */
 #ifdef XFS_BMAP_TRACE
 	struct ktrace		*i_xtrace;	/* inode extent list trace */
@@ -305,6 +306,8 @@ typedef struct xfs_inode {
 #endif
 } xfs_inode_t;
 
+#define XFS_ISIZE(ip)	(((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
+				(ip)->i_size : (ip)->i_d.di_size;
 
 /*
  * i_flags helper functions
@@ -379,26 +382,58 @@ xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
 
 /*
  * Flags for inode locking.
+ * Bit ranges:	1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
+ *		1<<16 - 1<<32-1 -- lockdep annotation (integers)
  */
-#define	XFS_IOLOCK_EXCL		0x001
-#define	XFS_IOLOCK_SHARED	0x002
-#define	XFS_ILOCK_EXCL		0x004
-#define	XFS_ILOCK_SHARED	0x008
-#define	XFS_IUNLOCK_NONOTIFY	0x010
-/*	XFS_IOLOCK_NESTED	0x020 */
-#define XFS_EXTENT_TOKEN_RD	0x040
-#define XFS_SIZE_TOKEN_RD	0x080
+#define	XFS_IOLOCK_EXCL		(1<<0)
+#define	XFS_IOLOCK_SHARED	(1<<1)
+#define	XFS_ILOCK_EXCL		(1<<2)
+#define	XFS_ILOCK_SHARED	(1<<3)
+#define	XFS_IUNLOCK_NONOTIFY	(1<<4)
+/*	#define XFS_IOLOCK_NESTED	(1<<5)	*/
+#define XFS_EXTENT_TOKEN_RD	(1<<6)
+#define XFS_SIZE_TOKEN_RD	(1<<7)
 #define XFS_EXTSIZE_RD		(XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD)
-#define XFS_WILLLEND		0x100	/* Always acquire tokens for lending */
+#define XFS_WILLLEND		(1<<8)	/* Always acquire tokens for lending */
 #define XFS_EXTENT_TOKEN_WR	(XFS_EXTENT_TOKEN_RD | XFS_WILLLEND)
 #define XFS_SIZE_TOKEN_WR       (XFS_SIZE_TOKEN_RD | XFS_WILLLEND)
 #define XFS_EXTSIZE_WR		(XFS_EXTSIZE_RD | XFS_WILLLEND)
-/*	XFS_SIZE_TOKEN_WANT	0x200 */
+/* TODO:XFS_SIZE_TOKEN_WANT	(1<<9) */
 
-#define XFS_LOCK_MASK	\
-	(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \
-	 XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \
-	 XFS_WILLLEND)
+#define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
+				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+				| XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD \
+				| XFS_WILLLEND)
+
+/*
+ * Flags for lockdep annotations.
+ *
+ * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes
+ * (ie directory operations that require locking a directory inode and
+ * an entry inode).  The first inode gets locked with this flag so it
+ * gets a lockdep subclass of 1 and the second lock will have a lockdep
+ * subclass of 0.
+ *
+ * XFS_I[O]LOCK_INUMORDER - for locking several inodes at the some time
+ * with xfs_lock_inodes().  This flag is used as the starting subclass
+ * and each subsequent lock acquired will increment the subclass by one.
+ * So the first lock acquired will have a lockdep subclass of 2, the
+ * second lock will have a lockdep subclass of 3, and so on.
+ */
+#define XFS_IOLOCK_SHIFT	16
+#define	XFS_IOLOCK_PARENT	(1 << XFS_IOLOCK_SHIFT)
+#define	XFS_IOLOCK_INUMORDER	(2 << XFS_IOLOCK_SHIFT)
+
+#define XFS_ILOCK_SHIFT		24
+#define	XFS_ILOCK_PARENT	(1 << XFS_ILOCK_SHIFT)
+#define	XFS_ILOCK_INUMORDER	(2 << XFS_ILOCK_SHIFT)
+
+#define XFS_IOLOCK_DEP_MASK	0x00ff0000
+#define XFS_ILOCK_DEP_MASK	0xff000000
+#define XFS_LOCK_DEP_MASK	(XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
+
+#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
 /*
  * Flags for xfs_iflush()
@@ -481,7 +516,7 @@ uint		xfs_ip2xflags(struct xfs_inode *);
 uint		xfs_dic2xflags(struct xfs_dinode_core *);
 int		xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_bmap_free *);
-void		xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
+int		xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
 int		xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
 				     xfs_fsize_t, int, int);
 int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
index 06d710c9ce4b..81548ec72ba6 100644
--- a/fs/xfs/xfs_iocore.c
+++ b/fs/xfs/xfs_iocore.c
@@ -52,7 +52,7 @@ STATIC xfs_fsize_t
 xfs_size_fn(
 	xfs_inode_t		*ip)
 {
-	return (ip->i_d.di_size);
+	return XFS_ISIZE(ip);
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index cc6a7b5a9912..3f2b9f2a7b94 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -458,7 +458,7 @@ xfs_iomap_write_direct(
 		extsz = ip->i_d.di_extsize;
 	}
 
-	isize = ip->i_d.di_size;
+	isize = ip->i_size;
 	if (io->io_new_size > isize)
 		isize = io->io_new_size;
 
@@ -524,7 +524,7 @@ xfs_iomap_write_direct(
 	xfs_trans_ihold(tp, ip);
 
 	bmapi_flag = XFS_BMAPI_WRITE;
-	if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz))
+	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
@@ -543,7 +543,7 @@ xfs_iomap_write_direct(
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
 		goto error0;
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error)
 		goto error_out;
 
@@ -676,7 +676,7 @@ xfs_iomap_write_delay(
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
 retry:
-	isize = ip->i_d.di_size;
+	isize = ip->i_size;
 	if (io->io_new_size > isize)
 		isize = io->io_new_size;
 
@@ -817,7 +817,7 @@ xfs_iomap_write_allocate(
 			 * we dropped the ilock in the interim.
 			 */
 
-			end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
+			end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
 			xfs_bmap_last_offset(NULL, ip, &last_block,
 				XFS_DATA_FORK);
 			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
@@ -840,8 +840,7 @@ xfs_iomap_write_allocate(
 			if (error)
 				goto trans_cancel;
 
-			error = xfs_trans_commit(tp,
-					XFS_TRANS_RELEASE_LOG_RES, NULL);
+			error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 			if (error)
 				goto error0;
 
@@ -948,7 +947,7 @@ xfs_iomap_write_unwritten(
 		if (error)
 			goto error_on_bmapi_transaction;
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
 			return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 3ce204a524b0..df441ee936b2 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -22,6 +22,7 @@
 
 
 typedef enum {				/* iomap_flags values */
+	IOMAP_READ =		0,	/* mapping for a read */
 	IOMAP_EOF =		0x01,	/* mapping contains EOF   */
 	IOMAP_HOLE =		0x02,	/* mapping covers a hole  */
 	IOMAP_DELAY =		0x04,	/* mapping covers delalloc region  */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7775ddc0b3c6..e725ddd3de5f 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -809,7 +809,7 @@ xfs_inumbers(
 				xfs_buf_relse(agbp);
 				agbp = NULL;
 				/*
-				 * Move up the the last inode in the current
+				 * Move up the last inode in the current
 				 * chunk.  The lookup_ge will always get
 				 * us the first inode in the next chunk.
 				 */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index ca74d3f5910e..080fabf61c92 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1509,7 +1509,6 @@ xlog_recover_insert_item_frontq(
 
 STATIC int
 xlog_recover_reorder_trans(
-	xlog_t			*log,
 	xlog_recover_t		*trans)
 {
 	xlog_recover_item_t	*first_item, *itemq, *itemq_next;
@@ -1867,7 +1866,6 @@ xlog_recover_do_inode_buffer(
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
-	xfs_mount_t		*mp,
 	xlog_recover_item_t	*item,
 	xfs_buf_t		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -2083,7 +2081,7 @@ xlog_recover_do_dquot_buffer(
 	if (log->l_quotaoffs_flag & type)
 		return;
 
-	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	xlog_recover_do_reg_buffer(item, bp, buf_f);
 }
 
 /*
@@ -2184,7 +2182,7 @@ xlog_recover_do_buffer_trans(
 		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
-		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+		xlog_recover_do_reg_buffer(item, bp, buf_f);
 	}
 	if (error)
 		return XFS_ERROR(error);
@@ -2765,7 +2763,7 @@ xlog_recover_do_trans(
 	int			error = 0;
 	xlog_recover_item_t	*item, *first_item;
 
-	if ((error = xlog_recover_reorder_trans(log, trans)))
+	if ((error = xlog_recover_reorder_trans(trans)))
 		return error;
 	first_item = item = trans->r_itemq;
 	do {
@@ -3016,7 +3014,7 @@ xlog_recover_process_efi(
 	}
 
 	efip->efi_flags |= XFS_EFI_RECOVERED;
-	xfs_trans_commit(tp, 0, NULL);
+	xfs_trans_commit(tp, 0);
 }
 
 /*
@@ -3143,7 +3141,7 @@ xlog_recover_clear_agi_bucket(
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 
-	(void) xfs_trans_commit(tp, 0, NULL);
+	(void) xfs_trans_commit(tp, 0);
 }
 
 /*
@@ -3886,8 +3884,7 @@ xlog_recover(
 		 * under the vfs layer, so we can get away with it unless
 		 * the device itself is read-only, in which case we fail.
 		 */
-		if ((error = xfs_dev_is_read_only(log->l_mp,
-						"recovery required"))) {
+		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
 			return error;
 		}
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3bed0cf0d8af..a96bde6df96d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1653,7 +1653,7 @@ xfs_mount_log_sbunit(
 		return;
 	}
 	xfs_mod_sb(tp, fields);
-	xfs_trans_commit(tp, 0, NULL);
+	xfs_trans_commit(tp, 0);
 }
 
 
@@ -1734,11 +1734,13 @@ xfs_icsb_cpu_notify(
 			per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
 	switch (action) {
 	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		/* Easy Case - initialize the area and locks, and
 		 * then rebalance when online does everything else for us. */
 		memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
 		break;
 	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
 		xfs_icsb_lock(mp);
 		xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
 		xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
@@ -1746,6 +1748,7 @@ xfs_icsb_cpu_notify(
 		xfs_icsb_unlock(mp);
 		break;
 	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		/* Disable all the counters, then fold the dead cpu's
 		 * count into the total on the global superblock and
 		 * re-enable the counters. */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 320d63ff9ca2..0d594ed7efef 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -78,7 +78,7 @@ xfs_mount_reset_sbqflags(xfs_mount_t *mp)
 		return error;
 	}
 	xfs_mod_sb(tp, XFS_SB_QFLAGS);
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9dcb32aa4e2e..6f14df976f73 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -154,10 +154,11 @@ typedef struct xfs_qoff_logformat {
 #define XFS_ALL_QUOTA_CHKD	(XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
 
 #define XFS_IS_QUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_QUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
 #define XFS_IS_UQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_UQUOTA_ACCT)
 #define XFS_IS_PQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_PQUOTA_ACCT)
 #define XFS_IS_GQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_OQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_OQUOTA_ENFD)
 
 /*
  * Incore only flags for quotaoff - these bits get cleared when quota(s)
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 4c6573d784cd..7679d7a7022d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -584,7 +584,7 @@ xfs_rename(
 	 * trans_commit will unlock src_ip, target_ip & decrement
 	 * the vnode references.
 	 */
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (target_ip != NULL) {
 		xfs_refcache_purge_ip(target_ip);
 		IRELE(target_ip);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6fff19dc3cf9..b3a5f07bd073 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -150,7 +150,7 @@ xfs_growfs_rt_alloc(
 		error = xfs_bmap_finish(&tp, &flist, &committed);
 		if (error)
 			goto error_exit;
-		xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		/*
 		 * Now we need to clear the allocated blocks.
 		 * Do this one block per transaction, to keep it simple.
@@ -187,7 +187,7 @@ xfs_growfs_rt_alloc(
 			/*
 			 * Commit the transaction.
 			 */
-			xfs_trans_commit(tp, 0, NULL);
+			xfs_trans_commit(tp, 0);
 		}
 		/*
 		 * Go on to the next extent, if any.
@@ -2042,7 +2042,7 @@ xfs_growfs_rt(
 		/*
 		 * Commit the transaction.
 		 */
-		xfs_trans_commit(tp, 0, NULL);
+		xfs_trans_commit(tp, 0);
 	}
 
 	if (error)
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 1ea7c0ca6ae0..905d1c008be7 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -83,7 +83,7 @@ xfs_write_clear_setuid(
 	}
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return 0;
 }
@@ -164,7 +164,7 @@ xfs_write_sync_logforce(
 			xfs_trans_ihold(tp, ip);
 			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0, NULL);
+			error = xfs_trans_commit(tp, 0);
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
 	}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 301ff9445b6f..cc2d60951e21 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -753,7 +753,6 @@ int
 _xfs_trans_commit(
 	xfs_trans_t	*tp,
 	uint		flags,
-	xfs_lsn_t	*commit_lsn_p,
 	int		*log_flushed)
 {
 	xfs_log_iovec_t		*log_vector;
@@ -812,8 +811,6 @@ shut_us_down:
 		xfs_trans_free_busy(tp);
 		xfs_trans_free(tp);
 		XFS_STATS_INC(xs_trans_empty);
-		if (commit_lsn_p)
-			*commit_lsn_p = commit_lsn;
 		return (shutdown);
 	}
 	ASSERT(tp->t_ticket != NULL);
@@ -864,9 +861,6 @@ shut_us_down:
 		kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
 	}
 
-	if (commit_lsn_p)
-		*commit_lsn_p = commit_lsn;
-
 	/*
 	 * If we got a log write error. Unpin the logitems that we
 	 * had pinned, clean up, free trans structure, and return error.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f1d7ab236726..7dfcc450366f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -988,10 +988,8 @@ void		xfs_trans_log_efd_extent(xfs_trans_t *,
 					 xfs_extlen_t);
 int		_xfs_trans_commit(xfs_trans_t *,
 				  uint flags,
-				  xfs_lsn_t *,
 				  int *);
-#define xfs_trans_commit(tp, flags, lsn) \
-	_xfs_trans_commit(tp, flags, lsn, NULL)
+#define xfs_trans_commit(tp, flags)	_xfs_trans_commit(tp, flags, NULL)
 void		xfs_trans_cancel(xfs_trans_t *, int);
 void		xfs_trans_ail_init(struct xfs_mount *);
 xfs_lsn_t	xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 9014d7e44488..20ffec308e1e 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -222,7 +222,7 @@ xfs_dir_ialloc(
 		}
 
 		ntp = xfs_trans_dup(tp);
-		code = xfs_trans_commit(tp, 0, NULL);
+		code = xfs_trans_commit(tp, 0);
 		tp = ntp;
 		if (committed != NULL) {
 			*committed = 1;
@@ -420,7 +420,11 @@ xfs_truncate_file(
 	 * in a transaction.
 	 */
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
+	error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
+	if (error) {
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return error;
+	}
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
 	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
@@ -460,8 +464,7 @@ xfs_truncate_file(
 				 XFS_TRANS_ABORT);
 	} else {
 		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
-					 NULL);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	}
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 29f72f613782..65c561201cb8 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -696,7 +696,7 @@ xfs_unmount_flush(
 	bhv_vnode_t	*rvp = XFS_ITOV(rip);
 	int		error;
 
-	xfs_ilock(rip, XFS_ILOCK_EXCL);
+	xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	xfs_iflock(rip);
 
 	/*
@@ -1147,7 +1147,7 @@ xfs_sync_inodes(
 			if (XFS_FORCED_SHUTDOWN(mp)) {
 				bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
 			} else {
-				bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
+				error = bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
 			}
 
 			xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1539,7 +1539,7 @@ xfs_syncsub(
 		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 		xfs_trans_ihold(tp, ip);
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		error = xfs_trans_commit(tp, 0, NULL);
+		error = xfs_trans_commit(tp, 0);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
 	}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 52c41714ec54..de17aed578f0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -133,7 +133,7 @@ xfs_getattr(
 	if (!(flags & ATTR_LAZY))
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	vap->va_size = ip->i_d.di_size;
+	vap->va_size = XFS_ISIZE(ip);
 	if (vap->va_mask == XFS_AT_SIZE)
 		goto all_done;
 
@@ -496,7 +496,7 @@ xfs_setattr(
 	if (mask & XFS_AT_SIZE) {
 		/* Short circuit the truncate case for zero length files */
 		if ((vap->va_size == 0) &&
-		   (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
+		   (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
 			if (mask & XFS_AT_CTIME)
@@ -614,7 +614,7 @@ xfs_setattr(
 	 */
 	if (mask & XFS_AT_SIZE) {
 		code = 0;
-		if ((vap->va_size > ip->i_d.di_size) && 
+		if ((vap->va_size > ip->i_size) &&
 		    (flags & ATTR_NOSIZETOK) == 0) {
 			code = xfs_igrow_start(ip, vap->va_size, credp);
 		}
@@ -654,10 +654,10 @@ xfs_setattr(
 	 * Truncate file.  Must have write permission and not be a directory.
 	 */
 	if (mask & XFS_AT_SIZE) {
-		if (vap->va_size > ip->i_d.di_size) {
+		if (vap->va_size > ip->i_size) {
 			xfs_igrow_finish(tp, ip, vap->va_size,
 			    !(flags & ATTR_DMI));
-		} else if ((vap->va_size <= ip->i_d.di_size) ||
+		} else if ((vap->va_size <= ip->i_size) ||
 			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 			/*
 			 * signal a sync transaction unless
@@ -873,7 +873,7 @@ xfs_setattr(
 		if (mp->m_flags & XFS_MOUNT_WSYNC)
 			xfs_trans_set_sync(tp);
 
-		code = xfs_trans_commit(tp, commit_flags, NULL);
+		code = xfs_trans_commit(tp, commit_flags);
 	}
 
 	/*
@@ -1176,7 +1176,7 @@ xfs_fsync(
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 		if (flag & FSYNC_WAIT)
 			xfs_trans_set_sync(tp);
-		error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);
+		error = _xfs_trans_commit(tp, 0, &log_flushed);
 
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
@@ -1221,7 +1221,7 @@ xfs_inactive_free_eofblocks(
 	 * Figure out if there are any blocks beyond the end
 	 * of the file.  If not, then there is nothing to do.
 	 */
-	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
 	map_len = last_fsb - end_fsb;
 	if (map_len <= 0)
@@ -1257,8 +1257,12 @@ xfs_inactive_free_eofblocks(
 		 * do that within a transaction.
 		 */
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
-				    ip->i_d.di_size);
+		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+				    ip->i_size);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return error;
+		}
 
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_ITRUNCATE_LOG_RES(mp),
@@ -1278,7 +1282,7 @@ xfs_inactive_free_eofblocks(
 		xfs_trans_ihold(tp, ip);
 
 		error = xfs_itruncate_finish(&tp, ip,
-					     ip->i_d.di_size,
+					     ip->i_size,
 					     XFS_DATA_FORK,
 					     0);
 		/*
@@ -1291,8 +1295,7 @@ xfs_inactive_free_eofblocks(
 					  XFS_TRANS_ABORT));
 		} else {
 			error = xfs_trans_commit(tp,
-						XFS_TRANS_RELEASE_LOG_RES,
-						NULL);
+						XFS_TRANS_RELEASE_LOG_RES);
 		}
 		xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	}
@@ -1406,7 +1409,7 @@ xfs_inactive_symlink_rmt(
 	 * we need to unlock the inode since the new transaction doesn't
 	 * have the inode attached.
 	 */
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 	tp = ntp;
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
@@ -1503,7 +1506,7 @@ xfs_inactive_attrs(
 	tp = *tpp;
 	mp = ip->i_mount;
 	ASSERT(ip->i_d.di_forkoff != 0);
-	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	error = xfs_attr_inactive(ip);
@@ -1565,7 +1568,7 @@ xfs_release(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
+		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
 		       ip->i_delayed_blks > 0)) &&
 		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
 		    (!(ip->i_d.di_flags &
@@ -1626,8 +1629,8 @@ xfs_inactive(
 	 * only one with a reference to the inode.
 	 */
 	truncate = ((ip->i_d.di_nlink == 0) &&
-            ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) ||
-             (ip->i_delayed_blks > 0)) &&
+	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
 	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
 
 	mp = ip->i_mount;
@@ -1645,7 +1648,7 @@ xfs_inactive(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
+                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
                        ip->i_delayed_blks > 0)) &&
 		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
 		     (!(ip->i_d.di_flags &
@@ -1675,7 +1678,11 @@ xfs_inactive(
 		 */
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
+		error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return VN_INACTIVE_CACHE;
+		}
 
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_ITRUNCATE_LOG_RES(mp),
@@ -1790,7 +1797,7 @@ xfs_inactive(
 		 * nothing we can do except to try to keep going.
 		 */
 		(void) xfs_bmap_finish(&tp,  &free_list, &committed);
-		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		(void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	}
 	/*
 	 * Release the dquots held by inode, if any.
@@ -1940,7 +1947,7 @@ xfs_create(
 		goto error_return;
 	}
 
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
@@ -2026,7 +2033,7 @@ xfs_create(
 		goto abort_rele;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		IRELE(ip);
 		tp = NULL;
@@ -2121,7 +2128,6 @@ int xfs_rm_attempts;
 STATIC int
 xfs_lock_dir_and_entry(
 	xfs_inode_t	*dp,
-	bhv_vname_t	*dentry,
 	xfs_inode_t	*ip)	/* inode of entry 'name' */
 {
 	int		attempts;
@@ -2135,7 +2141,7 @@ xfs_lock_dir_and_entry(
 	attempts = 0;
 
 again:
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 
 	e_inum = ip->i_ino;
 
@@ -2204,6 +2210,21 @@ int xfs_lock_delays;
 #endif
 
 /*
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with
+ * a different value
+ */
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
+{
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+		lock_mode |= (subclass + XFS_IOLOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+		lock_mode |= (subclass + XFS_ILOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+
+	return lock_mode;
+}
+
+/*
  * The following routine will lock n inodes in exclusive mode.
  * We assume the caller calls us with the inodes in i_ino order.
  *
@@ -2270,7 +2291,7 @@ again:
 			 * that is in the AIL.
 			 */
 			ASSERT(i != 0);
-			if (!xfs_ilock_nowait(ips[i], lock_mode)) {
+			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
 				attempts++;
 
 				/*
@@ -2305,7 +2326,7 @@ again:
 				goto again;
 			}
 		} else {
-			xfs_ilock(ips[i], lock_mode);
+			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
 		}
 	}
 
@@ -2440,7 +2461,7 @@ xfs_remove(
 		return error;
 	}
 
-	error = xfs_lock_dir_and_entry(dp, dentry, ip);
+	error = xfs_lock_dir_and_entry(dp, ip);
 	if (error) {
 		REMOVE_DEBUG_TRACE(__LINE__);
 		xfs_trans_cancel(tp, cancel_flags);
@@ -2511,7 +2532,7 @@ xfs_remove(
 		goto error_rele;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		IRELE(ip);
 		goto std_return;
@@ -2719,7 +2740,7 @@ xfs_link(
 		goto abort_return;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error)
 		goto std_return;
 
@@ -2839,7 +2860,7 @@ xfs_mkdir(
 		goto error_return;
 	}
 
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 
 	/*
 	 * Check for directory link count overflow.
@@ -2936,7 +2957,7 @@ xfs_mkdir(
 		goto error2;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 	if (error) {
@@ -3096,7 +3117,7 @@ xfs_rmdir(
 	 * that the directory entry for the child directory inode has
 	 * not changed while we were obtaining a log reservation.
 	 */
-	error = xfs_lock_dir_and_entry(dp, dentry, cdp);
+	error = xfs_lock_dir_and_entry(dp, cdp);
 	if (error) {
 		xfs_trans_cancel(tp, cancel_flags);
 		IRELE(cdp);
@@ -3190,7 +3211,7 @@ xfs_rmdir(
 		goto std_return;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error) {
 		IRELE(cdp);
 		goto std_return;
@@ -3393,7 +3414,7 @@ xfs_symlink(
 		goto error_return;
 	}
 
-	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 
 	/*
 	 * Check whether the directory allows new symlinks or not.
@@ -3535,7 +3556,7 @@ xfs_symlink(
 	if (error) {
 		goto error2;
 	}
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	XFS_QM_DQRELE(mp, udqp);
 	XFS_QM_DQRELE(mp, gdqp);
 
@@ -3790,7 +3811,7 @@ xfs_set_dmattrs (
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	IHOLD(ip);
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 
 	return error;
 }
@@ -4049,14 +4070,14 @@ xfs_alloc_file_space(
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
 	/*	Generate a DMAPI event if needed.	*/
-	if (alloc_type != 0 && offset < ip->i_d.di_size &&
+	if (alloc_type != 0 && offset < ip->i_size &&
 			(attr_flags&ATTR_DMI) == 0  &&
 			DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
 		xfs_off_t           end_dmi_offset;
 
 		end_dmi_offset = offset+len;
-		if (end_dmi_offset > ip->i_d.di_size)
-			end_dmi_offset = ip->i_d.di_size;
+		if (end_dmi_offset > ip->i_size)
+			end_dmi_offset = ip->i_size;
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
 			offset, end_dmi_offset - offset,
 			0, NULL);
@@ -4148,7 +4169,7 @@ retry:
 			goto error0;
 		}
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error) {
 			break;
@@ -4283,7 +4304,6 @@ xfs_free_file_space(
 	int			error;
 	xfs_fsblock_t		firstfsb;
 	xfs_bmap_free_t		free_list;
-	xfs_off_t		ilen;
 	xfs_bmbt_irec_t		imap;
 	xfs_off_t		ioffset;
 	xfs_extlen_t		mod=0;
@@ -4312,11 +4332,11 @@ xfs_free_file_space(
 	end_dmi_offset = offset + len;
 	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
 
-	if (offset < ip->i_d.di_size &&
+	if (offset < ip->i_size &&
 	    (attr_flags & ATTR_DMI) == 0 &&
 	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
-		if (end_dmi_offset > ip->i_d.di_size)
-			end_dmi_offset = ip->i_d.di_size;
+		if (end_dmi_offset > ip->i_size)
+			end_dmi_offset = ip->i_size;
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
 				offset, end_dmi_offset - offset,
 				AT_DELAY_FLAG(attr_flags), NULL);
@@ -4332,16 +4352,15 @@ xfs_free_file_space(
 	}
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
-	ilen = len + (offset & (rounding - 1));
 	ioffset = offset & ~(rounding - 1);
-	if (ilen & (rounding - 1))
-		ilen = (ilen + rounding) & ~(rounding - 1);
 
 	if (VN_CACHED(vp) != 0) {
 		xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
 				ctooff(offtoct(ioffset)), -1);
-		bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
+		error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
 				-1, FI_REMAPF_LOCKED);
+		if (error)
+			goto out_unlock_iolock;
 	}
 
 	/*
@@ -4455,7 +4474,7 @@ xfs_free_file_space(
 			goto error0;
 		}
 
-		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
@@ -4533,7 +4552,7 @@ xfs_change_file_space(
 		bf->l_start += offset;
 		break;
 	case 2: /*SEEK_END*/
-		bf->l_start += ip->i_d.di_size;
+		bf->l_start += ip->i_size;
 		break;
 	default:
 		return XFS_ERROR(EINVAL);
@@ -4550,7 +4569,7 @@ xfs_change_file_space(
 	bf->l_whence = 0;
 
 	startoffset = bf->l_start;
-	fsize = ip->i_d.di_size;
+	fsize = ip->i_size;
 
 	/*
 	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
@@ -4649,7 +4668,7 @@ xfs_change_file_space(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_set_sync(tp);
 
-	error = xfs_trans_commit(tp, 0, NULL);
+	error = xfs_trans_commit(tp, 0);
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);