From 4879b44829d94a1f8facf90cced3c5f23c5a8c62 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 19 Oct 2007 21:57:39 +0000
Subject: [CIFS] ACL support part 5

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c   | 23 +++++++++++++++++++++++
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/inode.c     |  6 ++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index e8e56353f5a1..e8083043a26c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -129,6 +129,29 @@ int compare_sids(struct cifs_sid *ctsid, struct cifs_sid *cwsid)
 	return (1); /* sids compare/match */
 }
 
+void get_mode_from_acl(struct inode * inode, const char * path)
+{
+	
+	if (inode == NULL)
+		return;
+
+	/* find an open readable handle
+	   if handle found
+		 lock handle 
+	   else open file
+	      if no open file can not hurt to check if path is null
+	   GetCIFSACL
+	   for all ACEs in ACL {
+		   if U or G or O
+			   inode->i_mode = parse_ace(file_type, UG or O, ace->perms, inode->i_mode)
+		   else continue
+	   }
+	   if handle open close it
+	   else unlock handle */
+
+	return;
+}
+
 
 static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1a883663b22d..7c445f8f233f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -92,7 +92,7 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
-
+extern void get_mode_from_acl(struct inode * inode, const char * search_path);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e8b388be3b6..9a5c0c925bab 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -527,6 +527,12 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		/* BB fill in uid and gid here? with help from winbind?
 		   or retrieve from NTFS stream extended attribute */
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+			cFYI(1, ("Getting mode bits from ACL"));
+			get_mode_from_acl(inode, search_path);
+		}
+#endif
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 			/* fill in uid, gid, mode from server ACL */
 			/* BB FIXME this should also take into account the
-- 
cgit v1.2.3


From c94897790e7c67dcfe3a0b6f035996398c268313 Mon Sep 17 00:00:00 2001
From: Parag Warudkar <kernel-stuff@comcast.net>
Date: Tue, 23 Oct 2007 18:09:48 +0000
Subject: [CIFS] remove unused function compile warning when experimental off

Get rid of a couple of unused-function warnings which show up when
CONFIG_CIFS_EXPERIMENTAL is not defined, by wrapping the functions in
#ifdef CONFIG_CIFS_EXPERIMENTAL. Patch against current git.

Signed-off-by: Parag Warudkar <kernel-stuff@comcast.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f0d9a485d095..61d24f6ee64e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2486,6 +2486,7 @@ querySymLinkRetry:
 	return rc;
 }
 
+#ifdef CONFIG_CIFS_EXPERIMENTAL
 /* Initialize NT TRANSACT SMB into small smb request buffer.
    This assumes that all NT TRANSACTS that we init here have
    total parm and data under about 400 bytes (to fit in small cifs
@@ -2569,6 +2570,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 	}
 	return 0;
 }
+#endif /* CIFS_EXPERIMENTAL */
 
 int
 CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
-- 
cgit v1.2.3


From 44093ca2fef3c52dc7d186116862d74f9a676e0f Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 23 Oct 2007 21:22:55 +0000
Subject: [CIFS] acl support part 6

CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 79 +++++++++++++++++--------------------------------------
 fs/cifs/cifsacl.h | 12 +++------
 fs/cifs/dir.c     |  2 +-
 3 files changed, 28 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index e8083043a26c..154cb8449b9b 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -38,8 +38,8 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
 	{{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
 	{{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
 	{{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
-	{{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"}
-};
+	{{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
+;
 
 
 /* security id for everyone */
@@ -131,6 +131,8 @@ int compare_sids(struct cifs_sid *ctsid, struct cifs_sid *cwsid)
 
 void get_mode_from_acl(struct inode * inode, const char * path)
 {
+
+	cFYI(1, ("get mode from ACL for %s", path));
 	
 	if (inode == NULL)
 		return;
@@ -159,50 +161,36 @@ static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 
 	/* validate that we do not go past end of acl */
 
-	/* XXX this if statement can be removed
-	if (end_of_acl < (char *)pace + sizeof(struct cifs_ace)) {
+	if (le16_to_cpu(pace->size) < 16) {
+		cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size)));
+		return;
+	}
+
+	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
 		cERROR(1, ("ACL too small to parse ACE"));
 		return;
-	} */
+	}
 
-	num_subauth = pace->num_subauth;
+	num_subauth = pace->sid.num_subauth;
 	if (num_subauth) {
 #ifdef CONFIG_CIFS_DEBUG2
 		int i;
-		cFYI(1, ("ACE revision %d num_subauth %d",
-			pace->revision, pace->num_subauth));
+		cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
+			pace->sid.revision, pace->sid.num_subauth, pace->type,
+			pace->flags, pace->size));
 		for (i = 0; i < num_subauth; ++i) {
 			cFYI(1, ("ACE sub_auth[%d]: 0x%x", i,
-				le32_to_cpu(pace->sub_auth[i])));
+				le32_to_cpu(pace->sid.sub_auth[i])));
 		}
 
 		/* BB add length check to make sure that we do not have huge
 			num auths and therefore go off the end */
-
-		cFYI(1, ("RID %d", le32_to_cpu(pace->sub_auth[num_subauth-1])));
 #endif
 	}
 
 	return;
 }
 
-static void parse_ntace(struct cifs_ntace *pntace, char *end_of_acl)
-{
-	/* validate that we do not go past end of acl */
-	if (end_of_acl < (char *)pntace + sizeof(struct cifs_ntace)) {
-		cERROR(1, ("ACL too small to parse NT ACE"));
-		return;
-	}
-
-#ifdef CONFIG_CIFS_DEBUG2
-	cFYI(1, ("NTACE type %d flags 0x%x size %d, access Req 0x%x",
-		pntace->type, pntace->flags, pntace->size,
-		pntace->access_req));
-#endif
-	return;
-}
-
-
 
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid)
@@ -211,7 +199,6 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 	int num_aces = 0;
 	int acl_size;
 	char *acl_base;
-	struct cifs_ntace **ppntace;
 	struct cifs_ace **ppace;
 
 	/* BB need to add parm so we can store the SID BB */
@@ -233,45 +220,27 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 	num_aces = le32_to_cpu(pdacl->num_aces);
 	if (num_aces  > 0) {
-		ppntace = kmalloc(num_aces * sizeof(struct cifs_ntace *),
-				GFP_KERNEL);
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
 
 /*		cifscred->cecount = pdacl->num_aces;
-		cifscred->ntaces = kmalloc(num_aces *
-			sizeof(struct cifs_ntace *), GFP_KERNEL);
 		cifscred->aces = kmalloc(num_aces *
 			sizeof(struct cifs_ace *), GFP_KERNEL);*/
 
 		for (i = 0; i < num_aces; ++i) {
-			ppntace[i] = (struct cifs_ntace *)
-					(acl_base + acl_size);
-			ppace[i] = (struct cifs_ace *) ((char *)ppntace[i] +
-					sizeof(struct cifs_ntace));
-
-			parse_ntace(ppntace[i], end_of_acl);
-			if (end_of_acl < ((char *)ppace[i] +
-					(le16_to_cpu(ppntace[i]->size) -
-					sizeof(struct cifs_ntace)))) {
-				cERROR(1, ("ACL too small to parse ACE"));
-				break;
-			} else
-				parse_ace(ppace[i], end_of_acl);
+			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
+
+			parse_ace(ppace[i], end_of_acl);
 
-/*			memcpy((void *)(&(cifscred->ntaces[i])),
-				(void *)ppntace[i],
-				sizeof(struct cifs_ntace));
-			memcpy((void *)(&(cifscred->aces[i])),
+/*			memcpy((void *)(&(cifscred->aces[i])),
 				(void *)ppace[i],
 				sizeof(struct cifs_ace)); */
 
-			acl_base = (char *)ppntace[i];
-			acl_size = le16_to_cpu(ppntace[i]->size);
+			acl_base = (char *)ppace[i];
+			acl_size = le16_to_cpu(ppace[i]->size);
 		}
 
 		kfree(ppace);
-		kfree(ppntace);
 	}
 
 	return;
@@ -292,8 +261,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 	if (psid->num_subauth) {
 #ifdef CONFIG_CIFS_DEBUG2
 		int i;
-		cFYI(1, ("SID revision %d num_auth %d First subauth 0x%x",
-			psid->revision, psid->num_subauth, psid->sub_auth[0]));
+		cFYI(1, ("SID revision %d num_auth %d",
+			psid->revision, psid->num_subauth));
 
 		for (i = 0; i < psid->num_subauth; i++) {
 			cFYI(1, ("SID sub_auth[%d]: 0x%x ", i,
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 420f87813647..06d52006bf26 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -48,7 +48,7 @@ struct cifs_sid {
 	__u8 revision; /* revision level */
 	__u8 num_subauth;
 	__u8 authority[6];
-	__le32 sub_auth[5]; /* sub_auth[num_subauth] */ /* BB FIXME endianness BB */
+	__le32 sub_auth[5]; /* sub_auth[num_subauth] */
 } __attribute__((packed));
 
 struct cifs_acl {
@@ -57,18 +57,12 @@ struct cifs_acl {
 	__le32 num_aces;
 } __attribute__((packed));
 
-struct cifs_ntace { /* first part of ACE which contains perms */
+struct cifs_ace {
 	__u8 type;
 	__u8 flags;
 	__le16 size;
 	__le32 access_req;
-} __attribute__((packed));
-
-struct cifs_ace { /* last part of ACE which includes user info */
-	__u8 revision; /* revision level */
-	__u8 num_subauth;
-	__u8 authority[6];
-	__le32 sub_auth[5];
+	struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
 } __attribute__((packed));
 
 struct cifs_wksid {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 793404b10925..37dc97af1487 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -593,7 +593,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
 		 * case take precedence.  If a is not a negative dentry, this
 		 * should have no side effects
 		 */
-		memcpy((unsigned char *)a->name, b->name, a->len);
+		memcpy(a->name, b->name, a->len);
 		return 0;
 	}
 	return 1;
-- 
cgit v1.2.3


From 630f3f0c45a80ab907d216191ef4a205c249fa1b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 25 Oct 2007 21:17:17 +0000
Subject: [CIFS] acl support part 6

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
CC: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c    | 91 ++++++++++++++++++++++++++++++++++++++++------------
 fs/cifs/cifsproto.h  |  9 +++---
 fs/cifs/cifssmb.c    | 55 ++++++++++++++++++++++---------
 fs/cifs/file.c       | 31 ++++++++++++++++++
 fs/cifs/inode.c      |  2 +-
 fs/cifs/md5.c        |  8 ++---
 fs/cifs/misc.c       | 10 +++---
 fs/cifs/netmisc.c    | 12 +++----
 fs/cifs/smbencrypt.c |  4 +--
 fs/cifs/xattr.c      |  7 ++--
 10 files changed, 169 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 154cb8449b9b..14200bd45b30 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -97,7 +97,7 @@ int match_sid(struct cifs_sid *ctsid)
 
 /* if the two SIDs (roughly equivalent to a UUID for a user or group) are
    the same returns 1, if they do not match returns 0 */
-int compare_sids(struct cifs_sid *ctsid, struct cifs_sid *cwsid)
+int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 {
 	int i;
 	int num_subauth, num_sat, num_saw;
@@ -129,28 +129,77 @@ int compare_sids(struct cifs_sid *ctsid, struct cifs_sid *cwsid)
 	return (1); /* sids compare/match */
 }
 
-void get_mode_from_acl(struct inode * inode, const char * path)
+/*
+   change posix mode to reflect permissions
+   pmode is the existing mode (we only want to overwrite part of this
+   bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007
+*/
+static void access_flags_to_mode(__u32 access_flags, umode_t * pmode,
+				 umode_t bits_to_set)
+{
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("access flags 0x%x mode now 0x%x", access_flags, *pmode);
+#endif
+
+	return;
+}
+
+/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
+
+void acl_to_uid_mode(struct inode *inode, const char *path)
 {
+	struct cifsFileInfo *open_file;
+	int unlock_file = FALSE;
+	int xid;
+	int rc = -EIO;
+	__u16 fid;
+	struct super_block *sb;
+	struct cifs_sb_info *cifs_sb;
 
 	cFYI(1, ("get mode from ACL for %s", path));
 	
 	if (inode == NULL)
 		return;
 
-	/* find an open readable handle
-	   if handle found
-		 lock handle 
-	   else open file
-	      if no open file can not hurt to check if path is null
-	   GetCIFSACL
-	   for all ACEs in ACL {
-		   if U or G or O
-			   inode->i_mode = parse_ace(file_type, UG or O, ace->perms, inode->i_mode)
-		   else continue
-	   }
-	   if handle open close it
-	   else unlock handle */
+	xid = GetXid();
+	open_file = find_readable_file(CIFS_I(inode));
+	if (open_file) {
+		unlock_file = TRUE;
+		fid = open_file->netfid;
+	} else {
+		int oplock = FALSE;
+		/* open file */
+		sb = inode->i_sb;
+		if (sb == NULL) {
+			FreeXid(xid);
+			return;
+		}
+		cifs_sb = CIFS_SB(sb);
+		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
+				GENERIC_READ, 0, &fid, &oplock, NULL,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc != 0) {
+			cERROR(1, ("Unable to open file to get ACL"));
+			FreeXid(xid);
+			return;
+		}
+	}
+
+	/*   rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, pntsd, acllen,
+				    ACL_TYPE_ACCESS); */
+
+	if (unlock_file == TRUE)
+		atomic_dec(&open_file->wrtPending);
+	else
+		CIFSSMBClose(xid, cifs_sb->tcon, fid);
+
+/* parse ACEs e.g.
+	rc = parse_sec_desc(pntsd, acllen, inode);
+*/
 
+	FreeXid(xid);
 	return;
 }
 
@@ -193,7 +242,8 @@ static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 
 
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
-		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid)
+		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid
+		       struct inode *inode)
 {
 	int i;
 	int num_aces = 0;
@@ -281,7 +331,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 
 
 /* Convert CIFS ACL to POSIX form */
-int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len)
+static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
+			  struct inode *inode)
 {
 	int rc;
 	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
@@ -310,14 +361,14 @@ int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len)
 	if (rc)
 		return rc;
 
-	parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr);
+	parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, inode);
 
 /*	cifscred->uid = owner_sid_ptr->rid;
 	cifscred->gid = group_sid_ptr->rid;
 	memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr,
-			sizeof (struct cifs_sid));
+			sizeof(struct cifs_sid));
 	memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
-			sizeof (struct cifs_sid)); */
+			sizeof(struct cifs_sid)); */
 
 
 	return (0);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7c445f8f233f..88c02ac97c3f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -61,6 +61,9 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
+#endif
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -92,7 +95,7 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
-extern void get_mode_from_acl(struct inode * inode, const char * search_path);
+extern void acl_to_uid_mode(struct inode *inode, const char *search_path);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
@@ -311,7 +314,6 @@ extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key);
 #endif /* CIFS_WEAK_PW_HASH */
-extern int parse_sec_desc(struct cifs_ntsd *, int);
 extern int CIFSSMBCopy(int xid,
 			struct cifsTconInfo *source_tcon,
 			const char *fromName,
@@ -336,8 +338,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
 		const void *ea_value, const __u16 ea_value_len,
 		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
-			__u16 fid, char *acl_inf, const int buflen,
-			const int acl_type /* ACCESS vs. DEFAULT */);
+			__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
 extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
 		const unsigned char *searchName,
 		char *acl_inf, const int buflen, const int acl_type,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 61d24f6ee64e..cc17e98991f3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2526,12 +2526,15 @@ smb_init_ntransact(const __u16 sub_command, const int setup_count,
 
 static int
 validate_ntransact(char *buf, char **ppparm, char **ppdata,
-		   int *pdatalen, int *pparmlen)
+		   __u32 *pdatalen, __u32 *pparmlen)
 {
 	char *end_of_smb;
 	__u32 data_count, data_offset, parm_count, parm_offset;
 	struct smb_com_ntransact_rsp *pSMBr;
 
+	*pdatalen = 0;
+	*pparmlen = 0;
+
 	if (buf == NULL)
 		return -EINVAL;
 
@@ -2568,6 +2571,8 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 		cFYI(1, ("parm count and data count larger than SMB"));
 		return -EINVAL;
 	}
+	*pdatalen = data_count;
+	*pparmlen = parm_count;
 	return 0;
 }
 #endif /* CIFS_EXPERIMENTAL */
@@ -3069,8 +3074,7 @@ GetExtAttrOut:
 /* Get Security Descriptor (by handle) from remote server for a file or dir */
 int
 CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
-		/* BB fix up return info */ char *acl_inf, const int buflen,
-		  const int acl_type)
+		  struct cifs_ntsd **acl_inf, __u32 *pbuflen)
 {
 	int rc = 0;
 	int buf_type = 0;
@@ -3079,6 +3083,9 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 
 	cFYI(1, ("GetCifsACL"));
 
+	*pbuflen = 0;
+	*acl_inf = NULL;
+
 	rc = smb_init_ntransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
 			8 /* parm len */, tcon, (void **) &pSMB);
 	if (rc)
@@ -3101,34 +3108,52 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	if (rc) {
 		cFYI(1, ("Send error in QuerySecDesc = %d", rc));
 	} else {                /* decode response */
-		struct cifs_ntsd *psec_desc;
 		__le32 * parm;
-		int parm_len;
-		int data_len;
-		int acl_len;
+		__u32 parm_len;
+		__u32 acl_len;
 		struct smb_com_ntransact_rsp *pSMBr;
+		char *pdata;
 
 /* validate_nttransact */
 		rc = validate_ntransact(iov[0].iov_base, (char **)&parm,
-					(char **)&psec_desc,
-					&parm_len, &data_len);
+					&pdata, &parm_len, pbuflen);
 		if (rc)
 			goto qsec_out;
 		pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
 
-		cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, psec_desc));
+		cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf));
 
 		if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
 			rc = -EIO;      /* bad smb */
+			*pbuflen = 0;
 			goto qsec_out;
 		}
 
 /* BB check that data area is minimum length and as big as acl_len */
 
 		acl_len = le32_to_cpu(*parm);
-		/* BB check if (acl_len > bufsize) */
+		if (acl_len != *pbuflen) {
+			cERROR(1, ("acl length %d does not match %d",
+				   acl_len, *pbuflen));
+			if (*pbuflen > acl_len)
+				*pbuflen = acl_len;
+		}
 
-		parse_sec_desc(psec_desc, acl_len);
+		/* check if buffer is big enough for the acl
+		   header followed by the smallest SID */
+		if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
+		    (*pbuflen >= 64 * 1024)) {
+			cERROR(1, ("bad acl length %d", *pbuflen));
+			rc = -EINVAL;
+			*pbuflen = 0;
+		} else {
+			*acl_inf = kmalloc(*pbuflen, GFP_KERNEL);
+			if (*acl_inf == NULL) {
+				*pbuflen = 0;
+				rc = -ENOMEM;
+			}
+			memcpy(*acl_inf, pdata, *pbuflen);
+		}
 	}
 qsec_out:
 	if (buf_type == CIFS_SMALL_BUFFER)
@@ -3383,7 +3408,7 @@ UnixQPathInfoRetry:
 			memcpy((char *) pFindData,
 			       (char *) &pSMBr->hdr.Protocol +
 			       data_offset,
-			       sizeof (FILE_UNIX_BASIC_INFO));
+			       sizeof(FILE_UNIX_BASIC_INFO));
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -3651,7 +3676,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 	pSMB->SubCommand = cpu_to_le16(TRANS2_FIND_NEXT);
 	pSMB->SearchHandle = searchHandle;      /* always kept as le */
 	pSMB->SearchCount =
-		cpu_to_le16(CIFSMaxBufSize / sizeof (FILE_UNIX_INFO));
+		cpu_to_le16(CIFSMaxBufSize / sizeof(FILE_UNIX_INFO));
 	pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
 	pSMB->ResumeKey = psrch_inf->resume_key;
 	pSMB->SearchFlags =
@@ -4333,7 +4358,7 @@ QFSDeviceRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < sizeof (FILE_SYSTEM_DEVICE_INFO)))
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO)))
 			rc = -EIO;	/* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1e7e4c06d9e3..68ad4ca0cfa3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1026,6 +1026,37 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	return total_written;
 }
 
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
+{
+	struct cifsFileInfo *open_file = NULL;
+
+	read_lock(&GlobalSMBSeslock);
+	/* we could simply get the first_list_entry since write-only entries
+	   are always at the end of the list but since the first entry might
+	   have a close pending, we go through the whole list */
+	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+		if (open_file->closePend)
+			continue;
+		if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) ||
+		    (open_file->pfile->f_flags & O_RDONLY))) {
+			if (!open_file->invalidHandle) {
+				/* found a good file */
+				/* lock it so it will not be closed on us */
+				atomic_inc(&open_file->wrtPending);
+				read_unlock(&GlobalSMBSeslock);
+				return open_file;
+			} /* else might as well continue, and look for
+			     another, or simply have the caller reopen it
+			     again rather than trying to fix this handle */
+		} else /* write only file */
+			break; /* write only files are last so must be done */
+	}
+	read_unlock(&GlobalSMBSeslock);
+	return NULL;
+}
+#endif
+
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9a5c0c925bab..9be0bbd20dfd 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -530,7 +530,7 @@ int cifs_get_inode_info(struct inode **pinode,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 			cFYI(1, ("Getting mode bits from ACL"));
-			get_mode_from_acl(inode, search_path);
+			acl_to_uid_mode(inode, search_path);
 		}
 #endif
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index e5c3e1212697..f13f96d42fcf 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -276,8 +276,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 	}
 
 	/* start out by storing key in pads */
-	memset(ctx->k_ipad, 0, sizeof (ctx->k_ipad));
-	memset(ctx->k_opad, 0, sizeof (ctx->k_opad));
+	memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
+	memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
 	memcpy(ctx->k_ipad, key, key_len);
 	memcpy(ctx->k_opad, key, key_len);
 
@@ -307,8 +307,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
 	}
 
 	/* start out by storing key in pads */
-	memset(ctx->k_ipad, 0, sizeof (ctx->k_ipad));
-	memset(ctx->k_opad, 0, sizeof (ctx->k_opad));
+	memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
+	memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
 	memcpy(ctx->k_ipad, key, key_len);
 	memcpy(ctx->k_opad, key, key_len);
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 51ec681fe74a..15546c2354c5 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -73,7 +73,7 @@ sesInfoAlloc(void)
 {
 	struct cifsSesInfo *ret_buf;
 
-	ret_buf = kzalloc(sizeof (struct cifsSesInfo), GFP_KERNEL);
+	ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
 	if (ret_buf) {
 		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&sesInfoAllocCount);
@@ -109,7 +109,7 @@ struct cifsTconInfo *
 tconInfoAlloc(void)
 {
 	struct cifsTconInfo *ret_buf;
-	ret_buf = kzalloc(sizeof (struct cifsTconInfo), GFP_KERNEL);
+	ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
 	if (ret_buf) {
 		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&tconInfoAllocCount);
@@ -298,7 +298,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
 
 	buffer->smb_buf_length =
-	    (2 * word_count) + sizeof (struct smb_hdr) -
+	    (2 * word_count) + sizeof(struct smb_hdr) -
 	    4 /*  RFC 1001 length field does not count */  +
 	    2 /* for bcc field itself */ ;
 	/* Note that this is the only network field that has to be converted
@@ -422,8 +422,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 	__u32 clc_len;  /* calculated length */
 	cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
 
-	if (length < 2 + sizeof (struct smb_hdr)) {
-		if ((length >= sizeof (struct smb_hdr) - 1)
+	if (length < 2 + sizeof(struct smb_hdr)) {
+		if ((length >= sizeof(struct smb_hdr) - 1)
 			    && (smb->Status.CifsError != 0)) {
 			smb->WordCount = 0;
 			/* some error cases do not return wct and bcc */
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index f06359cb22ee..4d35c034755a 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -793,8 +793,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	if (smberrclass == ERRDOS) {  /* 1 byte field no need to byte reverse */
 		for (i = 0;
 		     i <
-		     sizeof (mapping_table_ERRDOS) /
-		     sizeof (struct smb_to_posix_error); i++) {
+		     sizeof(mapping_table_ERRDOS) /
+		     sizeof(struct smb_to_posix_error); i++) {
 			if (mapping_table_ERRDOS[i].smb_err == 0)
 				break;
 			else if (mapping_table_ERRDOS[i].smb_err ==
@@ -807,8 +807,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	} else if (smberrclass == ERRSRV) {   /* server class of error codes */
 		for (i = 0;
 		     i <
-		     sizeof (mapping_table_ERRSRV) /
-		     sizeof (struct smb_to_posix_error); i++) {
+		     sizeof(mapping_table_ERRSRV) /
+		     sizeof(struct smb_to_posix_error); i++) {
 			if (mapping_table_ERRSRV[i].smb_err == 0)
 				break;
 			else if (mapping_table_ERRSRV[i].smb_err ==
@@ -837,14 +837,14 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 unsigned int
 smbCalcSize(struct smb_hdr *ptr)
 {
-	return (sizeof (struct smb_hdr) + (2 * ptr->WordCount) +
+	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
 		2 /* size of the bcc field */ + BCC(ptr));
 }
 
 unsigned int
 smbCalcSize_LE(struct smb_hdr *ptr)
 {
-	return (sizeof (struct smb_hdr) + (2 * ptr->WordCount) +
+	return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
 		2 /* size of the bcc field */ + le16_to_cpu(BCC_LE(ptr)));
 }
 
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 90542a39be17..bd3c4674f2ba 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -135,7 +135,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 
 	wpwd[len] = 0;	/* Ensure string is null terminated */
 	/* Calculate length in bytes */
-	len = _my_wcslen(wpwd) * sizeof (__u16);
+	len = _my_wcslen(wpwd) * sizeof(__u16);
 
 	mdfour(p16, (unsigned char *) wpwd, len);
 	memset(wpwd, 0, 129 * 2);
@@ -167,7 +167,7 @@ nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 	E_P16((unsigned char *) passwd, (unsigned char *) p16);
 
 	/* clear out local copy of user's password (just being paranoid). */
-	memset(passwd, '\0', sizeof (passwd));
+	memset(passwd, '\0', sizeof(passwd));
 }
 #endif
 
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 369e838bebd3..12b125ff0bd0 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -265,6 +265,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		else if(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 			__u16 fid;
 			int oplock = FALSE;
+			struct cifs_ntsd *pacl = NULL;
+			__u32 buflen = 0;
 			if (experimEnabled) 
 				rc = CIFSSMBOpen(xid, pTcon, full_path,
 					FILE_OPEN, GENERIC_READ, 0, &fid,
@@ -274,9 +276,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			/* else rc is EOPNOTSUPP from above */
 
 			if(rc == 0) {
-				rc = CIFSSMBGetCIFSACL(xid, pTcon, fid,
-					ea_value, buf_size,
-					ACL_TYPE_ACCESS);
+				rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
+						      &buflen);
 				CIFSSMBClose(xid, pTcon, fid);
 			}
 		}
-- 
cgit v1.2.3


From d61e5808d9a4e7c7f25914ceae50664a6454c3ca Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 26 Oct 2007 04:32:43 +0000
Subject: [CIFS] acl support part 7

Also fixes typo, build break

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 21 ++++++++++++++++++---
 fs/cifs/cifsacl.h |  2 +-
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 14200bd45b30..3a2d67b182d4 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -134,14 +134,29 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
    pmode is the existing mode (we only want to overwrite part of this
    bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007
 */
-static void access_flags_to_mode(__u32 access_flags, umode_t * pmode,
+static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
 				 umode_t bits_to_set)
 {
 
+	*pmode &= ~bits_to_set;
+
+	if (ace_flags & GENERIC_ALL) {
+		*pmode |= (S_IRWXUGO & bits_to_set);
 #ifdef CONFIG_CIFS_DEBUG2
-	cFYI(1, ("access flags 0x%x mode now 0x%x", access_flags, *pmode);
+		cFYI(1, ("all perms"));
 #endif
+		return;
+	}
+	if ((ace_flags & GENERIC_WRITE) || (ace_flags & FILE_WRITE_RIGHTS))
+		*pmode |= (S_IWUGO & bits_to_set);
+	if ((ace_flags & GENERIC_READ) || (ace_flags & FILE_READ_RIGHTS))
+		*pmode |= (S_IRUGO & bits_to_set);
+	if ((ace_flags & GENERIC_EXECUTE) || (ace_flags & FILE_EXEC_RIGHTS))
+		*pmode |= (S_IXUGO & bits_to_set);
 
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("access flags 0x%x mode now 0x%x", ace_flags, *pmode);
+#endif
 	return;
 }
 
@@ -242,7 +257,7 @@ static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 
 
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
-		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid
+		       struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
 		       struct inode *inode)
 {
 	int i;
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 06d52006bf26..30b0caf66786 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -73,7 +73,7 @@ struct cifs_wksid {
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 
 extern int match_sid(struct cifs_sid *);
-extern int compare_sids(struct cifs_sid *, struct cifs_sid *);
+extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
 
 #endif /*  CONFIG_CIFS_EXPERIMENTAL */
 
-- 
cgit v1.2.3


From b9c7a2bb1e57f571d3b0763bdce1ce15510a7b78 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 26 Oct 2007 23:40:20 +0000
Subject: [CIFS] ACL support part 8

GetACL is now called in the getinodeinfo path when the cifsacl mount
option is used, and the ACL is parsed for SIDs.  Only one piece is
still missing before the mode can be retrieved.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 133 ++++++++++++++++++++++++++++--------------------------
 fs/cifs/cifssmb.c |   6 +--
 2 files changed, 73 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 3a2d67b182d4..cad2da3a447d 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -155,69 +155,11 @@ static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
 		*pmode |= (S_IXUGO & bits_to_set);
 
 #ifdef CONFIG_CIFS_DEBUG2
-	cFYI(1, ("access flags 0x%x mode now 0x%x", ace_flags, *pmode);
+	cFYI(1, ("access flags 0x%x mode now 0x%x", ace_flags, *pmode));
 #endif
 	return;
 }
 
-/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-
-void acl_to_uid_mode(struct inode *inode, const char *path)
-{
-	struct cifsFileInfo *open_file;
-	int unlock_file = FALSE;
-	int xid;
-	int rc = -EIO;
-	__u16 fid;
-	struct super_block *sb;
-	struct cifs_sb_info *cifs_sb;
-
-	cFYI(1, ("get mode from ACL for %s", path));
-	
-	if (inode == NULL)
-		return;
-
-	xid = GetXid();
-	open_file = find_readable_file(CIFS_I(inode));
-	if (open_file) {
-		unlock_file = TRUE;
-		fid = open_file->netfid;
-	} else {
-		int oplock = FALSE;
-		/* open file */
-		sb = inode->i_sb;
-		if (sb == NULL) {
-			FreeXid(xid);
-			return;
-		}
-		cifs_sb = CIFS_SB(sb);
-		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-				GENERIC_READ, 0, &fid, &oplock, NULL,
-				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != 0) {
-			cERROR(1, ("Unable to open file to get ACL"));
-			FreeXid(xid);
-			return;
-		}
-	}
-
-	/*   rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, pntsd, acllen,
-				    ACL_TYPE_ACCESS); */
-
-	if (unlock_file == TRUE)
-		atomic_dec(&open_file->wrtPending);
-	else
-		CIFSSMBClose(xid, cifs_sb->tcon, fid);
-
-/* parse ACEs e.g.
-	rc = parse_sec_desc(pntsd, acllen, inode);
-*/
-
-	FreeXid(xid);
-	return;
-}
-
 
 static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 {
@@ -314,12 +256,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 {
-
 	/* BB need to add parm so we can store the SID BB */
 
-	/* validate that we do not go past end of acl */
-	if (end_of_acl < (char *)psid + sizeof(struct cifs_sid)) {
-		cERROR(1, ("ACL too small to parse SID"));
+	/* validate that we do not go past end of ACL - sid must be at least 8
+	   bytes long (assuming no sub-auths - e.g. the null SID */
+	if (end_of_acl < (char *)psid + 8) {
+		cERROR(1, ("ACL too small to parse SID %p", psid));
 		return -EINVAL;
 	}
 
@@ -354,6 +296,9 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 	struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
 	char *end_of_acl = ((char *)pntsd) + acl_len;
 
+	if ((inode == NULL) || (pntsd == NULL))
+		return -EIO;
+
 	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
 				le32_to_cpu(pntsd->osidoffset));
 	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
@@ -368,6 +313,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 		 le32_to_cpu(pntsd->sacloffset),
 		 le32_to_cpu(pntsd->dacloffset)));
 #endif
+/*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
 	rc = parse_sid(owner_sid_ptr, end_of_acl);
 	if (rc)
 		return rc;
@@ -388,4 +334,65 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 
 	return (0);
 }
+
+
+/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
+
+void acl_to_uid_mode(struct inode *inode, const char *path)
+{
+	struct cifsFileInfo *open_file;
+	int unlock_file = FALSE;
+	int xid;
+	int rc = -EIO;
+	__u16 fid;
+	struct super_block *sb;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_ntsd *pntsd = NULL;
+	__u32 acllen;
+
+	cFYI(1, ("get mode from ACL for %s", path));
+
+	if (inode == NULL)
+		return;
+
+	xid = GetXid();
+	open_file = find_readable_file(CIFS_I(inode));
+	sb = inode->i_sb;
+	if (sb == NULL) {
+		FreeXid(xid);
+		return;
+	}
+	cifs_sb = CIFS_SB(sb);
+
+	if (open_file) {
+		unlock_file = TRUE;
+		fid = open_file->netfid;
+	} else {
+		int oplock = FALSE;
+		/* open file */
+		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
+				GENERIC_READ, 0, &fid, &oplock, NULL,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc != 0) {
+			cERROR(1, ("Unable to open file to get ACL"));
+			FreeXid(xid);
+			return;
+		}
+	}
+
+	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, &acllen);
+	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, acllen));
+	if (unlock_file == TRUE)
+		atomic_dec(&open_file->wrtPending);
+	else
+		CIFSSMBClose(xid, cifs_sb->tcon, fid);
+
+	/* parse ACEs */
+	if (!rc)
+		rc = parse_sec_desc(pntsd, acllen, inode);
+	kfree(pntsd);
+	FreeXid(xid);
+	return;
+}
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index cc17e98991f3..0bb3e431ee01 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2495,7 +2495,7 @@ querySymLinkRetry:
 	MaxSetupCount (size of returned setup area) and
 	MaxParameterCount (returned parms size) must be set by caller */
 static int
-smb_init_ntransact(const __u16 sub_command, const int setup_count,
+smb_init_nttransact(const __u16 sub_command, const int setup_count,
 		   const int parm_len, struct cifsTconInfo *tcon,
 		   void **ret_buf)
 {
@@ -2526,7 +2526,7 @@ smb_init_ntransact(const __u16 sub_command, const int setup_count,
 
 static int
 validate_ntransact(char *buf, char **ppparm, char **ppdata,
-		   __u32 *pdatalen, __u32 *pparmlen)
+		   __u32 *pparmlen, __u32 *pdatalen)
 {
 	char *end_of_smb;
 	__u32 data_count, data_offset, parm_count, parm_offset;
@@ -3086,7 +3086,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	*pbuflen = 0;
 	*acl_inf = NULL;
 
-	rc = smb_init_ntransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
+	rc = smb_init_nttransact(NT_TRANSACT_QUERY_SECURITY_DESC, 0,
 			8 /* parm len */, tcon, (void **) &pSMB);
 	if (rc)
 		return rc;
-- 
cgit v1.2.3


From 73a2bcb0edb9ffb0b007b3546b430e2c6e415eee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 29 Oct 2007 21:18:11 +0100
Subject: sched: keep utime/stime monotonic

keep utime/stime monotonic.

cpustats use utime/stime as a ratio against sum_exec_runtime; as a
consequence it can happen - when the ratio changes faster than time
accumulates - that either can appear to go backwards.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 63c95afb561f..d80baaabf835 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -358,7 +358,8 @@ static cputime_t task_utime(struct task_struct *p)
 	}
 	utime = (clock_t)temp;
 
-	return clock_t_to_cputime(utime);
+	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	return p->prev_utime;
 }
 
 static cputime_t task_stime(struct task_struct *p)
-- 
cgit v1.2.3


From 9301899be75b464ef097f0b5af7af6d9bd8f68a7 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Tue, 30 Oct 2007 00:26:32 +0100
Subject: sched: fix /proc/<PID>/stat stime/utime monotonicity, part 2

Extend Peter's patch to fix accounting issues, by keeping stime
monotonic too.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Frans Pop <elendil@planet.nl>
---
 fs/proc/array.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index d80baaabf835..eba339ecba27 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -374,7 +374,8 @@ static cputime_t task_stime(struct task_struct *p)
 	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
 			cputime_to_clock_t(task_utime(p));
 
-	return clock_t_to_cputime(stime);
+	p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+	return p->prev_stime;
 }
 #endif
 
-- 
cgit v1.2.3


From e01b64001359034d04c695388870936ed3d1b56b Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishp@us.ibm.com>
Date: Tue, 30 Oct 2007 04:45:14 +0000
Subject: [CIFS] enable get mode from ACL when cifsacl mount option specified

Part 9 of ACL patch series.  getting mode from ACL now works in
some cases (and requires CIFS_EXPERIMENTAL config option).

Signed-off-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  4 +++-
 fs/cifs/cifsacl.c | 28 +++++++++++++++++++++-------
 2 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 3d419163c3d3..c65c9da863f3 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -12,7 +12,9 @@ leak that causes cifsd not to stop and rmmod to fail to cleanup
 cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
 bigendian architectures. Fix possible memory corruption when
 EAGAIN returned on kern_recvmsg. Return better error if server
-requires packet signing but client has disabled it.
+requires packet signing but client has disabled it. When mounted
+with cifsacl mount option - mode bits are approximated based
+on the contents of the files ACL.
 
 Version 1.50
 ------------
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index cad2da3a447d..629b96c21639 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -43,8 +43,8 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
 
 
 /* security id for everyone */
-static const struct cifs_sid sid_everyone =
-		{1, 1, {0, 0, 0, 0, 0, 0}, {} };
+static const struct cifs_sid sid_everyone = {
+	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
 /* group users */
 static const struct cifs_sid sid_user =
 		{1, 2 , {0, 0, 0, 0, 0, 5}, {} };
@@ -138,8 +138,6 @@ static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
 				 umode_t bits_to_set)
 {
 
-	*pmode &= ~bits_to_set;
-
 	if (ace_flags & GENERIC_ALL) {
 		*pmode |= (S_IRWXUGO & bits_to_set);
 #ifdef CONFIG_CIFS_DEBUG2
@@ -147,11 +145,14 @@ static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
 #endif
 		return;
 	}
-	if ((ace_flags & GENERIC_WRITE) || (ace_flags & FILE_WRITE_RIGHTS))
+	if ((ace_flags & GENERIC_WRITE) ||
+			((ace_flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
 		*pmode |= (S_IWUGO & bits_to_set);
-	if ((ace_flags & GENERIC_READ) || (ace_flags & FILE_READ_RIGHTS))
+	if ((ace_flags & GENERIC_READ) ||
+			((ace_flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
 		*pmode |= (S_IRUGO & bits_to_set);
-	if ((ace_flags & GENERIC_EXECUTE) || (ace_flags & FILE_EXEC_RIGHTS))
+	if ((ace_flags & GENERIC_EXECUTE) ||
+			((ace_flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 		*pmode |= (S_IXUGO & bits_to_set);
 
 #ifdef CONFIG_CIFS_DEBUG2
@@ -234,11 +235,24 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 		cifscred->aces = kmalloc(num_aces *
 			sizeof(struct cifs_ace *), GFP_KERNEL);*/
 
+		/* reset rwx permissions for user/group/other */
+		inode->i_mode &= ~(S_IRWXUGO);
+
 		for (i = 0; i < num_aces; ++i) {
 			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
 
 			parse_ace(ppace[i], end_of_acl);
 
+			if (compare_sids(&(ppace[i]->sid), pownersid))
+				access_flags_to_mode(ppace[i]->access_req,
+						&(inode->i_mode), S_IRWXU);
+			if (compare_sids(&(ppace[i]->sid), pgrpsid))
+				access_flags_to_mode(ppace[i]->access_req,
+						&(inode->i_mode), S_IRWXG);
+			if (compare_sids(&(ppace[i]->sid), &sid_everyone))
+				access_flags_to_mode(ppace[i]->access_req,
+						&(inode->i_mode), S_IRWXO);
+
 /*			memcpy((void *)(&(cifscred->aces[i])),
 				(void *)ppace[i],
 				sizeof(struct cifs_ace)); */
-- 
cgit v1.2.3


From f664f1f9b77d6c64f3cee1875dcb4faba0da6dd4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 29 Oct 2007 14:37:21 -0700
Subject: revert "ufs: Fix mount check in ufs_fill_super()"

Evgeniy said:

  I wonder on what type of UFS do you test this patch?  NetBSD and FreeBSD
  do not use "fs_state", they use "fs_clean" flag, only Solaris does check
  like this: fs_state + fs_time == FSOK.

  That's why parentheses was like that.

  At now with linux-2.6.24-rc1-git1, I get: fs need fsck, but NetBSD's fsck
  says that's all ok.

  I suggest revert this patch.

Cc: Evgeniy Dushistov <dushistov@mail.ru>
Cc: Satyam Sharma <satyam.sharma@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ufs/super.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 584cf12cc40f..c78c04fd993f 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -933,20 +933,19 @@ magic_found:
 		goto again;
 	}
 
-	/* Set sbi->s_flags here, used by ufs_get_fs_state() below */
-	sbi->s_flags = flags;
+	sbi->s_flags = flags;/*after that line some functions use s_flags*/
 	ufs_print_super_stuff(sb, usb1, usb2, usb3);
 
 	/*
 	 * Check, if file system was correctly unmounted.
 	 * If not, make it read only.
 	 */
-	if ((((flags & UFS_ST_MASK) == UFS_ST_44BSD)	||
-	     ((flags & UFS_ST_MASK) == UFS_ST_OLD)	||
-	     ((flags & UFS_ST_MASK) == UFS_ST_SUN)	||
-	     ((flags & UFS_ST_MASK) == UFS_ST_SUNOS)	||
-	     ((flags & UFS_ST_MASK) == UFS_ST_SUNx86))	&&
-	    (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)))) {
+	if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) ||
+	  ((flags & UFS_ST_MASK) == UFS_ST_OLD) ||
+	  (((flags & UFS_ST_MASK) == UFS_ST_SUN ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
+	  (flags & UFS_ST_MASK) == UFS_ST_SUNx86) &&
+	  (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
 		switch(usb1->fs_clean) {
 		case UFS_FSCLEAN:
 			UFSD("fs is clean\n");
-- 
cgit v1.2.3


From 97855b49b6bac0bd25f16b017883634d13591d00 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 30 Oct 2007 11:20:02 -0400
Subject: locks: fix possible infinite loop in posix deadlock detection

It's currently possible to send posix_locks_deadlock() into an infinite
loop (under the BKL).

For now, fix this just by bailing out after a few iterations.  We may
want to fix this in a way that better clarifies the semantics of
deadlock detection.  But that will take more time, and this minimal fix
is probably adequate for any realistic scenario, and is simple enough to
be appropriate for applying to stable kernels now.

Thanks to George Davis for reporting the problem.

Cc: "George G. Davis" <gdavis@mvista.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/locks.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 0127a2846819..8b8388eca05e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -696,17 +696,28 @@ EXPORT_SYMBOL(posix_test_lock);
  * Note: the above assumption may not be true when handling lock requests
  * from a broken NFS client. But broken NFS clients have a lot more to
  * worry about than proper deadlock detection anyway... --okir
+ *
+ * However, the failure of this assumption (also possible in the case of
+ * multiple tasks sharing the same open file table) also means there's no
+ * guarantee that the loop below will terminate.  As a hack, we give up
+ * after a few iterations.
  */
+
+#define MAX_DEADLK_ITERATIONS 10
+
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
 	struct file_lock *fl;
+	int i = 0;
 
 next_task:
 	if (posix_same_owner(caller_fl, block_fl))
 		return 1;
 	list_for_each_entry(fl, &blocked_list, fl_link) {
 		if (posix_same_owner(fl, block_fl)) {
+			if (i++ > MAX_DEADLK_ITERATIONS)
+				return 0;
 			fl = fl->fl_next;
 			block_fl = fl;
 			goto next_task;
-- 
cgit v1.2.3


From e403149c92a2a0643211debbbb0a9ec7cc04cff7 Mon Sep 17 00:00:00 2001
From: Dirk Hohndel <hohndel@linux.intel.com>
Date: Tue, 30 Oct 2007 13:37:19 -0700
Subject: Kbuild/doc: fix links to Documentation files

Fix links to files in Documentation/* in various Kconfig files

Signed-off-by: Dirk Hohndel <hohndel@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index cc28a69246a7..c75c95406497 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -504,7 +504,7 @@ config INOTIFY
 	  including multiple file events, one-shot support, and unmount
 	  notification.
 
-	  For more information, see Documentation/filesystems/inotify.txt
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
 
 	  If unsure, say Y.
 
@@ -518,7 +518,7 @@ config INOTIFY_USER
 	  directories via a single open fd.  Events are read from the file
 	  descriptor, which is also select()- and poll()-able.
 
-	  For more information, see Documentation/filesystems/inotify.txt
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
 
 	  If unsure, say Y.
 
@@ -1089,7 +1089,7 @@ config ECRYPT_FS
 	depends on EXPERIMENTAL && KEYS && CRYPTO && NET
 	help
 	  Encrypted filesystem that operates on the VFS layer.  See
-	  <file:Documentation/ecryptfs.txt> to learn more about
+	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
 	  eCryptfs.  Userspace components are required and can be
 	  obtained from <http://ecryptfs.sf.net>.
 
-- 
cgit v1.2.3


From be48be08a829db09a4f786f44a1872ef0f533c85 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 30 Oct 2007 20:40:45 -0700
Subject: [COMPAT]: Fix new dev_ifname32 returning -EFAULT

A stray semicolon slipped in the patch that updated dev_ifname32 to
not be inline, causing it to always return -EFAULT. This fixes it.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a4284ccac1f9..bd26e4cbb994 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -322,7 +322,7 @@ static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
 	int err;
 
 	uifr = compat_alloc_user_space(sizeof(struct ifreq));
-	if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)));
+	if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)))
 		return -EFAULT;
 
 	err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
-- 
cgit v1.2.3


From 78e9d3678c8362aad2b2a48c242966aebb089dbd Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 24 Oct 2007 18:23:32 +0200
Subject: sysfs: make sysfs_{get,put}_active() static

sysfs_{get,put}_active() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 4 ++--
 fs/sysfs/sysfs.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7a8ce9e98b32..337162935d21 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -132,7 +132,7 @@ struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
  *	RETURNS:
  *	Pointer to @sd on success, NULL on failure.
  */
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -161,7 +161,7 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  *	Put an active reference to @sd.  This function is noop if @sd
  *	is NULL.
  */
-void sysfs_put_active(struct sysfs_dirent *sd)
+static void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	struct completion *cmpl;
 	int v;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index f8417988f6b0..ff17f8da9b43 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -103,8 +103,6 @@ extern const struct file_operations sysfs_dir_operations;
 extern const struct inode_operations sysfs_dir_inode_operations;
 
 struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
 void sysfs_put_active_two(struct sysfs_dirent *sd);
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
-- 
cgit v1.2.3


From 953f868138dbf4300196780379476ab9f07f263a Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 31 Oct 2007 04:54:42 +0000
Subject: [CIFS] Don't request too much permission when reading an ACL

We were requesting GENERIC_READ, but that fails when we do not have
read permission on the file (even if we could read the ACL).

Also move the dump access control entry code into debug ifdef.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c   | 32 +++++++++++++++++++++++++-------
 fs/cifs/cifspdu.h   | 17 +++++++++++++++++
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/inode.c     |  9 ++++-----
 4 files changed, 48 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 629b96c21639..f1215df7fbee 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -162,7 +162,8 @@ static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
 }
 
 
-static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
+#ifdef CONFIG_CIFS_DEBUG2
+static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 {
 	int num_subauth;
 
@@ -180,7 +181,6 @@ static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 
 	num_subauth = pace->sid.num_subauth;
 	if (num_subauth) {
-#ifdef CONFIG_CIFS_DEBUG2
 		int i;
 		cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
 			pace->sid.revision, pace->sid.num_subauth, pace->type,
@@ -192,11 +192,11 @@ static void parse_ace(struct cifs_ace *pace, char *end_of_acl)
 
 		/* BB add length check to make sure that we do not have huge
 			num auths and therefore go off the end */
-#endif
 	}
 
 	return;
 }
+#endif
 
 
 static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
@@ -240,9 +240,9 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 		for (i = 0; i < num_aces; ++i) {
 			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
-
-			parse_ace(ppace[i], end_of_acl);
-
+#ifdef CONFIG_CIFS_DEBUG2
+			dump_ace(ppace[i], end_of_acl);
+#endif
 			if (compare_sids(&(ppace[i]->sid), pownersid))
 				access_flags_to_mode(ppace[i]->access_req,
 						&(inode->i_mode), S_IRWXU);
@@ -385,7 +385,7 @@ void acl_to_uid_mode(struct inode *inode, const char *path)
 		int oplock = FALSE;
 		/* open file */
 		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-				GENERIC_READ, 0, &fid, &oplock, NULL,
+				READ_CONTROL, 0, &fid, &oplock, NULL,
 				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		if (rc != 0) {
@@ -409,4 +409,22 @@ void acl_to_uid_mode(struct inode *inode, const char *path)
 	FreeXid(xid);
 	return;
 }
+
+int mode_to_acl(struct inode *inode, const char *path)
+{
+	int rc = 0;
+	__u32 acllen = 0;
+	struct cifs_ntsd *pntsd = NULL;
+
+	cFYI(1, ("set ACL from mode for %s", path));
+
+	/* Get the security descriptor */
+
+	/* Add/Modify the three ACEs for owner, group, everyone */
+
+	/* Set the security descriptor */
+	kfree(pntsd);
+
+	return rc;
+}
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index c41ff74e9128..07464b6ac129 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -220,6 +220,23 @@
 				| FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
 #define FILE_EXEC_RIGHTS (FILE_EXECUTE)
 
+#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_WRITE_EA \
+				| FILE_READ_ATTRIBUTES \
+				| FILE_WRITE_ATTRIBUTES \
+				| DELETE | READ_CONTROL | WRITE_DAC \
+				| WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+				| FILE_READ_EA | FILE_WRITE_EA \
+				| FILE_DELETE_CHILD | FILE_READ_ATTRIBUTES \
+				| FILE_WRITE_ATTRIBUTES \
+				| DELETE | READ_CONTROL | WRITE_DAC \
+				| WRITE_OWNER | SYNCHRONIZE)
+#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \
+				| FILE_READ_ATTRIBUTES \
+				| FILE_WRITE_ATTRIBUTES \
+				| DELETE | READ_CONTROL | WRITE_DAC \
+				| WRITE_OWNER | SYNCHRONIZE)
+
 
 /*
  * Invalid readdir handle
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88c02ac97c3f..1ffe25592b25 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -96,6 +96,8 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
 extern void acl_to_uid_mode(struct inode *inode, const char *search_path);
+extern int mode_to_acl(struct inode *inode, const char *path);
+
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9be0bbd20dfd..7d907e84e032 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -289,7 +289,7 @@ static int decode_sfu_inode(struct inode *inode, __u64 size,
 
 #define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID)  /* SETFILEBITS valid bits */
 
-static int get_sfu_uid_mode(struct inode *inode,
+static int get_sfu_mode(struct inode *inode,
 			const unsigned char *path,
 			struct cifs_sb_info *cifs_sb, int xid)
 {
@@ -528,16 +528,15 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* BB fill in uid and gid here? with help from winbind?
 		   or retrieve from NTFS stream extended attribute */
 #ifdef CONFIG_CIFS_EXPERIMENTAL
+		/* fill in 0777 bits from ACL */
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 			cFYI(1, ("Getting mode bits from ACL"));
 			acl_to_uid_mode(inode, search_path);
 		}
 #endif
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-			/* fill in uid, gid, mode from server ACL */
-			/* BB FIXME this should also take into account the
-			 * default uid specified on mount if present */
-			get_sfu_uid_mode(inode, search_path, cifs_sb, xid);
+			/* fill in remaining high mode bits e.g. SUID, VTX */
+			get_sfu_mode(inode, search_path, cifs_sb, xid);
 		} else if (atomic_read(&cifsInfo->inUse) == 0) {
 			inode->i_uid = cifs_sb->mnt_uid;
 			inode->i_gid = cifs_sb->mnt_gid;
-- 
cgit v1.2.3


From 1fb64bfc45b9ee5092b72474a5df216b8a0c7ff9 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 1 Nov 2007 02:12:10 +0000
Subject: [CIFS] when mount helper missing fix slash wrong direction in share

Kernel bugzilla bug #9228

If the mount helper (mount.cifs) is missing, mounts of the form
//10.11.12.13/c$ would not work (only mounts with a backslash, e.g.
//10.11.12.13\\c$, would work) because the slash is supposed
to be converted to a backslash by the mount helper (which is not
there).

If converting an IPv4 address with in4_pton fails, then
try to canonicalize the first slash (i.e. the one between the host
ip address and the sharename) if necessary.  If we then have to retry
to check for an IPv6 address, the slash has already been converted
if necessary.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  5 ++++-
 fs/cifs/netmisc.c | 40 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index c65c9da863f3..6d3e736612ba 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -14,7 +14,10 @@ bigendian architectures. Fix possible memory corruption when
 EAGAIN returned on kern_recvmsg. Return better error if server
 requires packet signing but client has disabled it. When mounted
 with cifsacl mount option - mode bits are approximated based
-on the contents of the files ACL.
+on the contents of the ACL of the file or directory. When cifs
+mount helper is missing convert make sure that UNC name 
+has backslash (not forward slash) between ip address of server
+and the share name.
 
 Version 1.50
 ------------
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 4d35c034755a..e1704da43836 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -132,6 +132,34 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
 	{0, 0}
 };
 
+
+/* if the mount helper is missing we need to reverse the 1st slash
+   from '/' to backslash in order to format the UNC properly for
+   ip address parsing and for tree connect (unless the user
+   remembered to put the UNC name in properly). Fortunately we do
+   not have to call this twice (we check for IPv4 addresses
+   first, so it is already converted by the time we
+   try IPv6 addresses */
+static int canonicalize_unc(char *cp)
+{
+	int i;
+
+	for (i = 0; i <= 46 /* INET6_ADDRSTRLEN */ ; i++) {
+		if (cp[i] == 0)
+			break;
+		if (cp[i] == '\\')
+			break;
+		if (cp[i] == '/') {
+#ifdef CONFIG_CIFS_DEBUG2
+			cFYI(1, ("change slash to backslash in malformed UNC"));
+#endif
+			cp[i] = '\\';
+			return 1;
+		}
+	}
+	return 0;
+}
+
 /* Convert string containing dotted ip address to binary form */
 /* returns 0 if invalid address */
 
@@ -141,11 +169,13 @@ cifs_inet_pton(int address_family, char *cp, void *dst)
 	int ret = 0;
 
 	/* calculate length by finding first slash or NULL */
-	/* BB Should we convert '/' slash to '\' here since it seems already
-	 * done before this */
-	if ( address_family == AF_INET ) {
-		ret = in4_pton(cp, -1 /* len */, dst , '\\', NULL);
-	} else if ( address_family == AF_INET6 ) {
+	if (address_family == AF_INET) {
+		ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL);
+		if (ret == 0) {
+			if (canonicalize_unc(cp))
+				ret = in4_pton(cp, -1, dst, '\\', NULL);
+		}
+	} else if (address_family == AF_INET6) {
 		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
 	}
 #ifdef CONFIG_CIFS_DEBUG2
-- 
cgit v1.2.3


From 7505e0525c914cdfdb54f43a7e70f038a16a5486 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 1 Nov 2007 18:03:01 +0000
Subject: [CIFS] If no Access Control Entries, set mode perm bits to zero

Also clean up ACL code

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 77 +++++++++++++++++++++++++++++++++++++++----------------
 fs/cifs/cifspdu.h | 23 +++++++++++++++++
 fs/cifs/connect.c |  2 +-
 3 files changed, 79 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index f1215df7fbee..bd75a3b8caff 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -223,6 +223,17 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 		le32_to_cpu(pdacl->num_aces)));
 #endif
 
+	/* reset rwx permissions for user/group/other.
+	   Also, if num_aces is 0 i.e. DACL has no ACEs,
+	   user/group/other have no permissions */
+	inode->i_mode &= ~(S_IRWXUGO);
+
+	if (!pdacl) {
+		/* no DACL in the security descriptor, set
+		   all the permissions for user/group/other */
+		inode->i_mode |= S_IRWXUGO;
+		return;
+	}
 	acl_base = (char *)pdacl;
 	acl_size = sizeof(struct cifs_acl);
 
@@ -235,9 +246,6 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 		cifscred->aces = kmalloc(num_aces *
 			sizeof(struct cifs_ace *), GFP_KERNEL);*/
 
-		/* reset rwx permissions for user/group/other */
-		inode->i_mode &= ~(S_IRWXUGO);
-
 		for (i = 0; i < num_aces; ++i) {
 			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
 #ifdef CONFIG_CIFS_DEBUG2
@@ -309,6 +317,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
 	struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
 	char *end_of_acl = ((char *)pntsd) + acl_len;
+	__u32 dacloffset;
 
 	if ((inode == NULL) || (pntsd == NULL))
 		return -EIO;
@@ -317,15 +326,14 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 				le32_to_cpu(pntsd->osidoffset));
 	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
 				le32_to_cpu(pntsd->gsidoffset));
-	dacl_ptr = (struct cifs_acl *)((char *)pntsd +
-				le32_to_cpu(pntsd->dacloffset));
+	dacloffset = le32_to_cpu(pntsd->dacloffset);
+	dacl_ptr = (struct cifs_acl *)(char *)pntsd + dacloffset;
 #ifdef CONFIG_CIFS_DEBUG2
 	cFYI(1, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
 		 "sacloffset 0x%x dacloffset 0x%x",
 		 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
 		 le32_to_cpu(pntsd->gsidoffset),
-		 le32_to_cpu(pntsd->sacloffset),
-		 le32_to_cpu(pntsd->dacloffset)));
+		 le32_to_cpu(pntsd->sacloffset), dacloffset));
 #endif
 /*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
 	rc = parse_sid(owner_sid_ptr, end_of_acl);
@@ -336,7 +344,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 	if (rc)
 		return rc;
 
-	parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, inode);
+	if (dacloffset)
+		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
+		group_sid_ptr, inode);
+	else
+		cFYI(1, ("no ACL")); /* BB grant all or default perms? */
 
 /*	cifscred->uid = owner_sid_ptr->rid;
 	cifscred->gid = group_sid_ptr->rid;
@@ -350,9 +362,9 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 }
 
 
-/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-
-void acl_to_uid_mode(struct inode *inode, const char *path)
+/* Retrieve an ACL from the server */
+static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
+				       const char *path)
 {
 	struct cifsFileInfo *open_file;
 	int unlock_file = FALSE;
@@ -362,19 +374,18 @@ void acl_to_uid_mode(struct inode *inode, const char *path)
 	struct super_block *sb;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_ntsd *pntsd = NULL;
-	__u32 acllen;
 
 	cFYI(1, ("get mode from ACL for %s", path));
 
 	if (inode == NULL)
-		return;
+		return NULL;
 
 	xid = GetXid();
 	open_file = find_readable_file(CIFS_I(inode));
 	sb = inode->i_sb;
 	if (sb == NULL) {
 		FreeXid(xid);
-		return;
+		return NULL;
 	}
 	cifs_sb = CIFS_SB(sb);
 
@@ -391,25 +402,44 @@ void acl_to_uid_mode(struct inode *inode, const char *path)
 		if (rc != 0) {
 			cERROR(1, ("Unable to open file to get ACL"));
 			FreeXid(xid);
-			return;
+			return NULL;
 		}
 	}
 
-	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, &acllen);
-	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, acllen));
+	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
 	if (unlock_file == TRUE)
 		atomic_dec(&open_file->wrtPending);
 	else
 		CIFSSMBClose(xid, cifs_sb->tcon, fid);
 
-	/* parse ACEs */
-	if (!rc)
+	FreeXid(xid);
+	return pntsd;
+}
+
+/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
+void acl_to_uid_mode(struct inode *inode, const char *path)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	u32 acllen = 0;
+	int rc = 0;
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("converting ACL to mode for %s", path));
+#endif
+	pntsd = get_cifs_acl(&acllen, inode, path);
+
+	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
+	if (pntsd)
 		rc = parse_sec_desc(pntsd, acllen, inode);
+	if (rc)
+		cFYI(1, ("parse sec desc failed rc = %d", rc));
+
 	kfree(pntsd);
-	FreeXid(xid);
 	return;
 }
 
+/* Convert mode bits to an ACL so we can update the ACL on the server */
 int mode_to_acl(struct inode *inode, const char *path)
 {
 	int rc = 0;
@@ -419,12 +449,15 @@ int mode_to_acl(struct inode *inode, const char *path)
 	cFYI(1, ("set ACL from mode for %s", path));
 
 	/* Get the security descriptor */
+	pntsd = get_cifs_acl(&acllen, inode, path);
 
-	/* Add/Modify the three ACEs for owner, group, everyone */
+	/* Add/Modify the three ACEs for owner, group, everyone
+	   while retaining the other ACEs */
 
 	/* Set the security descriptor */
-	kfree(pntsd);
 
+
+	kfree(pntsd);
 	return rc;
 }
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 07464b6ac129..dbe6b846f37f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1228,6 +1228,29 @@ typedef struct smb_com_transaction_qsec_req {
 	__le32 AclFlags;
 } __attribute__((packed)) QUERY_SEC_DESC_REQ;
 
+
+typedef struct smb_com_transaction_ssec_req {
+	struct smb_hdr hdr;     /* wct = 19 */
+	__u8 MaxSetupCount;
+	__u16 Reserved;
+	__le32 TotalParameterCount;
+	__le32 TotalDataCount;
+	__le32 MaxParameterCount;
+	__le32 MaxDataCount;
+	__le32 ParameterCount;
+	__le32 ParameterOffset;
+	__le32 DataCount;
+	__le32 DataOffset;
+	__u8 SetupCount; /* no setup words follow subcommand */
+	/* SNIA spec incorrectly included spurious pad here */
+	__le16 SubCommand; /* 3 = SET_SECURITY_DESC */
+	__le16 ByteCount; /* bcc = 3 + 8 */
+	__u8 Pad[3];
+	__u16 Fid;
+	__u16 Reserved2;
+	__le32 AclFlags;
+} __attribute__((packed)) SET_SEC_DESC_REQ;
+
 typedef struct smb_com_transaction_change_notify_req {
 	struct smb_hdr hdr;     /* wct = 23 */
 	__u8 MaxSetupCount;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 19ee11f7f35a..380ee9991f20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -793,7 +793,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->linux_gid = current->gid;
 	vol->dir_mode = S_IRWXUGO;
 	/* 2767 perms indicate mandatory locking support */
-	vol->file_mode = S_IALLUGO & ~(S_ISUID | S_IXGRP);
+	vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
 
 	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
 	vol->rw = TRUE;
-- 
cgit v1.2.3


From 87ae9afdcada236d0a1b38ce2c465a65916961dc Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Tue, 30 Oct 2007 10:35:04 +0100
Subject: cleanup asm/scatterlist.h includes

Code that is not architecture-specific should not #include <asm/scatterlist.h>.

This patch therefore either replaces them with
#include <linux/scatterlist.h> or simply removes them if they were
unused.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/nfsd/nfs4recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 6f03918018a3..1602cd00dd45 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -43,7 +43,7 @@
 #include <linux/file.h>
 #include <linux/namei.h>
 #include <asm/uaccess.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
 
-- 
cgit v1.2.3


From 745542e210b3b15751ea9d511321924ac36b85db Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 3 Nov 2007 04:34:04 +0000
Subject: [CIFS] allow cifs_calc_signature2 to deal with a zero length iovec

Currently, cifs_calc_signature2 errors out if it gets a zero-length
iovec. Fix it to silently continue in that case.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 632070b4275d..788f0ad6feda 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -99,11 +99,12 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 	MD5Init(&context);
 	MD5Update(&context, (char *)&key->data, key->len);
 	for (i = 0; i < n_vec; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
 		if (iov[i].iov_base == NULL) {
 			cERROR(1, ("null iovec entry"));
 			return -EIO;
-		} else if (iov[i].iov_len == 0)
-			break; /* bail out if we are sent nothing to sign */
+		}
 		/* The first entry includes a length field (which does not get
 		   signed that occupies the first 4 bytes before the header */
 		if (i == 0) {
-- 
cgit v1.2.3


From 09fe7ba78dedb9017401ed555ecc4435c99a7556 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 3 Nov 2007 04:48:29 +0000
Subject: [CIFS] implement upcalls for SPNEGO blob via keyctl API

Add routines to handle upcalls to userspace via keyctl for the purpose
of getting a SPNEGO blob for a particular uid and server combination.

Clean up the Makefile a bit and set it up to only compile cifs_spnego
if CONFIG_CIFS_UPCALL is set. Also change CONFIG_CIFS_UPCALL to depend
on CONFIG_KEYS rather than CONFIG_CONNECTOR.

cifs_spnego.h defines the communications between kernel and userspace
and is intended to be shared with userspace programs.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/Kconfig          | 2 +-
 fs/cifs/Makefile    | 7 ++++++-
 fs/cifs/cifsproto.h | 2 ++
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index cc28a69246a7..e431c38a7262 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -2007,7 +2007,7 @@ config CIFS_EXPERIMENTAL
 config CIFS_UPCALL
 	  bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
 	  depends on CIFS_EXPERIMENTAL
-	  depends on CONNECTOR
+	  depends on KEYS
 	  help
 	    Enables an upcall mechanism for CIFS which will be used to contact
 	    userspace helper utilities to provide SPNEGO packaged Kerberos
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index ff6ba8d823f0..45e42fb97c19 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -3,4 +3,9 @@
 #
 obj-$(CONFIG_CIFS) += cifs.o
 
-cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o export.o cifsacl.o
+cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
+	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
+	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+	  readdir.o ioctl.o sess.o export.o cifsacl.o
+
+cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1ffe25592b25..dd1d7c200ee6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -76,6 +76,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
+extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo,
+					const char *hostname);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			     const int stage,
 			     const struct nls_table *nls_cp);
-- 
cgit v1.2.3


From 84a15b935481fa651cc6ec60aed015312b67adda Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 3 Nov 2007 05:02:24 +0000
Subject: [CIFS] Register and unregister cifs_spnego_key_type on module
 init/exit

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a6fbea57c4b1..94c0f55d7669 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -43,6 +43,7 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
+#include <linux/key-type.h>
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
 #ifdef CONFIG_CIFS_QUOTA
@@ -1005,12 +1006,16 @@ init_cifs(void)
 	rc = register_filesystem(&cifs_fs_type);
 	if (rc)
 		goto out_destroy_request_bufs;
-
+#ifdef CONFIG_CIFS_UPCALL
+	rc = register_key_type(&cifs_spnego_key_type);
+	if (rc)
+		goto out_unregister_filesystem;
+#endif
 	oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
 	if (IS_ERR(oplockThread)) {
 		rc = PTR_ERR(oplockThread);
 		cERROR(1, ("error %d create oplock thread", rc));
-		goto out_unregister_filesystem;
+		goto out_unregister_key_type;
 	}
 
 	dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
@@ -1024,7 +1029,11 @@ init_cifs(void)
 
  out_stop_oplock_thread:
 	kthread_stop(oplockThread);
+ out_unregister_key_type:
+#ifdef CONFIG_CIFS_UPCALL
+	unregister_key_type(&cifs_spnego_key_type);
  out_unregister_filesystem:
+#endif
 	unregister_filesystem(&cifs_fs_type);
  out_destroy_request_bufs:
 	cifs_destroy_request_bufs();
@@ -1045,6 +1054,9 @@ exit_cifs(void)
 	cFYI(0, ("exit_cifs"));
 #ifdef CONFIG_PROC_FS
 	cifs_proc_clean();
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+	unregister_key_type(&cifs_spnego_key_type);
 #endif
 	unregister_filesystem(&cifs_fs_type);
 	cifs_destroy_inodecache();
-- 
cgit v1.2.3


From e545937a51fe0cc78cea55752764daabb81ec96d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 3 Nov 2007 05:11:06 +0000
Subject: [CIFS] add OIDs for KRB5 and MSKRB5 to ASN1 parsing routines

Also, fix the parser to recognize them and set the secType
accordingly. Make CIFSSMBNegotiate not error out automatically
after parsing the securityBlob.

Also thanks to Q (Igor) and Simo for their help on this
set of kerberos patches (and Dave Howells for help on the
upcall).

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/asn1.c    | 35 ++++++++++++++++++++++++-----------
 fs/cifs/cifsfs.c  |  1 +
 fs/cifs/cifssmb.c |  3 +--
 3 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 2a01f3ef96a0..bcda2c6b6a04 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -77,8 +77,12 @@
 
 #define SPNEGO_OID_LEN 7
 #define NTLMSSP_OID_LEN  10
+#define KRB5_OID_LEN  7
+#define MSKRB5_OID_LEN  7
 static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
 static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
+static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
+static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
 
 /*
  * ASN.1 context.
@@ -457,6 +461,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	unsigned long *oid = NULL;
 	unsigned int cls, con, tag, oidlen, rc;
 	int use_ntlmssp = FALSE;
+	int use_kerberos = FALSE;
 
 	*secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
 
@@ -545,18 +550,28 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 				return 0;
 			}
 			if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
-				rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
-				if (rc) {
+				if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
+
 					cFYI(1,
 					  ("OID len = %d oid = 0x%lx 0x%lx "
 					   "0x%lx 0x%lx",
 					   oidlen, *oid, *(oid + 1),
 					   *(oid + 2), *(oid + 3)));
-					rc = compare_oid(oid, oidlen,
-						 NTLMSSP_OID, NTLMSSP_OID_LEN);
-					kfree(oid);
-					if (rc)
+
+					if (compare_oid(oid, oidlen,
+							MSKRB5_OID,
+							MSKRB5_OID_LEN))
+						use_kerberos = TRUE;
+					else if (compare_oid(oid, oidlen,
+							     KRB5_OID,
+							     KRB5_OID_LEN))
+						use_kerberos = TRUE;
+					else if (compare_oid(oid, oidlen,
+							     NTLMSSP_OID,
+							     NTLMSSP_OID_LEN))
 						use_ntlmssp = TRUE;
+
+					kfree(oid);
 				}
 			} else {
 				cFYI(1, ("Should be an oid what is going on?"));
@@ -609,12 +624,10 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 			 ctx.pointer));	/* is this UTF-8 or ASCII? */
 	}
 
-	/* if (use_kerberos)
-	   *secType = Kerberos
-	   else */
-	if (use_ntlmssp) {
+	if (use_kerberos)
+		*secType = Kerberos;
+	else if (use_ntlmssp)
 		*secType = NTLMSSP;
-	}
 
 	return 1;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 94c0f55d7669..416dc9fe8961 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -44,6 +44,7 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
+#include "cifs_spnego.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
 #ifdef CONFIG_CIFS_QUOTA
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 0bb3e431ee01..59d7b7c037ad 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -647,8 +647,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 						 count - 16,
 						 &server->secType);
 			if (rc == 1) {
-			/* BB Need to fill struct for sessetup here */
-				rc = -EOPNOTSUPP;
+				rc = 0;
 			} else {
 				rc = -EINVAL;
 			}
-- 
cgit v1.2.3


From ebab89909e0dc716282d5e7f6e73a3155fe66d4a Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cam.ac.uk>
Date: Sat, 3 Nov 2007 07:38:59 +0000
Subject: NTFS: Fix read regression.

The regression was caused by:
        commit[a32ea1e1f925399e0d81ca3f7394a44a6dafa12c] Fix read/truncate race

This causes ntfs_readpage() to be called for a zero i_size inode, which
failed when the file was compressed and non-resident.

Thanks a lot to Mike Galbraith for reporting the issue and tracking down
the commit that caused the regression.

Looking into it I found three bugs which the patch fixes.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
Tested-by:  Mike Galbraith <efault@gmx.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/aops.c     | 10 +++++++++-
 fs/ntfs/attrib.c   |  5 +----
 fs/ntfs/compress.c | 10 ++++++++++
 3 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfdc7900d271..ad87cb01299b 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -405,6 +405,15 @@ static int ntfs_readpage(struct file *file, struct page *page)
 
 retry_readpage:
 	BUG_ON(!PageLocked(page));
+	vi = page->mapping->host;
+	i_size = i_size_read(vi);
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT)) {
+		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		ntfs_debug("Read outside i_size - truncated?");
+		goto done;
+	}
 	/*
 	 * This can potentially happen because we clear PageUptodate() during
 	 * ntfs_writepage() of MstProtected() attributes.
@@ -413,7 +422,6 @@ retry_readpage:
 		unlock_page(page);
 		return 0;
 	}
-	vi = page->mapping->host;
 	ni = NTFS_I(vi);
 	/*
 	 * Only $DATA attributes can be encrypted and only unnamed $DATA
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 92dabdcf2b80..50d3b0c258e3 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -179,10 +179,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 	 * ntfs_mapping_pairs_decompress() fails.
 	 */
 	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-	if (!a->data.non_resident.lowest_vcn && end_vcn == 1)
-		end_vcn = sle64_to_cpu(a->data.non_resident.allocated_size) >>
-				ni->vol->cluster_size_bits;
-	if (unlikely(vcn >= end_vcn)) {
+	if (unlikely(vcn && vcn >= end_vcn)) {
 		err = -ENOENT;
 		goto err_out;
 	}
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index d98daf59e0b6..d1619d05eb23 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -561,6 +561,16 @@ int ntfs_read_compressed_block(struct page *page)
 	read_unlock_irqrestore(&ni->size_lock, flags);
 	max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 			offset;
+	/* Is the page fully outside i_size? (truncate in progress) */
+	if (xpage >= max_page) {
+		kfree(bhs);
+		kfree(pages);
+		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+		ntfs_debug("Compressed read outside i_size - truncated?");
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
 	if (nr_pages < max_page)
 		max_page = nr_pages;
 	for (i = 0; i < max_page; i++, offset++) {
-- 
cgit v1.2.3


From f1d662a7d5e5322e583aad6b3cfec03d8f27b435 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 5 Nov 2007 14:38:08 +0000
Subject: [CIFS] Add upcall files for cifs to use spnego/kerberos

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifs_spnego.h |  46 +++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 fs/cifs/cifs_spnego.c
 create mode 100644 fs/cifs/cifs_spnego.h

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
new file mode 100644
index 000000000000..e142faf2d0ae
--- /dev/null
+++ b/fs/cifs/cifs_spnego.c
@@ -0,0 +1,124 @@
+/*
+ *   fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS
+ *
+ *   Copyright (c) 2007 Red Hat, Inc.
+ *   Author(s): Jeff Layton (jlayton@redhat.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/string.h>
+#include <keys/user-type.h>
+#include <linux/key-type.h>
+#include "cifsglob.h"
+#include "cifs_spnego.h"
+#include "cifs_debug.h"
+
+/* create a new cifs key */
+static int
+cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	char *payload;
+	int ret;
+
+	ret = -ENOMEM;
+	payload = kmalloc(datalen, GFP_KERNEL);
+	if (!payload)
+		goto error;
+
+	/* attach the data */
+	memcpy(payload, data, datalen);
+	rcu_assign_pointer(key->payload.data, payload);
+	ret = 0;
+
+error:
+	return ret;
+}
+
+static void
+cifs_spnego_key_destroy(struct key *key)
+{
+	kfree(key->payload.data);
+}
+
+
+/*
+ * keytype for CIFS spnego keys
+ */
+struct key_type cifs_spnego_key_type = {
+	.name		= "cifs.spnego",
+	.instantiate	= cifs_spnego_key_instantiate,
+	.match		= user_match,
+	.destroy	= cifs_spnego_key_destroy,
+	.describe	= user_describe,
+};
+
+/* get a key struct with a SPNEGO security blob, suitable for session setup */
+struct key *
+cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
+{
+	struct TCP_Server_Info *server = sesInfo->server;
+	char *description, *dp;
+	size_t desc_len;
+	struct key *spnego_key;
+
+
+	/* version + ;ip{4|6}= + address + ;host=hostname + ;sec= + NULL */
+	desc_len = 2 + 5 + 32 + 1 + 5 + strlen(hostname) +
+		   strlen(";sec=krb5") + 1;
+	spnego_key = ERR_PTR(-ENOMEM);
+	description = kzalloc(desc_len, GFP_KERNEL);
+	if (description == NULL)
+		goto out;
+
+	dp = description;
+	/* start with version and hostname portion of UNC string */
+	spnego_key = ERR_PTR(-EINVAL);
+	sprintf(dp, "%2.2x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+		hostname);
+	dp = description + strlen(description);
+
+	/* add the server address */
+	if (server->addr.sockAddr.sin_family == AF_INET)
+		sprintf(dp, "ip4=" NIPQUAD_FMT,
+			NIPQUAD(server->addr.sockAddr.sin_addr));
+	else if (server->addr.sockAddr.sin_family == AF_INET6)
+		sprintf(dp, "ip6=" NIP6_SEQFMT,
+			NIP6(server->addr.sockAddr6.sin6_addr));
+	else
+		goto out;
+
+	dp = description + strlen(description);
+
+	/* for now, only sec=krb5 is valid */
+	if (server->secType == Kerberos)
+		sprintf(dp, ";sec=krb5");
+	else
+		goto out;
+
+	cFYI(1, ("key description = %s", description));
+	spnego_key = request_key(&cifs_spnego_key_type, description, "");
+
+	if (cifsFYI && !IS_ERR(spnego_key)) {
+		struct cifs_spnego_msg *msg = spnego_key->payload.data;
+		cifs_dump_mem("SPNEGO reply blob:", msg->data,
+				msg->secblob_len + msg->sesskey_len);
+	}
+
+out:
+	kfree(description);
+	return spnego_key;
+}
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
new file mode 100644
index 000000000000..f443f3b35134
--- /dev/null
+++ b/fs/cifs/cifs_spnego.h
@@ -0,0 +1,46 @@
+/*
+ *   fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS
+ *
+ *   Copyright (c) 2007 Red Hat, Inc.
+ *   Author(s): Jeff Layton (jlayton@redhat.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _CIFS_SPNEGO_H
+#define _CIFS_SPNEGO_H
+
+#define CIFS_SPNEGO_UPCALL_VERSION 1
+
+/*
+ * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
+ * The flags field is for future use. The request-key callout should set
+ * sesskey_len and secblob_len, and then concatenate the SessKey+SecBlob
+ * and stuff it in the data field.
+ */
+struct cifs_spnego_msg {
+	uint32_t	version;
+	uint32_t	flags;
+	uint32_t	sesskey_len;
+	uint32_t	secblob_len;
+	uint8_t		data[1];
+};
+
+#ifdef __KERNEL__
+extern struct key_type cifs_spnego_key_type;
+#endif /* KERNEL */
+
+#endif /* _CIFS_SPNEGO_H */
-- 
cgit v1.2.3


From 63d2583f5a1a0b72fea3f2171f23f0ca8fa556ec Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 5 Nov 2007 21:46:10 +0000
Subject: [CIFS] Fix walking out end of cifs dacl

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c     |  4 ++--
 fs/cifs/cifsacl.h     |  3 +++
 fs/cifs/cifsencrypt.c |  4 ++--
 fs/cifs/netmisc.c     |  2 +-
 fs/cifs/readdir.c     |  2 +-
 fs/cifs/smbencrypt.c  | 14 ++++++++------
 fs/cifs/xattr.c       |  4 ++--
 7 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index bd75a3b8caff..38d09fa8c1e6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -327,7 +327,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
 				le32_to_cpu(pntsd->gsidoffset));
 	dacloffset = le32_to_cpu(pntsd->dacloffset);
-	dacl_ptr = (struct cifs_acl *)(char *)pntsd + dacloffset;
+	dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
 #ifdef CONFIG_CIFS_DEBUG2
 	cFYI(1, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
 		 "sacloffset 0x%x dacloffset 0x%x",
@@ -346,7 +346,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 
 	if (dacloffset)
 		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
-		group_sid_ptr, inode);
+			   group_sid_ptr, inode);
 	else
 		cFYI(1, ("no ACL")); /* BB grant all or default perms? */
 
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 30b0caf66786..93a7c3462ea2 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -35,6 +35,9 @@
 #define UBITSHIFT	6
 #define GBITSHIFT	3
 
+#define ACCESS_ALLOWED	0
+#define ACCESS_DENIED	1
+
 struct cifs_ntsd {
 	__le16 revision; /* revision level */
 	__le16 type;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 788f0ad6feda..4ff8939c6cc7 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -108,7 +108,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		/* The first entry includes a length field (which does not get
 		   signed that occupies the first 4 bytes before the header */
 		if (i == 0) {
-			if (iov[0].iov_len <= 8 ) /* cmd field at offset 9 */
+			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
 			MD5Update(&context, iov[0].iov_base+4,
 				  iov[0].iov_len-4);
@@ -123,7 +123,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 
 
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
-		   __u32 * pexpected_response_sequence_number)
+		   __u32 *pexpected_response_sequence_number)
 {
 	int rc = 0;
 	char smb_signature[20];
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e1704da43836..646e1f06941b 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -770,7 +770,7 @@ cifs_print_status(__u32 status_code)
 
 
 static void
-ntstatus_to_dos(__u32 ntstatus, __u8 * eclass, __u16 * ecode)
+ntstatus_to_dos(__u32 ntstatus, __u8 *eclass, __u16 *ecode)
 {
 	int i;
 	if (ntstatus == 0) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 3746580e9701..82497d47429a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -495,7 +495,7 @@ ffirst_retry:
 static int cifs_unicode_bytelen(char *str)
 {
 	int len;
-	__le16 * ustr = (__le16 *)str;
+	__le16 *ustr = (__le16 *)str;
 
 	for (len = 0; len <= PATH_MAX; len++) {
 		if (ustr[len] == 0)
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index bd3c4674f2ba..58bbfd992cc0 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -80,7 +80,7 @@ SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 
 /* Routines for Windows NT MD4 Hash functions. */
 static int
-_my_wcslen(__u16 * str)
+_my_wcslen(__u16 *str)
 {
 	int len = 0;
 	while (*str++ != 0)
@@ -96,7 +96,7 @@ _my_wcslen(__u16 * str)
  */
 
 static int
-_my_mbstowcs(__u16 * dst, const unsigned char *src, int len)
+_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
 {	/* BB not a very good conversion routine - change/fix */
 	int i;
 	__u16 val;
@@ -125,9 +125,9 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	/* Password cannot be longer than 128 characters */
 	if (passwd) {
 		len = strlen((char *) passwd);
-		if (len > 128) {
+		if (len > 128)
 			len = 128;
-		}
+
 		/* Password must be converted to NT unicode */
 		_my_mbstowcs(wpwd, passwd, len);
 	} else
@@ -189,8 +189,10 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
 		return;
 	dom_u = user_u + 1024;
 
-	/* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
-	   push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
+	/* push_ucs2(NULL, user_u, user_n, (user_l+1)*2,
+			STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER);
+	   push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2,
+			STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */
 
 	/* BB user and domain may need to be uppercased */
 	user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage);
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 12b125ff0bd0..54e8ef96cb79 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -267,7 +267,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			int oplock = FALSE;
 			struct cifs_ntsd *pacl = NULL;
 			__u32 buflen = 0;
-			if (experimEnabled) 
+			if (experimEnabled)
 				rc = CIFSSMBOpen(xid, pTcon, full_path,
 					FILE_OPEN, GENERIC_READ, 0, &fid,
 					&oplock, NULL, cifs_sb->local_nls,
@@ -275,7 +275,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 			/* else rc is EOPNOTSUPP from above */
 
-			if(rc == 0) {
+			if (rc == 0) {
 				rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
 						      &buflen);
 				CIFSSMBClose(xid, pTcon, fid);
-- 
cgit v1.2.3


From 6551198a201a70cb11e25712b1d0b2a369bb8a4c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 5 Nov 2007 14:50:57 -0800
Subject: fs/afs/vlocation.c: fix off-by-one

This patch fixes an off-by-one error spotted by the Coverity checker.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/vlocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 7b4bbe48112d..849fc3160cb5 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -382,7 +382,7 @@ struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
 	       cell->name, key_serial(key),
 	       (int) namesz, (int) namesz, name, namesz);
 
-	if (namesz > sizeof(vl->vldb.name)) {
+	if (namesz >= sizeof(vl->vldb.name)) {
 		_leave(" = -ENAMETOOLONG");
 		return ERR_PTR(-ENAMETOOLONG);
 	}
-- 
cgit v1.2.3


From 778d1a2bd42ae862a6c6d20a1c3af5e45b3c1924 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Mon, 5 Nov 2007 14:51:03 -0800
Subject: eCryptfs: increment extent_offset once per loop iteration

The extent_offset is getting incremented twice per loop iteration through any
given page.  It should only be getting incremented once.  This bug should only
impact hosts with >4K page sizes.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 9d70289f7df3..9ea4769fbb66 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -504,7 +504,6 @@ int ecryptfs_encrypt_page(struct page *page)
 					"\n", rc);
 			goto out;
 		}
-		extent_offset++;
 	}
 out:
 	kfree(enc_extent_virt);
@@ -640,7 +639,6 @@ int ecryptfs_decrypt_page(struct page *page)
 			       "rc = [%d]\n", __FUNCTION__, rc);
 			goto out;
 		}
-		extent_offset++;
 	}
 out:
 	kfree(enc_extent_virt);
-- 
cgit v1.2.3


From 8a29f2b0288ba2a8fb302f9a639521ac9ff302e5 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Mon, 5 Nov 2007 14:51:04 -0800
Subject: eCryptfs: release mutex on hash error path

Release the crypt_stat hash mutex on allocation error. Check for error
conditions when doing crypto hash calls.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Reported-by: Kazuki Ohta <kazuki.ohta@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 9ea4769fbb66..bbed2fd40fdc 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -115,11 +115,29 @@ static int ecryptfs_calculate_md5(char *dst,
 		}
 		crypt_stat->hash_tfm = desc.tfm;
 	}
-	crypto_hash_init(&desc);
-	crypto_hash_update(&desc, &sg, len);
-	crypto_hash_final(&desc, dst);
-	mutex_unlock(&crypt_stat->cs_hash_tfm_mutex);
+	rc = crypto_hash_init(&desc);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error initializing crypto hash; rc = [%d]\n",
+		       __FUNCTION__, rc);
+		goto out;
+	}
+	rc = crypto_hash_update(&desc, &sg, len);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error updating crypto hash; rc = [%d]\n",
+		       __FUNCTION__, rc);
+		goto out;
+	}
+	rc = crypto_hash_final(&desc, dst);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error finalizing crypto hash; rc = [%d]\n",
+		       __FUNCTION__, rc);
+		goto out;
+	}
 out:
+	mutex_unlock(&crypt_stat->cs_hash_tfm_mutex);
 	return rc;
 }
 
-- 
cgit v1.2.3


From dda6b022f3222f09d3fb49f5dfabd31d33e0d10b Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Tue, 6 Nov 2007 08:02:53 -0600
Subject: 9p: fix memory leak in v9fs_get_sb

This patch fixes a memory leak in v9fs_get_sb.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Acked-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bb0cef9a6b8a..678c02f1ae23 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -119,6 +119,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
+	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -164,10 +165,12 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
 	v9fs_stat2inode(st, root->d_inode, sb);
 	v9fs_fid_add(root, fid);
+	kfree(st);
 
 	return simple_set_mnt(mnt, sb);
 
 error:
+	kfree(st);
 	if (fid)
 		p9_client_clunk(fid);
 
-- 
cgit v1.2.3


From 8999e04f3b7930f0c6f091a541237de51d8dd372 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Tue, 6 Nov 2007 08:02:53 -0600
Subject: 9p: use copy of the options value instead of original

The v9fs_parse_options function uses strsep, which modifies the value of the
v9ses->options field. That modified value is later passed to the function
that creates the transport, potentially causing the transport creation
function to fail.

This patch creates a copy of the v9ses->options field that the
v9fs_parse_options function uses instead of the original value.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Acked-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 756f7e9beb2e..fbb12dadba83 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -82,7 +82,7 @@ static match_table_t tokens = {
 
 static void v9fs_parse_options(struct v9fs_session_info *v9ses)
 {
-	char *options = v9ses->options;
+	char *options;
 	substring_t args[MAX_OPT_ARGS];
 	char *p;
 	int option;
@@ -96,9 +96,10 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->cache = 0;
 	v9ses->trans = v9fs_default_trans();
 
-	if (!options)
+	if (!v9ses->options)
 		return;
 
+	options = kstrdup(v9ses->options, GFP_KERNEL);
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
 		if (!*p)
@@ -169,6 +170,7 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
 			continue;
 		}
 	}
+	kfree(options);
 }
 
 /**
-- 
cgit v1.2.3


From 0af4bd38876416d945ad6a1338798696604952a1 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 24 Oct 2007 18:23:27 +0200
Subject: [2.6 patch] make ocfs2_find_entry_el() static

ocfs2_find_entry_el() can become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dir.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 6a2f143e269c..63b28fdceb4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -208,9 +208,9 @@ out:
 	return NULL;
 }
 
-struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
-					struct inode *dir,
-					struct ocfs2_dir_entry **res_dir)
+static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
+					       struct inode *dir,
+					       struct ocfs2_dir_entry **res_dir)
 {
 	struct super_block *sb;
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
-- 
cgit v1.2.3


From 3cf0c507dd28de0e1a4c24304d806e6b3976f0f5 Mon Sep 17 00:00:00 2001
From: Roel Kluin <12o3l@tiscali.nl>
Date: Sat, 27 Oct 2007 00:20:36 +0200
Subject: [PATCH] Fix priority mistakes in fs/ocfs2/{alloc.c, dlmglue.c}

Fixes priority mistakes similar to '!x & y'

Signed-off-by: Roel Kluin <12o3l@tiscali.nl>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c   | 2 +-
 fs/ocfs2/dlmglue.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4ba7f0bdc248..ce62c152823d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3946,7 +3946,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	struct ocfs2_merge_ctxt ctxt;
 	struct ocfs2_extent_list *rightmost_el;
 
-	if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
+	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
 		ret = -EIO;
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 41c76ff2fcfb..ef09fd20f3a5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -670,7 +670,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
 {
 	mlog_entry_void();
 
-	BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
 
 	if (lockres->l_requested > LKM_NLMODE &&
-- 
cgit v1.2.3


From 019d1b2247c6898589560c6f3b3e7ec280b0010a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 5 Oct 2007 12:09:05 -0700
Subject: ocfs2: Create locks at initially requested level

If we have not yet created a cluster lock, ocfs2_cluster_lock() will
first create it at NLMODE, and then convert the lock to either PRMODE or
EXMODE (whichever is requested).

Change ocfs2_cluster_lock() to just create the lock at the initially
requested level. ocfs2_locking_ast() handles this case fine, so the only
update required was in setup of locking state. This should reduce the number
of network messages required for a new lock by one, providing an incremental
performance enhancement.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlmglue.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ef09fd20f3a5..4e97dcceaf8f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -980,18 +980,6 @@ again:
 		goto unlock;
 	}
 
-	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
-		/* lock has not been created yet. */
-		spin_unlock_irqrestore(&lockres->l_lock, flags);
-
-		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-		goto again;
-	}
-
 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
 	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
 		/* is the lock is currently blocked on behalf of
@@ -1006,7 +994,14 @@ again:
 			mlog(ML_ERROR, "lockres %s has action %u pending\n",
 			     lockres->l_name, lockres->l_action);
 
-		lockres->l_action = OCFS2_AST_CONVERT;
+		if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+			lockres->l_action = OCFS2_AST_ATTACH;
+			lkm_flags &= ~LKM_CONVERT;
+		} else {
+			lockres->l_action = OCFS2_AST_CONVERT;
+			lkm_flags |= LKM_CONVERT;
+		}
+
 		lockres->l_requested = level;
 		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -1021,7 +1016,7 @@ again:
 		status = dlmlock(osb->dlm,
 				 level,
 				 &lockres->l_lksb,
-				 lkm_flags|LKM_CONVERT,
+				 lkm_flags,
 				 lockres->l_name,
 				 OCFS2_LOCK_ID_MAX_LEN - 1,
 				 ocfs2_locking_ast,
-- 
cgit v1.2.3


From 9f70968af3e6e21612e06e153aa71c62dee5a09b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 18 Oct 2007 12:36:10 -0700
Subject: ocfs2: Re-order iput in ocfs2_drop_dentry_lock

Do this to avoid a theoretical (I haven't seen this in practice) race where
the downconvert thread might drop the dentry lock, allowing a remote unlink
to proceed before dropping the inode locks. This could bounce access to the
orphan dir between nodes.

There doesn't seem to be a need to do the same in ocfs2_dentry_iput() as
that's never called for the last ref drop from the downconvert thread.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 3094ddb7a254..1957a5ed219e 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -318,9 +318,9 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 				   struct ocfs2_dentry_lock *dl)
 {
+	iput(dl->dl_inode);
 	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
 	ocfs2_lock_res_free(&dl->dl_lockres);
-	iput(dl->dl_inode);
 	kfree(dl);
 }
 
-- 
cgit v1.2.3


From 9ea2d32f40434589ea0e136373f7d1545afb411f Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 18 Oct 2007 14:14:45 -0700
Subject: ocfs2: Commit journal on sync writes

We're missing a meta data commit for extending sync writes. In theory, write
could return with the meta data required to read the data uncommitted to
disk. Fix that by detecting an allocating write and forcing a journal commit
in the sync case.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f92fe91ff260..bbac7cd33e0b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1891,9 +1891,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	ssize_t written = 0;
 	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
-	loff_t *ppos = &iocb->ki_pos;
+	loff_t old_size, *ppos = &iocb->ki_pos;
+	u32 old_clusters;
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry("(0x%p, %u, '%.*s')\n", file,
 		   (unsigned int)nr_segs,
@@ -1949,6 +1951,13 @@ relock:
 		goto relock;
 	}
 
+	/*
+	 * To later detect whether a journal commit for sync writes is
+	 * necessary, we sample i_size, and cluster count here.
+	 */
+	old_size = i_size_read(inode);
+	old_clusters = OCFS2_I(inode)->ip_clusters;
+
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb, rw_level);
 
@@ -1978,6 +1987,21 @@ out_dio:
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
+	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
+		/*
+		 * The generic write paths have handled getting data
+		 * to disk, but since we don't make use of the dirty
+		 * inode list, a manual journal commit is necessary
+		 * here.
+		 */
+		if (old_size != i_size_read(inode) ||
+		    old_clusters != OCFS2_I(inode)->ip_clusters) {
+			ret = journal_force_commit(osb->journal->j_journal);
+			if (ret < 0)
+				written = ret;
+		}
+	}
+
 	/* 
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
 	 * function pointer which is called when o_direct io completes so that
-- 
cgit v1.2.3


From 4e9563fd55ff4479f2b118d0757d121dd0cfc39c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 1 Nov 2007 11:37:48 -0700
Subject: ocfs2: fix write() performance regression

On file systems which don't support sparse files, ocfs2_map_page_blocks()
was reading blocks on appending writes. This caused write performance to
suffer dramatically. Fix this by detecting an appending write on a nonsparse
fs and skipping the read.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c69c1b300155..556e34ccb005 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -728,6 +728,27 @@ static void ocfs2_clear_page_regions(struct page *page,
 	kunmap_atomic(kaddr, KM_USER0);
 }
 
+/*
+ * Nonsparse file systems fully allocate before we get to the write
+ * code. This prevents ocfs2_write() from tagging the write as an
+ * allocating one, which means ocfs2_map_page_blocks() might try to
+ * read-in the blocks at the tail of our file. Avoid reading them by
+ * testing i_size against each block offset.
+ */
+static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+				 unsigned int block_start)
+{
+	u64 offset = page_offset(page) + block_start;
+
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		return 1;
+
+	if (i_size_read(inode) > offset)
+		return 1;
+
+	return 0;
+}
+
 /*
  * Some of this taken from block_prepare_write(). We already have our
  * mapping by now though, and the entire write will be allocating or
@@ -781,6 +802,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 				set_buffer_uptodate(bh);
 		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
 			   !buffer_new(bh) &&
+			   ocfs2_should_read_blk(inode, page, block_start) &&
 			   (block_start < from || block_end > to)) {
 			ll_rw_block(READ, 1, &bh);
 			*wait_bh++=bh;
-- 
cgit v1.2.3


From bc7e97cbdd4bef162e5772c74ee2cc4487a2d997 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 10 Oct 2007 16:25:42 +0200
Subject: [PATCH] Fix possibly too long write in o2hb_setup_one_bio()

We should subtract start of our IO from PAGE_CACHE_SIZE to get the right
length of the write we want to perform.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9cc7c0418b70..f02ccb34604d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -267,7 +267,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 		current_page = cs / spp;
 		page = reg->hr_slot_data[current_page];
 
-		vec_len = min(PAGE_CACHE_SIZE,
+		vec_len = min(PAGE_CACHE_SIZE - vec_start,
 			      (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
 
 		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
-- 
cgit v1.2.3


From e325a88f17196f18888f6e1426eb9fe3b4346d28 Mon Sep 17 00:00:00 2001
From: Srinivas Eeda <srinivas.eeda@oracle.com>
Date: Wed, 31 Oct 2007 16:49:43 -0700
Subject: ocfs2: fix rename vs unlink race

If another node unlinks the destination while ocfs2_rename() is waiting on a
cluster lock, ocfs2_rename() simply logs an error and continues. This causes
a crash because the renaming node is now trying to delete a non-existent
inode. The correct solution is to return -ENOENT.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/namei.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 729259016c18..989ac2718587 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1105,9 +1105,16 @@ static int ocfs2_rename(struct inode *old_dir,
 		goto bail;
 	}
 
-	if (!new_de && new_inode)
-		mlog(ML_ERROR, "inode %lu does not exist in it's parent "
-		     "directory!", new_inode->i_ino);
+	if (!new_de && new_inode) {
+		/*
+		 * Target was unlinked by another node while we were
+		 * waiting to get to ocfs2_rename(). There isn't
+		 * anything we can do here to help the situation, so
+		 * bubble up the appropriate error.
+		 */
+		status = -ENOENT;
+		goto bail;
+	}
 
 	/* In case we need to overwrite an existing file, we blow it
 	 * away first */
-- 
cgit v1.2.3


From 44656ba1286d82b5a5f8817eb2e4ea744143c3ca Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 7 Nov 2007 04:10:52 -0800
Subject: [NET]: Kill proc_net_create()

There are no more users.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 749def054a34..153554cf5575 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -26,13 +26,6 @@
 #include "internal.h"
 
 
-struct proc_dir_entry *proc_net_create(struct net *net,
-	const char *name, mode_t mode, get_info_t *get_info)
-{
-	return create_proc_info_entry(name,mode, net->proc_net, get_info);
-}
-EXPORT_SYMBOL_GPL(proc_net_create);
-
 struct proc_dir_entry *proc_net_fops_create(struct net *net,
 	const char *name, mode_t mode, const struct file_operations *fops)
 {
-- 
cgit v1.2.3


From df61c952622f51facac21dd8dfa4d8a24dcb9657 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 6 Nov 2007 23:48:57 -0800
Subject: [DLM] lowcomms: Do not muck with sysctl_rmem_max.

Use SO_RCVBUFFORCE instead.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/dlm/lowcomms.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 58bf3f5cdbe2..e9923ca9c2d9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1062,7 +1062,7 @@ static int sctp_listen_for_all(void)
 	subscribe.sctp_shutdown_event = 1;
 	subscribe.sctp_partial_delivery_event = 1;
 
-	result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+	result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE,
 				 (char *)&bufsize, sizeof(bufsize));
 	if (result)
 		log_print("Error increasing buffer space on socket %d", result);
@@ -1454,10 +1454,6 @@ int dlm_lowcomms_start(void)
 	if (!con_cache)
 		goto out;
 
-	/* Set some sysctl minima */
-	if (sysctl_rmem_max < NEEDED_RMEM)
-		sysctl_rmem_max = NEEDED_RMEM;
-
 	/* Start listening */
 	if (dlm_config.ci_protocol == 0)
 		error = tcp_listen_for_all();
-- 
cgit v1.2.3


From 8ec680e4c3ec818efd1652f15199ed1c216ab550 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 7 Nov 2007 13:54:07 +0100
Subject: ioprio: allow sys_ioprio_set() value of 0 to reset ioprio setting

Normally io priorities follow the CPU nice, unless a specific scheduling
class has been set. Once that is set, there's no way to reset the
behaviour to 'none' so that it follows CPU nice again.

Currently passing in 0 as the ioprio class/value will return -1/EINVAL,
change that to allow resetting of a set scheduling class.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/ioprio.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index d6ff77e8e7ec..e4e01bc7f338 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -78,6 +78,10 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 			if (!capable(CAP_SYS_ADMIN))
 				return -EPERM;
 			break;
+		case IOPRIO_CLASS_NONE:
+			if (data)
+				return -EINVAL;
+			break;
 		default:
 			return -EINVAL;
 	}
-- 
cgit v1.2.3


From 9eae8a8903c3d90283d338fad2cc58f2eb90adcb Mon Sep 17 00:00:00 2001
From: Igor Mammedov <niallan@gmail.com>
Date: Thu, 8 Nov 2007 16:13:31 +0000
Subject: [CIFS] Add uid to key description so krb can handle user mounts

Adds uid to key description for supporting user mounts
and makes minor formatting changes.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index e142faf2d0ae..ad54a3a6e434 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -76,9 +76,10 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
 	struct key *spnego_key;
 
 
-	/* version + ;ip{4|6}= + address + ;host=hostname + ;sec= + NULL */
-	desc_len = 2 + 5 + 32 + 1 + 5 + strlen(hostname) +
-		   strlen(";sec=krb5") + 1;
+	/* version + ;ip{4|6}= + address + ;host=hostname +
+		;sec= + ;uid= + NULL */
+	desc_len = 4 + 5 + 32 + 1 + 5 + strlen(hostname) +
+		   strlen(";sec=krb5") + 7 + sizeof(uid_t)*2 + 1;
 	spnego_key = ERR_PTR(-ENOMEM);
 	description = kzalloc(desc_len, GFP_KERNEL);
 	if (description == NULL)
@@ -87,7 +88,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
 	dp = description;
 	/* start with version and hostname portion of UNC string */
 	spnego_key = ERR_PTR(-EINVAL);
-	sprintf(dp, "%2.2x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+	sprintf(dp, "0x%2.2x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
 		hostname);
 	dp = description + strlen(description);
 
@@ -109,6 +110,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
 	else
 		goto out;
 
+	dp = description + strlen(description);
+	sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+
 	cFYI(1, ("key description = %s", description));
 	spnego_key = request_key(&cifs_spnego_key_type, description, "");
 
-- 
cgit v1.2.3


From 15b0395911eb45a0834755f0d9e84570644a8c22 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 8 Nov 2007 17:57:40 +0000
Subject: [CIFS] Fix incorrect mode when ACL had deny access control entries

When mounted with the cifsacl mount option, we were
treating any deny ACEs found like allow ACEs, and it turns out that for
SFU and SUA Windows sets these types of access control entries often.
The order of ACEs is important too.  The canonical order that most
ACL tools and Windows Explorer construct ACLs with is to begin with
DENY entries then follow with ALLOW, otherwise an allow entry
could be encountered first, making the subsequent deny entry like "dead
code" which would be superfluous since Windows stops when a match is
made for the operation you are trying to perform for your user.

We start with no permissions in the mode and build up as we find
permissions (ie allow ACEs).  This fixes deny ACEs so they affect
the mask used to set the subsequent allow ACEs.

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
CC: Alexander Bokovoy <ab@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 38d09fa8c1e6..ec445802d903 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -134,12 +134,39 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
    pmode is the existing mode (we only want to overwrite part of this
    bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007
 */
-static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
-				 umode_t bits_to_set)
+static void access_flags_to_mode(__u32 ace_flags, int type, umode_t *pmode,
+				 umode_t *pbits_to_set)
 {
+	/* the order of ACEs is important.  The canonical order is to begin with
+	   DENY entries then follow with ALLOW, otherwise an allow entry could be
+	   encountered first, making the subsequent deny entry like "dead code"
+           which would be superflous since Windows stops when a match is made 
+	   for the operation you are trying to perform for your user */
+
+	/* For deny ACEs we change the mask so that subsequent allow access
+	   control entries do not turn on the bits we are denying */
+	if (type == ACCESS_DENIED) {
+		if (ace_flags & GENERIC_ALL) {
+			*pbits_to_set &= ~S_IRWXUGO;
+		}
+		if ((ace_flags & GENERIC_WRITE) ||
+			((ace_flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
+			*pbits_to_set &= ~S_IWUGO;
+		if ((ace_flags & GENERIC_READ) ||
+			((ace_flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
+			*pbits_to_set &= ~S_IRUGO;
+		if ((ace_flags & GENERIC_EXECUTE) ||
+			((ace_flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
+			*pbits_to_set &= ~S_IXUGO;
+		return;
+	} else if (type != ACCESS_ALLOWED) {
+		cERROR(1, ("unknown access control type %d", type));
+		return;
+	}
+	/* else ACCESS_ALLOWED type */
 
 	if (ace_flags & GENERIC_ALL) {
-		*pmode |= (S_IRWXUGO & bits_to_set);
+		*pmode |= (S_IRWXUGO & (*pbits_to_set));
 #ifdef CONFIG_CIFS_DEBUG2
 		cFYI(1, ("all perms"));
 #endif
@@ -147,13 +174,13 @@ static void access_flags_to_mode(__u32 ace_flags, umode_t *pmode,
 	}
 	if ((ace_flags & GENERIC_WRITE) ||
 			((ace_flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
-		*pmode |= (S_IWUGO & bits_to_set);
+		*pmode |= (S_IWUGO & (*pbits_to_set));
 	if ((ace_flags & GENERIC_READ) ||
 			((ace_flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
-		*pmode |= (S_IRUGO & bits_to_set);
+		*pmode |= (S_IRUGO & (*pbits_to_set));
 	if ((ace_flags & GENERIC_EXECUTE) ||
 			((ace_flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
-		*pmode |= (S_IXUGO & bits_to_set);
+		*pmode |= (S_IXUGO & (*pbits_to_set));
 
 #ifdef CONFIG_CIFS_DEBUG2
 	cFYI(1, ("access flags 0x%x mode now 0x%x", ace_flags, *pmode));
@@ -239,6 +266,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 	num_aces = le32_to_cpu(pdacl->num_aces);
 	if (num_aces  > 0) {
+		umode_t user_mask = S_IRWXU;
+		umode_t group_mask = S_IRWXG;
+		umode_t other_mask = S_IRWXO;
+
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
 
@@ -253,13 +284,19 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 #endif
 			if (compare_sids(&(ppace[i]->sid), pownersid))
 				access_flags_to_mode(ppace[i]->access_req,
-						&(inode->i_mode), S_IRWXU);
+						     ppace[i]->type,
+						     &(inode->i_mode),
+						     &user_mask);
 			if (compare_sids(&(ppace[i]->sid), pgrpsid))
 				access_flags_to_mode(ppace[i]->access_req,
-						&(inode->i_mode), S_IRWXG);
+						     ppace[i]->type,
+						     &(inode->i_mode),
+						     &group_mask);
 			if (compare_sids(&(ppace[i]->sid), &sid_everyone))
 				access_flags_to_mode(ppace[i]->access_req,
-						&(inode->i_mode), S_IRWXO);
+						     ppace[i]->type,
+						     &(inode->i_mode),
+						     &other_mask);
 
 /*			memcpy((void *)(&(cifscred->aces[i])),
 				(void *)ppace[i],
-- 
cgit v1.2.3


From ce06c9f025120dbb2978d9b84641d76c25f17902 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 8 Nov 2007 21:12:01 +0000
Subject: [CIFS] add mode to acl conversion helper function

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  3 +++
 fs/cifs/cifsacl.c | 35 +++++++++++++++++++++++++++++++++--
 fs/cifs/cifsfs.h  |  2 +-
 3 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 6d3e736612ba..53629b8bc8a8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,6 @@
+Version 1.52
+------------
+
 Version 1.51
 ------------
 Fix memory leak in statfs when mounted to very old servers (e.g.
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index ec445802d903..dabbce00712b 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -138,9 +138,9 @@ static void access_flags_to_mode(__u32 ace_flags, int type, umode_t *pmode,
 				 umode_t *pbits_to_set)
 {
 	/* the order of ACEs is important.  The canonical order is to begin with
-	   DENY entries then follow with ALLOW, otherwise an allow entry could be
+	   DENY entries followed by ALLOW, otherwise an allow entry could be
 	   encountered first, making the subsequent deny entry like "dead code"
-           which would be superflous since Windows stops when a match is made 
+	   which would be superflous since Windows stops when a match is made
 	   for the operation you are trying to perform for your user */
 
 	/* For deny ACEs we change the mask so that subsequent allow access
@@ -188,6 +188,37 @@ static void access_flags_to_mode(__u32 ace_flags, int type, umode_t *pmode,
 	return;
 }
 
+/*
+   Generate access flags to reflect permissions mode is the existing mode.
+   This function is called for every ACE in the DACL whose SID matches
+   with either owner or group or everyone.
+*/
+
+static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
+				__u32 *pace_flags)
+{
+	/* reset access mask */
+	*pace_flags = 0x0;
+
+	/* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */
+	mode &= bits_to_use;
+
+	/* check for R/W/X UGO since we do not know whose flags
+	   is this but we have cleared all the bits sans RWX for
+	   either user or group or other as per bits_to_use */
+	if (mode & S_IRUGO)
+		*pace_flags |= SET_FILE_READ_RIGHTS;
+	if (mode & S_IWUGO)
+		*pace_flags |= SET_FILE_WRITE_RIGHTS;
+	if (mode & S_IXUGO)
+		*pace_flags |= SET_FILE_EXEC_RIGHTS;
+
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags));
+#endif
+	return;
+}
+
 
 #ifdef CONFIG_CIFS_DEBUG2
 static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0a3ee5a322b0..62357d228c07 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -106,5 +106,5 @@ extern int cifs_ioctl(struct inode *inode, struct file *filep,
 extern struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.51"
+#define CIFS_VERSION   "1.52"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.3


From a6f8de3d9b124c95893054fd2a78bc7be5bb9000 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 8 Nov 2007 23:10:32 +0000
Subject: [CIFS] Fix stale mode after readdir when cifsacl specified

When mounted with cifsacl mount option, readdir can not
instantiate the inode with the estimated mode based on the ACL
for each file since we have not queried for the ACL for
each of these files yet.  So set the refresh time to zero
for these inodes so that the next stat will cause the client
to go to the server for the ACL info so we can build the estimated
mode (this means we also will issue an extra QueryPathInfo if
the stat happens within 1 second, but this is trivial compared to
the time required to open/getacl/close for each).

ls -l is slower when cifsacl mount option is specified, but
displays correct mode information.

Signed-off-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 82497d47429a..0f22def4bdff 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -171,7 +171,13 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 	/* Linux can not store file creation time unfortunately so ignore it */
 
 	cifsInfo->cifsAttrs = attr;
-	cifsInfo->time = jiffies;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+		/* get more accurate mode via ACL - so force inode refresh */
+		cifsInfo->time = 0;
+	} else
+#endif /* CONFIG_CIFS_EXPERIMENTAL */
+		cifsInfo->time = jiffies;
 
 	/* treat dos attribute of read-only as read-only mode bit e.g. 555? */
 	/* 2767 perms - indicate mandatory locking */
-- 
cgit v1.2.3


From 9b8f5f573770f33b28c45255ac82e6457278c782 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 9 Nov 2007 23:25:04 +0000
Subject: [CIFS] fix oops on second mount to same server when null auth is used

When a share is mounted using no username, cifs_mount sets
volume_info.username as a NULL pointer, and the sesInfo userName as an
empty string. The volume_info.username is passed to a couple of other
functions to see if there is an existing unc or tcp connection that can
be used. These functions assume that the username will be a valid
string that can be passed to strncmp. If the pointer is NULL, then the
kernel will oops if there's an existing session to which the string
can be compared.

This patch changes cifs_mount to set volume_info.username to an empty
string in this situation, which prevents the oops and should allow
comparisons against other null auth sessions to match.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   | 1 +
 fs/cifs/connect.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 53629b8bc8a8..64dd22239b21 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,5 +1,6 @@
 Version 1.52
 ------------
+Fix oops on second mount to server when null auth is used.
 
 Version 1.51
 ------------
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 380ee9991f20..1102160f6661 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1790,7 +1790,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	if (volume_info.nullauth) {
 		cFYI(1, ("null user"));
-		volume_info.username = NULL;
+		volume_info.username = "";
 	} else if (volume_info.username) {
 		/* BB fixme parse for domain name here */
 		cFYI(1, ("Username: %s", volume_info.username));
-- 
cgit v1.2.3


From 00ec99da43a7c2aed46c6595aa271b84bb1b1462 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sun, 11 Nov 2007 19:13:43 -0800
Subject: core dump: remain dumpable

The coredump code always calls set_dumpable(0) when it starts (even
if RLIMIT_CORE prevents any core from being dumped).  The effect of
this (via task_dumpable) is to make /proc/pid/* files owned by root
instead of the user, so the user can no longer examine his own
process--in a case where there was never any privileged data to
protect.  This affects e.g. auxv, environ, fd; in Fedora (execshield)
kernels, also maps.  In practice, you can only notice this when a
debugger has requested PTRACE_EVENT_EXIT tracing.

set_dumpable was only used in do_coredump for synchronization and not
intended for any security purpose.  (It doesn't secure anything that wasn't
already unsecured when a process dies by SIGTERM instead of SIGQUIT.)

This changes do_coredump to check the core_waiters count as the means of
synchronization, which is sufficient.  Now we leave the "dumpable" bits alone.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 2c942e2d14ea..4ccaaa4b13b2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1692,7 +1692,10 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
 	down_write(&mm->mmap_sem);
-	if (!get_dumpable(mm)) {
+	/*
+	 * If another thread got here first, or we are not dumpable, bail out.
+	 */
+	if (mm->core_waiters || !get_dumpable(mm)) {
 		up_write(&mm->mmap_sem);
 		goto fail;
 	}
@@ -1706,7 +1709,6 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		flag = O_EXCL;		/* Stop rewrite attacks */
 		current->fsuid = 0;	/* Dump root private */
 	}
-	set_dumpable(mm, 0);
 
 	retval = coredump_wait(exit_code);
 	if (retval < 0)
-- 
cgit v1.2.3


From ac8587dcb58e40dd336d99d60f852041e06cc3dd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 12 Nov 2007 16:05:02 -0500
Subject: knfsd: fix spurious EINVAL errors on first access of new filesystem

The v2/v3 acl code in nfsd is translating any return from fh_verify() to
nfserr_inval.  This is particularly unfortunate in the case of an
nfserr_dropit return, which is an internal error meant to indicate to
callers that this request has been deferred and should just be dropped
pending the results of an upcall to mountd.

Thanks to Roland <devzero@web.de> for bug report and data collection.

Cc: Roland <devzero@web.de>
Acked-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Reviewed-By: NeilBrown <neilb@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs2acl.c | 2 +-
 fs/nfsd/nfs3acl.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b61742885011..0e5fa11e6b44 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -41,7 +41,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
-		RETURN_STATUS(nfserr_inval);
+		RETURN_STATUS(nfserr);
 
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
 		RETURN_STATUS(nfserr_inval);
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 3e3f2de82c36..b647f2f872dc 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -37,7 +37,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
-		RETURN_STATUS(nfserr_inval);
+		RETURN_STATUS(nfserr);
 
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
 		RETURN_STATUS(nfserr_inval);
-- 
cgit v1.2.3


From 6fa02839bf9412e18e773d04e96182b4cd0b5d57 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 12 Nov 2007 16:05:03 -0500
Subject: nfsd4: recheck for secure ports in fh_verify

As with commit 7fc90ec93a5eb71f4b08403baf5ba7176b3ec6b1 ("knfsd: nfsd:
call nfsd_setuser() on fh_compose(), fix nfsd4 permissions problem")
this is a case where we need to redo a security check in fh_verify()
even though the filehandle already has an associated dentry--if the
filehandle was created by fh_compose() in an earlier operation of the
nfsv4 compound, then we may not have done these checks yet.

Without this fix it is possible, for example, to traverse from an export
without the secure ports requirement to one with it in a single
compound, and bypass the secure port check on the new export.

While we're here, fix up some minor style problems and change a printk()
to a dprintk(), to make it harder for random unprivileged users to spam
the logs.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Reviewed-By: NeilBrown <neilb@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfsfh.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 4f712e970584..468f17a78441 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -95,6 +95,22 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
 	return 0;
 }
 
+static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
+					  struct svc_export *exp)
+{
+	/* Check if the request originated from a secure port. */
+	if (!rqstp->rq_secure && EX_SECURE(exp)) {
+		char buf[RPC_MAX_ADDRBUFLEN];
+		dprintk(KERN_WARNING
+		       "nfsd: request from insecure port %s!\n",
+		       svc_print_addr(rqstp, buf, sizeof(buf)));
+		return nfserr_perm;
+	}
+
+	/* Set user creds for this exportpoint */
+	return nfserrno(nfsd_setuser(rqstp, exp));
+}
+
 /*
  * Perform sanity checks on the dentry in a client's file handle.
  *
@@ -167,18 +183,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 			goto out;
 		}
 
-		/* Check if the request originated from a secure port. */
-		error = nfserr_perm;
-		if (!rqstp->rq_secure && EX_SECURE(exp)) {
-			char buf[RPC_MAX_ADDRBUFLEN];
-			printk(KERN_WARNING
-			       "nfsd: request from insecure port %s!\n",
-			       svc_print_addr(rqstp, buf, sizeof(buf)));
-			goto out;
-		}
-
-		/* Set user creds for this exportpoint */
-		error = nfserrno(nfsd_setuser(rqstp, exp));
+		error = nfsd_setuser_and_check_port(rqstp, exp);
 		if (error)
 			goto out;
 
@@ -227,18 +232,22 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 		fhp->fh_export = exp;
 		nfsd_nr_verified++;
 	} else {
-		/* just rechecking permissions
-		 * (e.g. nfsproc_create calls fh_verify, then nfsd_create does as well)
+		/*
+		 * just rechecking permissions
+		 * (e.g. nfsproc_create calls fh_verify, then nfsd_create
+		 * does as well)
 		 */
 		dprintk("nfsd: fh_verify - just checking\n");
 		dentry = fhp->fh_dentry;
 		exp = fhp->fh_export;
-		/* Set user creds for this exportpoint; necessary even
+		/*
+		 * Set user creds for this exportpoint; necessary even
 		 * in the "just checking" case because this may be a
 		 * filehandle that was created by fh_compose, and that
 		 * is about to be used in another nfsv4 compound
-		 * operation */
-		error = nfserrno(nfsd_setuser(rqstp, exp));
+		 * operation.
+		 */
+		error = nfsd_setuser_and_check_port(rqstp, exp);
 		if (error)
 			goto out;
 	}
-- 
cgit v1.2.3


From 91cf45f02af5c871251165d000c3f42a2a0b0552 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 12 Nov 2007 18:10:39 -0800
Subject: [NET]: Add the helper kernel_sock_shutdown()

...and fix a couple of bugs in the NBD, CIFS and OCFS2 socket handlers.

Looking at the sock->op->shutdown() handlers, it looks as if all of them
take a SHUT_RD/SHUT_WR/SHUT_RDWR argument instead of the
RCV_SHUTDOWN/SEND_SHUTDOWN arguments.
Add a helper, and then define the SHUT_* enum to ensure that kernel users
of shutdown() don't get confused.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Mark Fasheh <mark.fasheh@oracle.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/cifs/connect.c      | 2 +-
 fs/ocfs2/cluster/tcp.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1102160f6661..c52a76ff4bb9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -160,7 +160,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	if (server->ssocket) {
 		cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
 			server->ssocket->flags));
-		server->ssocket->ops->shutdown(server->ssocket, SEND_SHUTDOWN);
+		kernel_sock_shutdown(server->ssocket, SHUT_WR);
 		cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx",
 			server->ssocket->state,
 			server->ssocket->flags));
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 685c18065c82..d84bd155997b 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -58,6 +58,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/kref.h>
+#include <linux/net.h>
 #include <net/tcp.h>
 
 #include <asm/uaccess.h>
@@ -616,8 +617,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
 		del_timer_sync(&sc->sc_idle_timeout);
 		o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
 		sc_put(sc);
-		sc->sc_sock->ops->shutdown(sc->sc_sock,
-					   RCV_SHUTDOWN|SEND_SHUTDOWN);
+		kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
 	}
 
 	/* not fatal so failed connects before the other guy has our
-- 
cgit v1.2.3


From 022cbae611a37eda80d498f8f379794c8ac3be47 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Tue, 13 Nov 2007 03:23:50 -0800
Subject: [NET]: Move unneeded data to initdata section.

This patch reverts Eric's commit 2b008b0a8e96b726c603c5e1a5a7a509b5f61e35

It shrinks the .text & .data sections of the kernel if CONFIG_NET_NS is
not set.  This is safe after the list operations cleanup.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 153554cf5575..131f9c68be5f 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -178,7 +178,7 @@ static __net_exit void proc_net_ns_exit(struct net *net)
 	kfree(net->proc_net_root);
 }
 
-static struct pernet_operations proc_net_ns_ops = {
+static struct pernet_operations __net_initdata proc_net_ns_ops = {
 	.init = proc_net_ns_init,
 	.exit = proc_net_ns_exit,
 };
-- 
cgit v1.2.3


From 0b832a4b93932103d73c0c3f35ef1153e288327b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Tue, 13 Nov 2007 08:07:31 -0800
Subject: Revert "ext2/ext3/ext4: add block bitmap validation"

This reverts commit 7c9e69faa28027913ee059c285a5ea8382e24b5d, fixing up
conflicts in fs/ext4/balloc.c manually.

The cost of doing the bitmap validation on each lookup - even when the
bitmap is cached - is absolutely prohibitive.  We could, and probably
should, do it only when adding the bitmap to the buffer cache.  However,
right now we are better off just reverting it.

Peter Zijlstra measured the cost of this extra validation as an 85%
decrease in cached iozone, and while I had a patch that took it down to
just 17% by not being _quite_ so stupid in the validation, it was still
a big slowdown that could have been avoided by just doing it right.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andreas Dilger <adilger@clusterfs.com>
Cc: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/balloc.c | 50 +++++---------------------------------------------
 fs/ext3/balloc.c | 48 ++++--------------------------------------------
 fs/ext4/balloc.c | 41 -----------------------------------------
 3 files changed, 9 insertions(+), 130 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 18a42de25b55..377ad172d74b 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -69,14 +69,6 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
 	return desc + offset;
 }
 
-static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
-{
-	return ext2_test_bit ((block -
-		le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block)) %
-			 EXT2_BLOCKS_PER_GROUP(sb), map);
-}
-
 /*
  * Read the bitmap for a given block_group, reading into the specified 
  * slot in the superblock's bitmap cache.
@@ -86,51 +78,20 @@ block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
 static struct buffer_head *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
-	int i;
 	struct ext2_group_desc * desc;
 	struct buffer_head * bh = NULL;
-	unsigned int bitmap_blk;
-
+	
 	desc = ext2_get_group_desc (sb, block_group, NULL);
 	if (!desc)
-		return NULL;
-	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
-	bh = sb_bread(sb, bitmap_blk);
+		goto error_out;
+	bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
 	if (!bh)
-		ext2_error (sb, __FUNCTION__,
+		ext2_error (sb, "read_block_bitmap",
 			    "Cannot read block bitmap - "
 			    "block_group = %d, block_bitmap = %u",
 			    block_group, le32_to_cpu(desc->bg_block_bitmap));
-
-	/* check whether block bitmap block number is set */
-	if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-		/* bad block bitmap */
-		goto error_out;
-	}
-	/* check whether the inode bitmap block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
-	if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-		/* bad block bitmap */
-		goto error_out;
-	}
-	/* check whether the inode table block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
-	for (i = 0; i < EXT2_SB(sb)->s_itb_per_group; i++, bitmap_blk++) {
-		if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-			/* bad block bitmap */
-			goto error_out;
-		}
-	}
-
-	return bh;
-
 error_out:
-	brelse(bh);
-	ext2_error(sb, __FUNCTION__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %u",
-			block_group, bitmap_blk);
-	return NULL;
+	return bh;
 }
 
 static void release_blocks(struct super_block *sb, int count)
@@ -1461,7 +1422,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
 #endif
 }
 
-
 static inline int test_root(int a, int b)
 {
 	int num = b;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7a87d15523be..a8ba7e831278 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -80,14 +80,6 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 	return desc + offset;
 }
 
-static inline int
-block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
-{
-	return ext3_test_bit ((block -
-		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
-			 EXT3_BLOCKS_PER_GROUP(sb), map);
-}
-
 /**
  * read_block_bitmap()
  * @sb:			super block
@@ -101,51 +93,20 @@ block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 static struct buffer_head *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
-	int i;
 	struct ext3_group_desc * desc;
 	struct buffer_head * bh = NULL;
-	ext3_fsblk_t bitmap_blk;
 
 	desc = ext3_get_group_desc (sb, block_group, NULL);
 	if (!desc)
-		return NULL;
-	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
-	bh = sb_bread(sb, bitmap_blk);
+		goto error_out;
+	bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
 	if (!bh)
-		ext3_error (sb, __FUNCTION__,
+		ext3_error (sb, "read_block_bitmap",
 			    "Cannot read block bitmap - "
 			    "block_group = %d, block_bitmap = %u",
 			    block_group, le32_to_cpu(desc->bg_block_bitmap));
-
-	/* check whether block bitmap block number is set */
-	if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-		/* bad block bitmap */
-		goto error_out;
-	}
-	/* check whether the inode bitmap block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
-	if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-		/* bad block bitmap */
-		goto error_out;
-	}
-	/* check whether the inode table block number is set */
-	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
-	for (i = 0; i < EXT3_SB(sb)->s_itb_per_group; i++, bitmap_blk++) {
-		if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-			/* bad block bitmap */
-			goto error_out;
-		}
-	}
-
-	return bh;
-
 error_out:
-	brelse(bh);
-	ext3_error(sb, __FUNCTION__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %lu",
-			block_group, bitmap_blk);
-	return NULL;
+	return bh;
 }
 /*
  * The reservation window structure operations
@@ -1772,7 +1733,6 @@ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 #endif
 }
 
-
 static inline int test_root(int a, int b)
 {
 	int num = b;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e906b65448e2..71ee95e534fd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -189,15 +189,6 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	return desc;
 }
 
-static inline int
-block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
-{
-	ext4_grpblk_t offset;
-
-	ext4_get_group_no_and_offset(sb, block, NULL, &offset);
-	return ext4_test_bit (offset, map);
-}
-
 /**
  * read_block_bitmap()
  * @sb:			super block
@@ -211,7 +202,6 @@ block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
 struct buffer_head *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
-	int i;
 	struct ext4_group_desc * desc;
 	struct buffer_head * bh = NULL;
 	ext4_fsblk_t bitmap_blk;
@@ -239,38 +229,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
 			    "Cannot read block bitmap - "
 			    "block_group = %d, block_bitmap = %llu",
 			    block_group, bitmap_blk);
-
-	/* check whether block bitmap block number is set */
-	if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-		/* bad block bitmap */
-		goto error_out;
-	}
-
-	/* check whether the inode bitmap block number is set */
-	bitmap_blk = ext4_inode_bitmap(sb, desc);
-	if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-		/* bad block bitmap */
-		goto error_out;
-	}
-	/* check whether the inode table block number is set */
-	bitmap_blk = ext4_inode_table(sb, desc);
-	for (i = 0; i < EXT4_SB(sb)->s_itb_per_group; i++, bitmap_blk++) {
-		if (!block_in_use(bitmap_blk, sb, bh->b_data)) {
-			/* bad block bitmap */
-			goto error_out;
-		}
-	}
-
 	return bh;
-
-error_out:
-	brelse(bh);
-	ext4_error(sb, __FUNCTION__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %llu",
-			block_group, bitmap_blk);
-	return NULL;
-
 }
 /*
  * The reservation window structure operations
-- 
cgit v1.2.3


From 133672efbc1085f9af990bdc145e1822ea93bcf3 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 13 Nov 2007 22:41:37 +0000
Subject: [CIFS] Fix buffer overflow if server sends corrupt response to small
 request

The SendReceive() function in transport.c memcpy's the
message payload into a buffer passed via the out_buf param. The function
assumes that all buffers are of size (CIFSMaxBufSize +
MAX_CIFS_HDR_SIZE); unfortunately, it is also called with smaller
(MAX_CIFS_SMALL_BUFFER_SIZE) buffers.  There are eight callers
(SMB worker functions) which are primarily affected by this change:

TreeDisconnect, uLogoff, Close, findClose, SetFileSize, SetFileTimes,
Lock and PosixLock

CC: Dave Kleikamp <shaggy@austin.ibm.com>
CC: Przemyslaw Wegrzyn <czajnik@czajsoft.pl>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h  | 11 ++++++
 fs/cifs/cifsproto.h |  5 +--
 fs/cifs/cifssmb.c   | 97 ++++++++++++++++++++---------------------------------
 fs/cifs/connect.c   |  9 ++---
 fs/cifs/file.c      | 14 ++++----
 fs/cifs/sess.c      |  2 +-
 fs/cifs/transport.c | 91 +++++++++++++++++++++++++++++++++++++------------
 7 files changed, 133 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 87f51f23276f..4ff8179df7ec 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -471,6 +471,17 @@ struct dir_notify_req {
 #define   CIFS_LARGE_BUFFER     2
 #define   CIFS_IOVEC            4    /* array of response buffers */
 
+/* Type of Request to SendReceive2 */
+#define   CIFS_STD_OP	        0    /* normal request timeout */
+#define   CIFS_LONG_OP          1    /* long op (up to 45 sec, oplock time) */
+#define   CIFS_VLONG_OP         2    /* sloow op - can take up to 180 seconds */
+#define   CIFS_BLOCKING_OP      4    /* operation can block */
+#define   CIFS_ASYNC_OP         8    /* do not wait for response */
+#define   CIFS_TIMEOUT_MASK 0x00F    /* only one of 5 above set in req */
+#define   CIFS_LOG_ERROR    0x010    /* log NT STATUS if non-zero */
+#define   CIFS_LARGE_BUF_OP 0x020    /* large request buffer */
+#define   CIFS_NO_RESP      0x040    /* no response buffer required */
+
 /* Security Flags: indicate type of session setup needed */
 #define   CIFSSEC_MAY_SIGN	0x00001
 #define   CIFSSEC_MAY_NTLM	0x00002
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index dd1d7c200ee6..0c55dff2add8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -48,10 +48,11 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct smb_hdr * /* input */ ,
 			struct smb_hdr * /* out */ ,
 			int * /* bytes returned */ , const int long_op);
+extern int SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
+			struct smb_hdr *in_buf, int flags);
 extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct kvec *, int /* nvec to send */,
-			int * /* type of buf returned */ , const int long_op,
-			const int logError /* whether to log status code*/ );
+			int * /* type of buf returned */ , const int flags);
 extern int SendReceiveBlockingLock(const unsigned int /* xid */ ,
 					struct cifsTconInfo *,
 				struct smb_hdr * /* input */ ,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 59d7b7c037ad..9e8a6bef029a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -698,9 +698,7 @@ int
 CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 {
 	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response; /* BB removeme BB */
 	int rc = 0;
-	int length;
 
 	cFYI(1, ("In tree disconnect"));
 	/*
@@ -737,16 +735,12 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	if (rc) {
 		up(&tcon->tconSem);
 		return rc;
-	} else {
-		smb_buffer_response = smb_buffer; /* BB removeme BB */
 	}
-	rc = SendReceive(xid, tcon->ses, smb_buffer, smb_buffer_response,
-			 &length, 0);
+
+	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
 		cFYI(1, ("Tree disconnect failed %d", rc));
 
-	if (smb_buffer)
-		cifs_small_buf_release(smb_buffer);
 	up(&tcon->tconSem);
 
 	/* No need to return error on this operation if tid invalidated and
@@ -760,10 +754,8 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 int
 CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 {
-	struct smb_hdr *smb_buffer_response;
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
-	int length;
 
 	cFYI(1, ("In SMBLogoff for session disconnect"));
 	if (ses)
@@ -782,8 +774,6 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 		return rc;
 	}
 
-	smb_buffer_response = (struct smb_hdr *)pSMB; /* BB removeme BB */
-
 	if (ses->server) {
 		pSMB->hdr.Mid = GetNextMid(ses->server);
 
@@ -795,8 +785,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->hdr.Uid = ses->Suid;
 
 	pSMB->AndXCommand = 0xFF;
-	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
-			 smb_buffer_response, &length, 0);
+	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 	if (ses->server) {
 		atomic_dec(&ses->server->socketUseCount);
 		if (atomic_read(&ses->server->socketUseCount) == 0) {
@@ -807,7 +796,6 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 		}
 	}
 	up(&ses->sesSem);
-	cifs_small_buf_release(pSMB);
 
 	/* if session dead then we do not need to do ulogoff,
 		since server closed smb session, no sense reporting
@@ -1255,7 +1243,7 @@ OldOpenRetry:
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 1);
+			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
 		cFYI(1, ("Error in Open = %d", rc));
@@ -1368,7 +1356,7 @@ openRetry:
 	pSMB->ByteCount = cpu_to_le16(count);
 	/* long_op set to 1 to allow for oplock break timeouts */
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 1);
+			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
 		cFYI(1, ("Error in Open = %d", rc));
@@ -1446,7 +1434,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	iov[0].iov_base = (char *)pSMB;
 	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
-			 &resp_buf_type, 0 /* not long op */, 1 /* log err */ );
+			 &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR);
 	cifs_stats_inc(&tcon->num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
@@ -1665,7 +1653,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 
 	rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type,
-			  long_op, 0 /* do not log STATUS code */ );
+			  long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
 		cFYI(1, ("Send error Write2 = %d", rc));
@@ -1707,7 +1695,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	int timeout = 0;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBLock - timeout %d numLock %d", waitFlag, numLock));
+	cFYI(1, ("CIFSSMBLock timeout %d numLock %d", waitFlag, numLock));
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1716,10 +1704,10 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */
 
 	if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
-		timeout = -1; /* no response expected */
+		timeout = CIFS_ASYNC_OP; /* no response expected */
 		pSMB->Timeout = 0;
 	} else if (waitFlag == TRUE) {
-		timeout = 3;  /* blocking operation, no timeout */
+		timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
 		pSMB->Timeout = cpu_to_le32(-1);/* blocking - do not time out */
 	} else {
 		pSMB->Timeout = 0;
@@ -1749,15 +1737,16 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	if (waitFlag) {
 		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
 			(struct smb_hdr *) pSMBr, &bytes_returned);
+		cifs_small_buf_release(pSMB);
 	} else {
-		rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, timeout);
+		rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB,
+				      timeout);
+		/* SMB buffer freed by function above */
 	}
 	cifs_stats_inc(&tcon->num_locks);
 	if (rc) {
 		cFYI(1, ("Send error in Lock = %d", rc));
 	}
-	cifs_small_buf_release(pSMB);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	since file handle passed in no longer valid */
@@ -1776,7 +1765,9 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	int timeout = 0;
 	int bytes_returned = 0;
+	int resp_buf_type = 0;
 	__u16 params, param_offset, offset, byte_count, count;
+	struct kvec iov[1];
 
 	cFYI(1, ("Posix Lock"));
 
@@ -1818,7 +1809,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 
 	parm_data->lock_type = cpu_to_le16(lock_type);
 	if (waitFlag) {
-		timeout = 3;  /* blocking operation, no timeout */
+		timeout = CIFS_BLOCKING_OP; /* blocking operation, no timeout */
 		parm_data->lock_flags = cpu_to_le16(1);
 		pSMB->Timeout = cpu_to_le32(-1);
 	} else
@@ -1838,8 +1829,13 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
 			(struct smb_hdr *) pSMBr, &bytes_returned);
 	} else {
-		rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			(struct smb_hdr *) pSMBr, &bytes_returned, timeout);
+		iov[0].iov_base = (char *)pSMB;
+		iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+		rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
+				&resp_buf_type, timeout);
+		pSMB = NULL; /* request buf already freed by SendReceive2. Do
+				not try to free it twice below on exit */
+		pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base;
 	}
 
 	if (rc) {
@@ -1874,6 +1870,11 @@ plk_err_exit:
 	if (pSMB)
 		cifs_small_buf_release(pSMB);
 
+	if (resp_buf_type == CIFS_SMALL_BUFFER)
+		cifs_small_buf_release(iov[0].iov_base);
+	else if (resp_buf_type == CIFS_LARGE_BUFFER)
+		cifs_buf_release(iov[0].iov_base);
+
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	   since file handle passed in no longer valid */
 
@@ -1886,8 +1887,6 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
-	CLOSE_RSP *pSMBr = NULL;
-	int bytes_returned;
 	cFYI(1, ("In CIFSSMBClose"));
 
 /* do not retry on dead session on close */
@@ -1897,13 +1896,10 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	if (rc)
 		return rc;
 
-	pSMBr = (CLOSE_RSP *)pSMB; /* BB removeme BB */
-
 	pSMB->FileID = (__u16) smb_file_id;
 	pSMB->LastWriteTime = 0xFFFFFFFF;
 	pSMB->ByteCount = 0;
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	cifs_stats_inc(&tcon->num_closes);
 	if (rc) {
 		if (rc != -EINTR) {
@@ -1912,8 +1908,6 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 		}
 	}
 
-	cifs_small_buf_release(pSMB);
-
 	/* Since session is dead, file will be closed on server already */
 	if (rc == -EAGAIN)
 		rc = 0;
@@ -3102,7 +3096,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
 
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
-			 0 /* not long op */, 0 /* do not log STATUS codes */ );
+			 CIFS_STD_OP);
 	cifs_stats_inc(&tcon->num_acl_get);
 	if (rc) {
 		cFYI(1, ("Send error in QuerySecDesc = %d", rc));
@@ -3763,8 +3757,6 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 {
 	int rc = 0;
 	FINDCLOSE_REQ *pSMB = NULL;
-	CLOSE_RSP *pSMBr = NULL; /* BB removeme BB */
-	int bytes_returned;
 
 	cFYI(1, ("In CIFSSMBFindClose"));
 	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
@@ -3776,16 +3768,13 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	if (rc)
 		return rc;
 
-	pSMBr = (CLOSE_RSP *)pSMB;  /* BB removeme BB */
 	pSMB->FileID = searchHandle;
 	pSMB->ByteCount = 0;
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
 		cERROR(1, ("Send error in FindClose = %d", rc));
 	}
 	cifs_stats_inc(&tcon->num_fclose);
-	cifs_small_buf_release(pSMB);
 
 	/* Since session is dead, search handle closed on server already */
 	if (rc == -EAGAIN)
@@ -4707,11 +4696,9 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 		   __u16 fid, __u32 pid_of_opener, int SetAllocation)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
-	struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
 	char *data_offset;
 	struct file_end_of_file_info *parm_data;
 	int rc = 0;
-	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
 	cFYI(1, ("SetFileSize (via SetFileInfo) %lld",
@@ -4721,8 +4708,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	if (rc)
 		return rc;
 
-	pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB;
-
 	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
 	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
 
@@ -4773,17 +4758,13 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	pSMB->Reserved4 = 0;
 	pSMB->hdr.smb_buf_length += byte_count;
 	pSMB->ByteCount = cpu_to_le16(byte_count);
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
 		cFYI(1,
 		     ("Send error in SetFileInfo (SetFileSize) = %d",
 		      rc));
 	}
 
-	if (pSMB)
-		cifs_small_buf_release(pSMB);
-
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
 
@@ -4801,10 +4782,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
 		    const FILE_BASIC_INFO *data, __u16 fid)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
-	struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
 	char *data_offset;
 	int rc = 0;
-	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
 	cFYI(1, ("Set Times (via SetFileInfo)"));
@@ -4813,8 +4792,6 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
 	if (rc)
 		return rc;
 
-	pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB;
-
 	/* At this point there is no need to override the current pid
 	with the pid of the opener, but that could change if we someday
 	use an existing handle (rather than opening one on the fly) */
@@ -4854,14 +4831,11 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
 	pSMB->hdr.smb_buf_length += byte_count;
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
 		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
 	}
 
-	cifs_small_buf_release(pSMB);
-
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
 
@@ -5152,7 +5126,8 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	pSMB->ByteCount = 0;
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			(struct smb_hdr *) pSMBr, &bytes_returned, -1);
+			 (struct smb_hdr *)pSMBr, &bytes_returned,
+			 CIFS_ASYNC_OP);
 	if (rc) {
 		cFYI(1, ("Error in Notify = %d", rc));
 	} else {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c52a76ff4bb9..26e1087e081f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2374,7 +2374,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	pSMB->req_no_secext.ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, 1);
+			 &bytes_returned, CIFS_LONG_OP);
 	if (rc) {
 /* rc = map_smb_to_linux_error(smb_buffer_response); now done in SendReceive */
 	} else if ((smb_buffer_response->WordCount == 3)
@@ -2678,7 +2678,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 	pSMB->req.ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, 1);
+			 &bytes_returned, CIFS_LONG_OP);
 
 	if (smb_buffer_response->Status.CifsError ==
 	    cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
@@ -3105,7 +3105,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	pSMB->req.ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, 1);
+			 &bytes_returned, CIFS_LONG_OP);
 	if (rc) {
 /*   rc = map_smb_to_linux_error(smb_buffer_response) done in SendReceive now */
 	} else if ((smb_buffer_response->WordCount == 3) ||
@@ -3381,7 +3381,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	pSMB->hdr.smb_buf_length += count;
 	pSMB->ByteCount = cpu_to_le16(count);
 
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, 0);
+	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
+			 CIFS_STD_OP);
 
 	/* if (rc) rc = map_smb_to_linux_error(smb_buffer_response); */
 	/* above now done in SendReceive */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 68ad4ca0cfa3..82326d2142e7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -835,9 +835,9 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 	xid = GetXid();
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
-		long_op = 2; /* writes past end of file can take a long time */
+		long_op = CIFS_VLONG_OP; /* writes past EOF take long time */
 	else
-		long_op = 1;
+		long_op = CIFS_LONG_OP;
 
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
@@ -884,7 +884,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			}
 		} else
 			*poffset += bytes_written;
-		long_op = FALSE; /* subsequent writes fast -
+		long_op = CIFS_STD_OP; /* subsequent writes fast -
 				    15 seconds is plenty */
 	}
 
@@ -934,9 +934,9 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 	xid = GetXid();
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
-		long_op = 2; /* writes past end of file can take a long time */
+		long_op = CIFS_VLONG_OP; /* writes past EOF can be slow */
 	else
-		long_op = 1;
+		long_op = CIFS_LONG_OP;
 
 	for (total_written = 0; write_size > total_written;
 	     total_written += bytes_written) {
@@ -1002,7 +1002,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 			}
 		} else
 			*poffset += bytes_written;
-		long_op = FALSE; /* subsequent writes fast -
+		long_op = CIFS_STD_OP; /* subsequent writes fast -
 				    15 seconds is plenty */
 	}
 
@@ -1360,7 +1360,7 @@ retry:
 						   open_file->netfid,
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
-						   1);
+						   CIFS_LONG_OP);
 				atomic_dec(&open_file->wrtPending);
 				if (rc || bytes_written < bytes_to_write) {
 					cERROR(1, ("Write2 ret %d, wrote %d",
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 899dc6078d9a..ed01ef382aa9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -514,7 +514,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	iov[1].iov_base = str_area;
 	iov[1].iov_len = count;
 	rc = SendReceive2(xid, ses, iov, 2 /* num_iovecs */, &resp_buf_type,
-			  0 /* not long op */, 1 /* log NT STATUS if any */ );
+			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
 	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ed32b3cb781..50b623ad9320 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -308,7 +308,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 
 static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 {
-	if (long_op == -1) {
+	if (long_op == CIFS_ASYNC_OP) {
 		/* oplock breaks must not be held up */
 		atomic_inc(&ses->server->inFlight);
 	} else {
@@ -337,7 +337,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 				   as they are allowed to block on server */
 
 				/* update # of requests on the wire to server */
-				if (long_op < 3)
+				if (long_op != CIFS_BLOCKING_OP)
 					atomic_inc(&ses->server->inFlight);
 				spin_unlock(&GlobalMid_Lock);
 				break;
@@ -415,17 +415,48 @@ static int wait_for_response(struct cifsSesInfo *ses,
 	}
 }
 
+
+/*
+ *
+ * Send an SMB Request.  No response info (other than return code)
+ * needs to be parsed.
+ *
+ * flags indicate the type of request buffer and how long to wait
+ * and whether to log NT STATUS code (error) before mapping it to POSIX error
+ *
+ */
+int
+SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
+		struct smb_hdr *in_buf, int flags)
+{
+	int rc;
+	struct kvec iov[1];
+	int resp_buf_type;
+
+	iov[0].iov_base = (char *)in_buf;
+	iov[0].iov_len = in_buf->smb_buf_length + 4;
+	flags |= CIFS_NO_RESP;
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("SendRcvNoR flags %d rc %d", flags, rc));
+#endif
+	return rc;
+}
+
 int
 SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	     struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
-	     const int long_op, const int logError)
+	     const int flags)
 {
 	int rc = 0;
+	int long_op;
 	unsigned int receive_len;
 	unsigned long timeout;
 	struct mid_q_entry *midQ;
 	struct smb_hdr *in_buf = iov[0].iov_base;
 
+	long_op = flags & CIFS_TIMEOUT_MASK;
+
 	*pRespBufType = CIFS_NO_BUFFER;  /* no response buf yet */
 
 	if ((ses == NULL) || (ses->server == NULL)) {
@@ -483,15 +514,22 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	if (rc < 0)
 		goto out;
 
-	if (long_op == -1)
-		goto out;
-	else if (long_op == 2) /* writes past end of file can take loong time */
+	if (long_op == CIFS_STD_OP)
+		timeout = 15 * HZ;
+	else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
 		timeout = 180 * HZ;
-	else if (long_op == 1)
+	else if (long_op == CIFS_LONG_OP)
 		timeout = 45 * HZ; /* should be greater than
 			servers oplock break timeout (about 43 seconds) */
-	else
-		timeout = 15 * HZ;
+	else if (long_op == CIFS_ASYNC_OP)
+		goto out;
+	else if (long_op == CIFS_BLOCKING_OP)
+		timeout = 0x7FFFFFFF; /*  large, but not so large as to wrap */
+	else {
+		cERROR(1, ("unknown timeout flag %d", long_op));
+		rc = -EIO;
+		goto out;
+	}
 
 	/* wait for 15 seconds or until woken up due to response arriving or
 	   due to last connection to this server being unmounted */
@@ -566,7 +604,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 			}
 
 			/* BB special case reconnect tid and uid here? */
-			rc = map_smb_to_linux_error(midQ->resp_buf, logError);
+			rc = map_smb_to_linux_error(midQ->resp_buf,
+						flags & CIFS_LOG_ERROR);
 
 			/* convert ByteCount if necessary */
 			if (receive_len >= sizeof(struct smb_hdr) - 4
@@ -574,8 +613,10 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 			    (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
 				BCC(midQ->resp_buf) =
 					le16_to_cpu(BCC_LE(midQ->resp_buf));
-			midQ->resp_buf = NULL;  /* mark it so will not be freed
-						by DeleteMidQEntry */
+			if ((flags & CIFS_NO_RESP) == 0)
+				midQ->resp_buf = NULL;  /* mark it so buf will
+							   not be freed by
+							   DeleteMidQEntry */
 		} else {
 			rc = -EIO;
 			cFYI(1, ("Bad MID state?"));
@@ -663,17 +704,25 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	if (rc < 0)
 		goto out;
 
-	if (long_op == -1)
+	if (long_op == CIFS_STD_OP)
+		timeout = 15 * HZ;
+	/* wait for 15 seconds or until woken up due to response arriving or
+	   due to last connection to this server being unmounted */
+	else if (long_op == CIFS_ASYNC_OP)
 		goto out;
-	else if (long_op == 2) /* writes past end of file can take loong time */
+	else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
 		timeout = 180 * HZ;
-	else if (long_op == 1)
+	else if (long_op == CIFS_LONG_OP)
 		timeout = 45 * HZ; /* should be greater than
 			servers oplock break timeout (about 43 seconds) */
-	else
-		timeout = 15 * HZ;
-	/* wait for 15 seconds or until woken up due to response arriving or
-	   due to last connection to this server being unmounted */
+	else if (long_op == CIFS_BLOCKING_OP)
+		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
+	else {
+		cERROR(1, ("unknown timeout flag %d", long_op));
+		rc = -EIO;
+		goto out;
+	}
+
 	if (signal_pending(current)) {
 		/* if signal pending do not hold up user for full smb timeout
 		but we still give response a chance to complete */
@@ -812,7 +861,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
 	pSMB->hdr.Mid = GetNextMid(ses->server);
 
 	return SendReceive(xid, ses, in_buf, out_buf,
-			&bytes_returned, 0);
+			&bytes_returned, CIFS_STD_OP);
 }
 
 int
@@ -844,7 +893,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	rc = wait_for_free_request(ses, 3);
+	rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
 	if (rc)
 		return rc;
 
-- 
cgit v1.2.3


From 8a146a2b0d6e97941a5c2dc5d8a3ea1e6c3ab997 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Wed, 14 Nov 2007 16:58:27 -0800
Subject: eCryptfs: cast page->index to loff_t instead of off_t

page->index should be cast to loff_t instead of off_t.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Reported-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/read_write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 2150edf9a58e..6b7474a4336a 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -87,7 +87,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 	loff_t offset;
 	int rc;
 
-	offset = ((((off_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
+	offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
 		  + offset_in_page);
 	virt = kmap(page_for_lower);
 	rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
-- 
cgit v1.2.3


From e47776a0a41a14a5634633c96e590827f552c4b5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 14 Nov 2007 16:58:56 -0800
Subject: Forbid user to change file flags on quota files

Forbid user from changing file flags on quota files.  User has no business
in playing with these flags when quota is on.  Furthermore there is a
remote possibility of deadlock due to a lock inversion between quota file's
i_mutex and transaction's start (i_mutex for quota file is locked only when
transaction is started in quota operations) in ext3 and ext4.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: LIOU Payphone <lioupayphone@gmail.com>
Cc: <linux-ext4@vger.kernel.org>
Acked-by: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: <reiserfs-dev@namesys.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/ioctl.c     | 5 +++++
 fs/ext3/ioctl.c     | 5 +++++
 fs/ext4/ioctl.c     | 5 +++++
 fs/jfs/ioctl.c      | 3 +++
 fs/reiserfs/ioctl.c | 3 +++
 5 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index c2324d5fe4ac..320b2cb3d4d2 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -47,6 +47,11 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 			flags &= ~EXT2_DIRSYNC_FL;
 
 		mutex_lock(&inode->i_mutex);
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode)) {
+			mutex_unlock(&inode->i_mutex);
+			return -EPERM;
+		}
 		oldflags = ei->i_flags;
 
 		/*
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 4a2a02c95bf9..023a070f55f1 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -51,6 +51,11 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 			flags &= ~EXT3_DIRSYNC_FL;
 
 		mutex_lock(&inode->i_mutex);
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode)) {
+			mutex_unlock(&inode->i_mutex);
+			return -EPERM;
+		}
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c04c7ccba9e3..e7f894bdb420 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -51,6 +51,11 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 			flags &= ~EXT4_DIRSYNC_FL;
 
 		mutex_lock(&inode->i_mutex);
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode)) {
+			mutex_unlock(&inode->i_mutex);
+			return -EPERM;
+		}
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 3c8663bea98c..dfda12a073e1 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -79,6 +79,9 @@ int jfs_ioctl(struct inode * inode, struct file * filp, unsigned int cmd,
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~JFS_DIRSYNC_FL;
 
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode))
+			return -EPERM;
 		jfs_get_inode_flags(jfs_inode);
 		oldflags = jfs_inode->mode2;
 
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index c438a8f83f26..e0f0f098a523 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -57,6 +57,9 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 			if (get_user(flags, (int __user *)arg))
 				return -EFAULT;
 
+			/* Is it quota file? Do not allow user to mess with it. */
+			if (IS_NOQUOTA(inode))
+				return -EPERM;
 			if (((flags ^ REISERFS_I(inode)->
 			      i_attrs) & (REISERFS_IMMUTABLE_FL |
 					  REISERFS_APPEND_FL))
-- 
cgit v1.2.3


From e1a1c997afe907e6ec4799e4be0f38cffd8b418c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 14 Nov 2007 16:59:08 -0800
Subject: proc: fix proc_kill_inodes to kill dentries on all proc superblocks

It appears we overlooked support for removing generic proc files
when we added support for multiple proc super blocks.  Handle
that now.

[akpm@linux-foundation.org: coding-style cleanups]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Alexey Dobriyan <adobriyan@sw.ru>
Acked-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c  | 39 ++++++++++++++++++++++-----------------
 fs/proc/internal.h |  2 ++
 fs/proc/root.c     |  2 +-
 3 files changed, 25 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 1bdb62435758..a9806bc21ec3 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -561,28 +561,33 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 static void proc_kill_inodes(struct proc_dir_entry *de)
 {
 	struct list_head *p;
-	struct super_block *sb = proc_mnt->mnt_sb;
+	struct super_block *sb;
 
 	/*
 	 * Actually it's a partial revoke().
 	 */
-	file_list_lock();
-	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_u.fu_list);
-		struct dentry * dentry = filp->f_path.dentry;
-		struct inode * inode;
-		const struct file_operations *fops;
-
-		if (dentry->d_op != &proc_dentry_operations)
-			continue;
-		inode = dentry->d_inode;
-		if (PDE(inode) != de)
-			continue;
-		fops = filp->f_op;
-		filp->f_op = NULL;
-		fops_put(fops);
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &proc_fs_type.fs_supers, s_instances) {
+		file_list_lock();
+		list_for_each(p, &sb->s_files) {
+			struct file *filp = list_entry(p, struct file,
+							f_u.fu_list);
+			struct dentry *dentry = filp->f_path.dentry;
+			struct inode *inode;
+			const struct file_operations *fops;
+
+			if (dentry->d_op != &proc_dentry_operations)
+				continue;
+			inode = dentry->d_inode;
+			if (PDE(inode) != de)
+				continue;
+			fops = filp->f_op;
+			filp->f_op = NULL;
+			fops_put(fops);
+		}
+		file_list_unlock();
 	}
-	file_list_unlock();
+	spin_unlock(&sb_lock);
 }
 
 static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1820eb2ef762..1b2b6c6bb475 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -78,3 +78,5 @@ static inline int proc_fd(struct inode *inode)
 {
 	return PROC_I(inode)->fd;
 }
+
+extern struct file_system_type proc_fs_type;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ec9cb3b6c93b..1f86bb860e04 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -98,7 +98,7 @@ static void proc_kill_sb(struct super_block *sb)
 	put_pid_ns(ns);
 }
 
-static struct file_system_type proc_fs_type = {
+struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.get_sb		= proc_get_sb,
 	.kill_sb	= proc_kill_sb,
-- 
cgit v1.2.3


From c79fb75e5a514a5a35f22c229042aa29f4237e3a Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Wed, 14 Nov 2007 16:59:38 -0800
Subject: hugetlb: fix quota management for private mappings

The hugetlbfs quota management system was never taught to handle MAP_PRIVATE
mappings when that support was added.  Currently, quota is debited at page
instantiation and credited at file truncation.  This approach works correctly
for shared pages but is incomplete for private pages.  In addition to
hugetlb_no_page(), private pages can be instantiated by hugetlb_cow(); but
this function does not respect quotas.

Private huge pages are treated very much like normal, anonymous pages.  They
are not "backed" by the hugetlbfs file and are not stored in the mapping's
radix tree.  This means that private pages are invisible to
truncate_hugepages() so that function will not credit the quota.

This patch (based on a prototype provided by Ken Chen) moves quota crediting
for all pages into free_huge_page().  page->private is used to store a pointer
to the mapping to which this page belongs.  This is used to credit quota on
the appropriate hugetlbfs instance.

Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: David Gibson <hermes@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 12aca8ed605f..6513f5655861 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -364,7 +364,6 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 			++next;
 			truncate_huge_page(page);
 			unlock_page(page);
-			hugetlb_put_quota(mapping);
 			freed++;
 		}
 		huge_pagevec_release(&pvec);
-- 
cgit v1.2.3


From 9a119c056dc2a9970901954a6d561d50a95e528d Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Wed, 14 Nov 2007 16:59:41 -0800
Subject: hugetlb: allow bulk updating in hugetlb_*_quota()

Add a second parameter 'delta' to hugetlb_get_quota and hugetlb_put_quota to
allow bulk updating of the sbinfo->free_blocks counter.  This will be used by
the next patch in the series.

Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: David Gibson <hermes@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6513f5655861..09ee07f02663 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -858,15 +858,15 @@ out_free:
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping)
+int hugetlb_get_quota(struct address_space *mapping, long delta)
 {
 	int ret = 0;
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
 
 	if (sbinfo->free_blocks > -1) {
 		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks > 0)
-			sbinfo->free_blocks--;
+		if (sbinfo->free_blocks - delta >= 0)
+			sbinfo->free_blocks -= delta;
 		else
 			ret = -ENOMEM;
 		spin_unlock(&sbinfo->stat_lock);
@@ -875,13 +875,13 @@ int hugetlb_get_quota(struct address_space *mapping)
 	return ret;
 }
 
-void hugetlb_put_quota(struct address_space *mapping)
+void hugetlb_put_quota(struct address_space *mapping, long delta)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
 
 	if (sbinfo->free_blocks > -1) {
 		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_blocks++;
+		sbinfo->free_blocks += delta;
 		spin_unlock(&sbinfo->stat_lock);
 	}
 }
-- 
cgit v1.2.3


From f433dc56344cb72cc3de5ba0819021cec3aef807 Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@gmail.com>
Date: Wed, 14 Nov 2007 16:59:47 -0800
Subject: Fixes to the BFS filesystem driver

I found a few bugs in the BFS driver.  Detailed descriptions of the bugs, as
well as the steps to reproduce the errors, are given in the kernel bugzilla.
Please follow these links for more information:

http://bugzilla.kernel.org/show_bug.cgi?id=9363
http://bugzilla.kernel.org/show_bug.cgi?id=9364
http://bugzilla.kernel.org/show_bug.cgi?id=9365
http://bugzilla.kernel.org/show_bug.cgi?id=9366

This patch fixes the bugs described above.  Besides, the patch introduces
coding style changes to make the BFS driver conform to the requirements
specified for Linux kernel code.  Finally, I made a few cosmetic changes
such as removal of trivial debug output.

Also, the patch removes the fields `si_lf_ioff' and `si_lf_sblk' of the
in-core superblock structure.  These fields are initialized but never
actually used.

If you are wondering why I need BFS, here is the answer: I am using this
driver in the context of Linux kernel classes I am teaching in the Moscow
State University and in the International Institute of Information
Technology in Pune, India.

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@gmail.com>
Cc: Tigran Aivazian <tigran@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bfs/bfs.h   |   4 +-
 fs/bfs/dir.c   | 146 ++++++++++++++++++++++++++++++---------------------------
 fs/bfs/file.c  |  62 +++++++++++++++---------
 fs/bfs/inode.c | 127 ++++++++++++++++++++++++-------------------------
 4 files changed, 184 insertions(+), 155 deletions(-)

(limited to 'fs')

diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 130f6c66c5ba..ac7a8b1d6c3a 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -14,8 +14,6 @@ struct bfs_sb_info {
 	unsigned long si_blocks;
 	unsigned long si_freeb;
 	unsigned long si_freei;
-	unsigned long si_lf_ioff;
-	unsigned long si_lf_sblk;
 	unsigned long si_lf_eblk;
 	unsigned long si_lasti;
 	unsigned long * si_imap;
@@ -39,7 +37,7 @@ static inline struct bfs_sb_info *BFS_SB(struct super_block *sb)
 
 static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 {
-	return list_entry(inode, struct bfs_inode_info, vfs_inode);
+	return container_of(inode, struct bfs_inode_info, vfs_inode);
 }
 
 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 097f1497f743..1fd056d0fc3d 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -21,29 +21,32 @@
 #define dprintf(x...)
 #endif
 
-static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino);
-static struct buffer_head * bfs_find_entry(struct inode * dir, 
-	const unsigned char * name, int namelen, struct bfs_dirent ** res_dir);
+static int bfs_add_entry(struct inode *dir, const unsigned char *name,
+						int namelen, int ino);
+static struct buffer_head *bfs_find_entry(struct inode *dir,
+				const unsigned char *name, int namelen,
+				struct bfs_dirent **res_dir);
 
-static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 {
-	struct inode * dir = f->f_path.dentry->d_inode;
-	struct buffer_head * bh;
-	struct bfs_dirent * de;
+	struct inode *dir = f->f_path.dentry->d_inode;
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
 	unsigned int offset;
 	int block;
 
 	lock_kernel();
 
-	if (f->f_pos & (BFS_DIRENT_SIZE-1)) {
-		printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)f->f_pos, 
-			dir->i_sb->s_id, dir->i_ino);
+	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+		printf("Bad f_pos=%08lx for %s:%08lx\n",
+					(unsigned long)f->f_pos,
+					dir->i_sb->s_id, dir->i_ino);
 		unlock_kernel();
 		return -EBADF;
 	}
 
 	while (f->f_pos < dir->i_size) {
-		offset = f->f_pos & (BFS_BSIZE-1);
+		offset = f->f_pos & (BFS_BSIZE - 1);
 		block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
 		bh = sb_bread(dir->i_sb, block);
 		if (!bh) {
@@ -54,7 +57,9 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos, le16_to_cpu(de->ino), DT_UNKNOWN) < 0) {
+				if (filldir(dirent, de->name, size, f->f_pos,
+						le16_to_cpu(de->ino),
+						DT_UNKNOWN) < 0) {
 					brelse(bh);
 					unlock_kernel();
 					return 0;
@@ -62,7 +67,7 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 			}
 			offset += BFS_DIRENT_SIZE;
 			f->f_pos += BFS_DIRENT_SIZE;
-		} while (offset < BFS_BSIZE && f->f_pos < dir->i_size);
+		} while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
 		brelse(bh);
 	}
 
@@ -78,13 +83,13 @@ const struct file_operations bfs_dir_operations = {
 
 extern void dump_imap(const char *, struct super_block *);
 
-static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
-		struct nameidata *nd)
+static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
+						struct nameidata *nd)
 {
 	int err;
-	struct inode * inode;
-	struct super_block * s = dir->i_sb;
-	struct bfs_sb_info * info = BFS_SB(s);
+	struct inode *inode;
+	struct super_block *s = dir->i_sb;
+	struct bfs_sb_info *info = BFS_SB(s);
 	unsigned long ino;
 
 	inode = new_inode(s);
@@ -97,7 +102,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 		iput(inode);
 		return -ENOSPC;
 	}
-	set_bit(ino, info->si_imap);	
+	set_bit(ino, info->si_imap);
 	info->si_freei--;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
@@ -113,9 +118,10 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	BFS_I(inode)->i_eblock = 0;
 	insert_inode_hash(inode);
         mark_inode_dirty(inode);
-	dump_imap("create",s);
+	dump_imap("create", s);
 
-	err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino);
+	err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
+							inode->i_ino);
 	if (err) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -127,11 +133,12 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	return 0;
 }
 
-static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
+						struct nameidata *nd)
 {
-	struct inode * inode = NULL;
-	struct buffer_head * bh;
-	struct bfs_dirent * de;
+	struct inode *inode = NULL;
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
 
 	if (dentry->d_name.len > BFS_NAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -152,13 +159,15 @@ static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, st
 	return NULL;
 }
 
-static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new)
+static int bfs_link(struct dentry *old, struct inode *dir,
+						struct dentry *new)
 {
-	struct inode * inode = old->d_inode;
+	struct inode *inode = old->d_inode;
 	int err;
 
 	lock_kernel();
-	err = bfs_add_entry(dir, new->d_name.name, new->d_name.len, inode->i_ino);
+	err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
+							inode->i_ino);
 	if (err) {
 		unlock_kernel();
 		return err;
@@ -172,23 +181,23 @@ static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new
 	return 0;
 }
 
-
-static int bfs_unlink(struct inode * dir, struct dentry * dentry)
+static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error = -ENOENT;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct bfs_dirent * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
 
 	inode = dentry->d_inode;
 	lock_kernel();
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
-	if (!bh || le16_to_cpu(de->ino) != inode->i_ino)
+	if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
 		goto out_brelse;
 
 	if (!inode->i_nlink) {
-		printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, 
-				inode->i_ino, inode->i_nlink);
+		printf("unlinking non-existent file %s:%lu (nlink=%d)\n",
+					inode->i_sb->s_id, inode->i_ino,
+					inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 	de->ino = 0;
@@ -205,12 +214,12 @@ out_brelse:
 	return error;
 }
 
-static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry, 
-			struct inode * new_dir, struct dentry * new_dentry)
+static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh;
-	struct bfs_dirent * old_de, * new_de;		
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh;
+	struct bfs_dirent *old_de, *new_de;
 	int error = -ENOENT;
 
 	old_bh = new_bh = NULL;
@@ -223,7 +232,7 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
 				old_dentry->d_name.name, 
 				old_dentry->d_name.len, &old_de);
 
-	if (!old_bh || le16_to_cpu(old_de->ino) != old_inode->i_ino)
+	if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino))
 		goto end_rename;
 
 	error = -EPERM;
@@ -239,7 +248,8 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
 	if (!new_bh) {
 		error = bfs_add_entry(new_dir, 
 					new_dentry->d_name.name,
-			 		new_dentry->d_name.len, old_inode->i_ino);
+					new_dentry->d_name.len,
+					old_inode->i_ino);
 		if (error)
 			goto end_rename;
 	}
@@ -268,11 +278,12 @@ const struct inode_operations bfs_dir_inops = {
 	.rename			= bfs_rename,
 };
 
-static int bfs_add_entry(struct inode * dir, const unsigned char * name, int namelen, int ino)
+static int bfs_add_entry(struct inode *dir, const unsigned char *name,
+							int namelen, int ino)
 {
-	struct buffer_head * bh;
-	struct bfs_dirent * de;
-	int block, sblock, eblock, off, eoff;
+	struct buffer_head *bh;
+	struct bfs_dirent *de;
+	int block, sblock, eblock, off, pos;
 	int i;
 
 	dprintf("name=%s, namelen=%d\n", name, namelen);
@@ -284,27 +295,24 @@ static int bfs_add_entry(struct inode * dir, const unsigned char * name, int nam
 
 	sblock = BFS_I(dir)->i_sblock;
 	eblock = BFS_I(dir)->i_eblock;
-	eoff = dir->i_size % BFS_BSIZE;
-	for (block=sblock; block<=eblock; block++) {
+	for (block = sblock; block <= eblock; block++) {
 		bh = sb_bread(dir->i_sb, block);
-		if(!bh) 
+		if (!bh)
 			return -ENOSPC;
-		for (off=0; off<BFS_BSIZE; off+=BFS_DIRENT_SIZE) {
+		for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) {
 			de = (struct bfs_dirent *)(bh->b_data + off);
-			if (block==eblock && off>=eoff) {
-				/* Do not read/interpret the garbage in the end of eblock. */
-				de->ino = 0;
-			}
 			if (!de->ino) {
-				if ((block-sblock)*BFS_BSIZE + off >= dir->i_size) {
+				pos = (block - sblock) * BFS_BSIZE + off;
+				if (pos >= dir->i_size) {
 					dir->i_size += BFS_DIRENT_SIZE;
 					dir->i_ctime = CURRENT_TIME_SEC;
 				}
 				dir->i_mtime = CURRENT_TIME_SEC;
 				mark_inode_dirty(dir);
 				de->ino = cpu_to_le16((u16)ino);
-				for (i=0; i<BFS_NAMELEN; i++)
-					de->name[i] = (i < namelen) ? name[i] : 0;
+				for (i = 0; i < BFS_NAMELEN; i++)
+					de->name[i] =
+						(i < namelen) ? name[i] : 0;
 				mark_buffer_dirty(bh);
 				brelse(bh);
 				return 0;
@@ -315,25 +323,26 @@ static int bfs_add_entry(struct inode * dir, const unsigned char * name, int nam
 	return -ENOSPC;
 }
 
-static inline int bfs_namecmp(int len, const unsigned char * name, const char * buffer)
+static inline int bfs_namecmp(int len, const unsigned char *name,
+							const char *buffer)
 {
-	if (len < BFS_NAMELEN && buffer[len])
+	if ((len < BFS_NAMELEN) && buffer[len])
 		return 0;
 	return !memcmp(name, buffer, len);
 }
 
-static struct buffer_head * bfs_find_entry(struct inode * dir, 
-	const unsigned char * name, int namelen, struct bfs_dirent ** res_dir)
+static struct buffer_head *bfs_find_entry(struct inode *dir,
+			const unsigned char *name, int namelen,
+			struct bfs_dirent **res_dir)
 {
-	unsigned long block, offset;
-	struct buffer_head * bh;
-	struct bfs_dirent * de;
+	unsigned long block = 0, offset = 0;
+	struct buffer_head *bh = NULL;
+	struct bfs_dirent *de;
 
 	*res_dir = NULL;
 	if (namelen > BFS_NAMELEN)
 		return NULL;
-	bh = NULL;
-	block = offset = 0;
+
 	while (block * BFS_BSIZE + offset < dir->i_size) {
 		if (!bh) {
 			bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block);
@@ -344,7 +353,8 @@ static struct buffer_head * bfs_find_entry(struct inode * dir,
 		}
 		de = (struct bfs_dirent *)(bh->b_data + offset);
 		offset += BFS_DIRENT_SIZE;
-		if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) {
+		if (le16_to_cpu(de->ino) &&
+				bfs_namecmp(namelen, name, de->name)) {
 			*res_dir = de;
 			return bh;
 		}
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 911b4ccf470f..b11e63e8fbcd 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -2,6 +2,11 @@
  *	fs/bfs/file.c
  *	BFS file operations.
  *	Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com>
+ *
+ *	Make the file block allocation algorithm understand the size
+ *	of the underlying block device.
+ *	Copyright (C) 2007 Dmitri Vorobiev <dmitri.vorobiev@gmail.com>
+ *
  */
 
 #include <linux/fs.h>
@@ -27,7 +32,8 @@ const struct file_operations bfs_file_operations = {
 	.splice_read	= generic_file_splice_read,
 };
 
-static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb)
+static int bfs_move_block(unsigned long from, unsigned long to,
+					struct super_block *sb)
 {
 	struct buffer_head *bh, *new;
 
@@ -43,21 +49,22 @@ static int bfs_move_block(unsigned long from, unsigned long to, struct super_blo
 }
 
 static int bfs_move_blocks(struct super_block *sb, unsigned long start,
-                           unsigned long end, unsigned long where)
+				unsigned long end, unsigned long where)
 {
 	unsigned long i;
 
 	dprintf("%08lx-%08lx->%08lx\n", start, end, where);
 	for (i = start; i <= end; i++)
 		if(bfs_move_block(i, where + i, sb)) {
-			dprintf("failed to move block %08lx -> %08lx\n", i, where + i);
+			dprintf("failed to move block %08lx -> %08lx\n", i,
+								where + i);
 			return -EIO;
 		}
 	return 0;
 }
 
-static int bfs_get_block(struct inode * inode, sector_t block, 
-	struct buffer_head * bh_result, int create)
+static int bfs_get_block(struct inode *inode, sector_t block,
+			struct buffer_head *bh_result, int create)
 {
 	unsigned long phys;
 	int err;
@@ -66,9 +73,6 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	struct bfs_inode_info *bi = BFS_I(inode);
 	struct buffer_head *sbh = info->si_sbh;
 
-	if (block > info->si_blocks)
-		return -EIO;
-
 	phys = bi->i_sblock + block;
 	if (!create) {
 		if (phys <= bi->i_eblock) {
@@ -79,21 +83,29 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 		return 0;
 	}
 
-	/* if the file is not empty and the requested block is within the range
-	   of blocks allocated for this file, we can grant it */
-	if (inode->i_size && phys <= bi->i_eblock) {
+	/*
+	 * If the file is not empty and the requested block is within the
+	 * range of blocks allocated for this file, we can grant it.
+	 */
+	if (bi->i_sblock && (phys <= bi->i_eblock)) {
 		dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", 
 				create, (unsigned long)block, phys);
 		map_bh(bh_result, sb, phys);
 		return 0;
 	}
 
-	/* the rest has to be protected against itself */
+	/* The file will be extended, so let's see if there is enough space. */
+	if (phys >= info->si_blocks)
+		return -ENOSPC;
+
+	/* The rest has to be protected against itself. */
 	lock_kernel();
 
-	/* if the last data block for this file is the last allocated
-	   block, we can extend the file trivially, without moving it
-	   anywhere */
+	/*
+	 * If the last data block for this file is the last allocated
+	 * block, we can extend the file trivially, without moving it
+	 * anywhere.
+	 */
 	if (bi->i_eblock == info->si_lf_eblk) {
 		dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", 
 				create, (unsigned long)block, phys);
@@ -106,13 +118,19 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 		goto out;
 	}
 
-	/* Ok, we have to move this entire file to the next free block */
+	/* Ok, we have to move this entire file to the next free block. */
 	phys = info->si_lf_eblk + 1;
-	if (bi->i_sblock) { /* if data starts on block 0 then there is no data */
+	if (phys + block >= info->si_blocks) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	if (bi->i_sblock) {
 		err = bfs_move_blocks(inode->i_sb, bi->i_sblock, 
-				bi->i_eblock, phys);
+						bi->i_eblock, phys);
 		if (err) {
-			dprintf("failed to move ino=%08lx -> fs corruption\n", inode->i_ino);
+			dprintf("failed to move ino=%08lx -> fs corruption\n",
+								inode->i_ino);
 			goto out;
 		}
 	} else
@@ -124,8 +142,10 @@ static int bfs_get_block(struct inode * inode, sector_t block,
 	phys += block;
 	info->si_lf_eblk = bi->i_eblock = phys;
 
-	/* this assumes nothing can write the inode back while we are here
-	 * and thus update inode->i_blocks! (XXX)*/
+	/*
+	 * This assumes nothing can write the inode back while we are here
+	 * and thus update inode->i_blocks! (XXX)
+	 */
 	info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
 	mark_inode_dirty(inode);
 	mark_buffer_dirty(sbh);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 7bd9c2bbe6ee..294c41baef6e 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,25 +30,26 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
 
-void dump_imap(const char *prefix, struct super_block * s);
+void dump_imap(const char *prefix, struct super_block *s);
 
-static void bfs_read_inode(struct inode * inode)
+static void bfs_read_inode(struct inode *inode)
 {
 	unsigned long ino = inode->i_ino;
-	struct bfs_inode * di;
-	struct buffer_head * bh;
+	struct bfs_inode *di;
+	struct buffer_head *bh;
 	int block, off;
 
-	if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
+	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
 		printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino);
 		make_bad_inode(inode);
 		return;
 	}
 
-	block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh) {
-		printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+		printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id,
+									ino);
 		make_bad_inode(inode);
 		return;
 	}
@@ -56,7 +57,7 @@ static void bfs_read_inode(struct inode * inode)
 	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
 	di = (struct bfs_inode *)bh->b_data + off;
 
-	inode->i_mode = 0x0000FFFF &  le32_to_cpu(di->i_mode);
+	inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode);
 	if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
 		inode->i_mode |= S_IFDIR;
 		inode->i_op = &bfs_dir_inops;
@@ -70,48 +71,48 @@ static void bfs_read_inode(struct inode * inode)
 
 	BFS_I(inode)->i_sblock =  le32_to_cpu(di->i_sblock);
 	BFS_I(inode)->i_eblock =  le32_to_cpu(di->i_eblock);
+	BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino);
 	inode->i_uid =  le32_to_cpu(di->i_uid);
 	inode->i_gid =  le32_to_cpu(di->i_gid);
 	inode->i_nlink =  le32_to_cpu(di->i_nlink);
 	inode->i_size = BFS_FILESIZE(di);
 	inode->i_blocks = BFS_FILEBLOCKS(di);
-        if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
 	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
 	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
 	inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
 	inode->i_atime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); /* can be 0 so we store a copy */
 
 	brelse(bh);
 }
 
-static int bfs_write_inode(struct inode * inode, int unused)
+static int bfs_write_inode(struct inode *inode, int unused)
 {
 	unsigned int ino = (u16)inode->i_ino;
         unsigned long i_sblock;
-	struct bfs_inode * di;
-	struct buffer_head * bh;
+	struct bfs_inode *di;
+	struct buffer_head *bh;
 	int block, off;
 
         dprintf("ino=%08x\n", ino);
 
-	if (ino < BFS_ROOT_INO || ino > BFS_SB(inode->i_sb)->si_lasti) {
+	if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
 		printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino);
 		return -EIO;
 	}
 
 	lock_kernel();
-	block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh) {
-		printf("Unable to read inode %s:%08x\n", inode->i_sb->s_id, ino);
+		printf("Unable to read inode %s:%08x\n",
+				inode->i_sb->s_id, ino);
 		unlock_kernel();
 		return -EIO;
 	}
 
-	off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
+	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
 	di = (struct bfs_inode *)bh->b_data + off;
 
 	if (ino == BFS_ROOT_INO)
@@ -133,27 +134,26 @@ static int bfs_write_inode(struct inode * inode, int unused)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
-        dprintf("Written ino=%d into %d:%d\n",le16_to_cpu(di->i_ino),block,off);
 	brelse(bh);
 	unlock_kernel();
 	return 0;
 }
 
-static void bfs_delete_inode(struct inode * inode)
+static void bfs_delete_inode(struct inode *inode)
 {
 	unsigned long ino = inode->i_ino;
-	struct bfs_inode * di;
-	struct buffer_head * bh;
+	struct bfs_inode *di;
+	struct buffer_head *bh;
 	int block, off;
-	struct super_block * s = inode->i_sb;
-	struct bfs_sb_info * info = BFS_SB(s);
-	struct bfs_inode_info * bi = BFS_I(inode);
+	struct super_block *s = inode->i_sb;
+	struct bfs_sb_info *info = BFS_SB(s);
+	struct bfs_inode_info *bi = BFS_I(inode);
 
 	dprintf("ino=%08lx\n", ino);
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (ino < BFS_ROOT_INO || ino > info->si_lasti) {
+	if ((ino < BFS_ROOT_INO) || (ino > info->si_lasti)) {
 		printf("invalid ino=%08lx\n", ino);
 		return;
 	}
@@ -162,31 +162,35 @@ static void bfs_delete_inode(struct inode * inode)
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	lock_kernel();
 	mark_inode_dirty(inode);
-	block = (ino - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+
+	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 	bh = sb_bread(s, block);
 	if (!bh) {
-		printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino);
+		printf("Unable to read inode %s:%08lx\n",
+					inode->i_sb->s_id, ino);
 		unlock_kernel();
 		return;
 	}
-	off = (ino - BFS_ROOT_INO)%BFS_INODES_PER_BLOCK;
-	di = (struct bfs_inode *) bh->b_data + off;
+	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
+	di = (struct bfs_inode *)bh->b_data + off;
+	memset((void *)di, 0, sizeof(struct bfs_inode));
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
         if (bi->i_dsk_ino) {
-		info->si_freeb += 1 + bi->i_eblock - bi->i_sblock;
+		info->si_freeb += BFS_FILEBLOCKS(bi);
 		info->si_freei++;
 		clear_bit(ino, info->si_imap);
 		dump_imap("delete_inode", s);
         }
-	di->i_ino = 0;
-	di->i_sblock = 0;
-	mark_buffer_dirty(bh);
-	brelse(bh);
 
-	/* if this was the last file, make the previous 
-	   block "last files last block" even if there is no real file there,
-	   saves us 1 gap */
-	if (info->si_lf_eblk == BFS_I(inode)->i_eblock) {
-		info->si_lf_eblk = BFS_I(inode)->i_sblock - 1;
+	/*
+	 * If this was the last file, make the previous block
+	 * "last block of the last file" even if there is no
+	 * real file there, saves us 1 gap.
+	 */
+	if (info->si_lf_eblk == bi->i_eblock) {
+		info->si_lf_eblk = bi->i_sblock - 1;
 		mark_buffer_dirty(info->si_sbh);
 	}
 	unlock_kernel();
@@ -228,7 +232,7 @@ static void bfs_write_super(struct super_block *s)
 	unlock_kernel();
 }
 
-static struct kmem_cache * bfs_inode_cachep;
+static struct kmem_cache *bfs_inode_cachep;
 
 static struct inode *bfs_alloc_inode(struct super_block *sb)
 {
@@ -279,7 +283,7 @@ static const struct super_operations bfs_sops = {
 	.statfs		= bfs_statfs,
 };
 
-void dump_imap(const char *prefix, struct super_block * s)
+void dump_imap(const char *prefix, struct super_block *s)
 {
 #ifdef DEBUG
 	int i;
@@ -287,25 +291,26 @@ void dump_imap(const char *prefix, struct super_block * s)
 
 	if (!tmpbuf)
 		return;
-	for (i=BFS_SB(s)->si_lasti; i>=0; i--) {
-		if (i > PAGE_SIZE-100) break;
+	for (i = BFS_SB(s)->si_lasti; i >= 0; i--) {
+		if (i > PAGE_SIZE - 100) break;
 		if (test_bit(i, BFS_SB(s)->si_imap))
 			strcat(tmpbuf, "1");
 		else
 			strcat(tmpbuf, "0");
 	}
-	printk(KERN_ERR "BFS-fs: %s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf);
+	printf("BFS-fs: %s: lasti=%08lx <%s>\n",
+				prefix, BFS_SB(s)->si_lasti, tmpbuf);
 	free_page((unsigned long)tmpbuf);
 #endif
 }
 
 static int bfs_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct buffer_head * bh;
-	struct bfs_super_block * bfs_sb;
-	struct inode * inode;
+	struct buffer_head *bh;
+	struct bfs_super_block *bfs_sb;
+	struct inode *inode;
 	unsigned i, imap_len;
-	struct bfs_sb_info * info;
+	struct bfs_sb_info *info;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
@@ -329,14 +334,14 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_magic = BFS_MAGIC;
 	info->si_sbh = bh;
-	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE)/sizeof(struct bfs_inode)
-			+ BFS_ROOT_INO - 1;
-
-	imap_len = info->si_lasti/8 + 1;
+	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
+					sizeof(struct bfs_inode)
+					+ BFS_ROOT_INO - 1;
+	imap_len = (info->si_lasti / 8) + 1;
 	info->si_imap = kzalloc(imap_len, GFP_KERNEL);
 	if (!info->si_imap)
 		goto out;
-	for (i=0; i<BFS_ROOT_INO; i++) 
+	for (i = 0; i < BFS_ROOT_INO; i++)
 		set_bit(i, info->si_imap);
 
 	s->s_op = &bfs_sops;
@@ -352,16 +357,15 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		goto out;
 	}
 
-	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
-	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 -  le32_to_cpu(bfs_sb->s_start))>>BFS_BSIZE_BITS;
+	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1
+			- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
-	info->si_lf_sblk = 0;
-	info->si_lf_ioff = 0;
 	bh = NULL;
-	for (i=BFS_ROOT_INO; i<=info->si_lasti; i++) {
+	for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
 		struct bfs_inode *di;
-		int block = (i - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+		int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 		int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
 		unsigned long sblock, eblock;
 
@@ -384,11 +388,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 		sblock =  le32_to_cpu(di->i_sblock);
 		eblock =  le32_to_cpu(di->i_eblock);
-		if (eblock > info->si_lf_eblk) {
+		if (eblock > info->si_lf_eblk)
 			info->si_lf_eblk = eblock;
-			info->si_lf_sblk = sblock;
-			info->si_lf_ioff = BFS_INO2OFF(i);
-		}
 	}
 	brelse(bh);
 	if (!(s->s_flags & MS_RDONLY)) {
-- 
cgit v1.2.3


From c06a018fa5362fa9ed0768bd747c0fab26bc8849 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
Date: Wed, 14 Nov 2007 16:59:54 -0800
Subject: reiserfs: don't drop PG_dirty when releasing sub-page-sized dirty
 file

This is not a new problem in 2.6.23-git17.  2.6.22 and 2.6.23 are buggy in
the same way.

Reiserfs could accumulate dirty sub-page-size files until umount time.
They cannot be synced to disk by pdflush routines or explicit `sync'
commands.  Only `umount' can do the trick.

The direct cause is: the dirty page's PG_dirty is wrongly _cleared_.
Call trace:
	 [<ffffffff8027e920>] cancel_dirty_page+0xd0/0xf0
	 [<ffffffff8816d470>] :reiserfs:reiserfs_cut_from_item+0x660/0x710
	 [<ffffffff8816d791>] :reiserfs:reiserfs_do_truncate+0x271/0x530
	 [<ffffffff8815872d>] :reiserfs:reiserfs_truncate_file+0xfd/0x3b0
	 [<ffffffff8815d3d0>] :reiserfs:reiserfs_file_release+0x1e0/0x340
	 [<ffffffff802a187c>] __fput+0xcc/0x1b0
	 [<ffffffff802a1ba6>] fput+0x16/0x20
	 [<ffffffff8029e676>] filp_close+0x56/0x90
	 [<ffffffff8029fe0d>] sys_close+0xad/0x110
	 [<ffffffff8020c41e>] system_call+0x7e/0x83

Fix the bug by removing the cancel_dirty_page() call. Tests show that
removing it causes no bad behavior for various write sizes.

=== for the patient ===
Here are more detailed demonstrations of the problem.

1) the page has both PG_dirty(D)/PAGECACHE_TAG_DIRTY(d) after being written to;
   and then only PAGECACHE_TAG_DIRTY(d) remains after the file is closed.

------------------------------ screen 0 ------------------------------
[T0] root /home/wfg# cat > /test/tiny
[T1] hi
[T2] root /home/wfg#

------------------------------ screen 1 ------------------------------
[T1] root /home/wfg# echo /test/tiny > /proc/filecache
[T1] root /home/wfg# cat /proc/filecache
     # file /test/tiny
     # flags R:referenced A:active M:mmap U:uptodate D:dirty W:writeback O:owner B:buffer d:dirty w:writeback
     # idx   len     state   refcnt
     0       1       ___UD__Bd_      2
[T2] root /home/wfg# cat /proc/filecache
     # file /test/tiny
     # flags R:referenced A:active M:mmap U:uptodate D:dirty W:writeback O:owner B:buffer d:dirty w:writeback
     # idx   len     state   refcnt
     0       1       ___U___Bd_      2

2) note the non-zero 'cancelled_write_bytes' after /tmp/hi is copied.

------------------------------ screen 0 ------------------------------
[T0] root /home/wfg# echo hi > /tmp/hi
[T1] root /home/wfg# cp /tmp/hi /dev/stdin /test
[T2] hi
[T3] root /home/wfg#

------------------------------ screen 1 ------------------------------
[T1] root /proc/4397# cd /proc/`pidof cp`
[T1] root /proc/4713# cat io
     rchar: 8396
     wchar: 3
     syscr: 20
     syscw: 1
     read_bytes: 0
     write_bytes: 20480
     cancelled_write_bytes: 4096
[T2] root /proc/4713# cat io
     rchar: 8399
     wchar: 6
     syscr: 21
     syscw: 2
     read_bytes: 0
     write_bytes: 24576
     cancelled_write_bytes: 4096

//Question: the 'write_bytes' is a bit more than expected ;-)

Tested-by: Maxim Levitsky <maximlevitsky@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Reviewed-by: Chris Mason <chris.mason@oracle.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/stree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index ca41567d7890..d2db2417b2bd 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1458,9 +1458,6 @@ static void unmap_buffers(struct page *page, loff_t pos)
 				}
 				bh = next;
 			} while (bh != head);
-			if (PAGE_SIZE == bh->b_size) {
-				cancel_dirty_page(page, PAGE_CACHE_SIZE);
-			}
 		}
 	}
 }
-- 
cgit v1.2.3


From 8744969a819de4ee5158f4cdb30104601cc015d4 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 14 Nov 2007 17:00:02 -0800
Subject: fuse_file_alloc(): fix NULL dereferences

Fix obvious NULL dereferences spotted by the Coverity checker.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/file.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0fcdba9d47c0..535b37399009 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -55,9 +55,10 @@ struct fuse_file *fuse_file_alloc(void)
 		if (!ff->reserved_req) {
 			kfree(ff);
 			ff = NULL;
+		} else {
+			INIT_LIST_HEAD(&ff->write_entry);
+			atomic_set(&ff->count, 0);
 		}
-		INIT_LIST_HEAD(&ff->write_entry);
-		atomic_set(&ff->count, 0);
 	}
 	return ff;
 }
-- 
cgit v1.2.3


From 9fcc2d15b14894aa53e5e8b7fd5d6e3ca558e5df Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 14 Nov 2007 17:00:07 -0800
Subject: proc: simplify and correct proc_flush_task

Currently we special case when we have only the initial pid namespace.
Unfortunately in doing so the copied case for the other namespaces was
broken so we don't properly flush the thread directories :(

So this patch removes the unnecessary special case (removing a usage of
proc_mnt) and corrects the flushing of the thread directories.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Kirill Korotaev <dev@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index aeaf0d0f2f51..a17c26859074 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2328,21 +2328,18 @@ out:
 
 void proc_flush_task(struct task_struct *task)
 {
-	int i, leader;
-	struct pid *pid, *tgid;
+	int i;
+	struct pid *pid, *tgid = NULL;
 	struct upid *upid;
 
-	leader = thread_group_leader(task);
-	proc_flush_task_mnt(proc_mnt, task->pid, leader ? task->tgid : 0);
 	pid = task_pid(task);
-	if (pid->level == 0)
-		return;
+	if (thread_group_leader(task))
+		tgid = task_tgid(task);
 
-	tgid = task_tgid(task);
-	for (i = 1; i <= pid->level; i++) {
+	for (i = 0; i <= pid->level; i++) {
 		upid = &pid->numbers[i];
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-				leader ? 0 : tgid->numbers[i].nr);
+			tgid ? tgid->numbers[i].nr : 0);
 	}
 
 	upid = &pid->numbers[pid->level];
-- 
cgit v1.2.3


From cb51f973bce7aef46452b0c6faea8f791885f5b8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 14 Nov 2007 17:00:10 -0800
Subject: mark sys_open/sys_read exports unused

sys_open / sys_read were used in the early 1.2 days to load firmware from
disk inside drivers.  Since 2.0 or so this was deprecated behavior, but
several drivers were still using this.  For a few years now we have had a
request_firmware() API that implements this in a nice, consistent way.
Only some old ISA sound drivers (pre-ALSA) still straggled along for some
time....  however with commit c2b1239a9f22f19c53543b460b24507d0e21ea0c the
last user is now gone.

This is a good thing, since using sys_open / sys_read etc. for firmware is
bug-prone at best and dangerous at worst; these operations put an fd in the
process file descriptor table....  which then can be tampered with from
other threads for example.  For those who don't want the firmware loader,
filp_open()/vfs_read are the better APIs to use, without this security
issue.

The patch below marks sys_open and sys_read unused now that they're
really not used anymore, and for deletion in the 2.6.25 timeframe.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/open.c       | 2 +-
 fs/read_write.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 3b69c53e1837..4932b4d1da05 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1061,7 +1061,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
 	prevent_tail_call(ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(sys_open);
+EXPORT_UNUSED_SYMBOL_GPL(sys_open); /* To be deleted for 2.6.25 */
 
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   int mode)
diff --git a/fs/read_write.c b/fs/read_write.c
index 124693e8d3fa..ea1f94cc722e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -370,7 +370,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(sys_read);
+EXPORT_UNUSED_SYMBOL_GPL(sys_read); /* to be deleted for 2.6.25 */
 
 asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
 {
-- 
cgit v1.2.3


From dbaf4c024a657175f43b5091c4fab8b9f0e17078 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 14 Nov 2007 17:00:18 -0800
Subject: smbfs: fix debug builds

Fix some warnings with SMBFS_DEBUG_* builds.  This patch makes it so that
builds with -Werror don't fail.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/smbfs/file.c   | 7 ++++---
 fs/smbfs/inode.c  | 2 +-
 fs/smbfs/proc.c   | 2 +-
 fs/smbfs/smbiod.c | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index f5d14cebc75a..efbe29af3d7a 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -234,7 +234,7 @@ smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 
 	VERBOSE("before read, size=%ld, flags=%x, atime=%ld\n",
 		(long)dentry->d_inode->i_size,
-		dentry->d_inode->i_flags, dentry->d_inode->i_atime);
+		dentry->d_inode->i_flags, dentry->d_inode->i_atime.tv_sec);
 
 	status = generic_file_aio_read(iocb, iov, nr_segs, pos);
 out:
@@ -269,7 +269,7 @@ smb_file_splice_read(struct file *file, loff_t *ppos,
 	struct dentry *dentry = file->f_path.dentry;
 	ssize_t status;
 
-	VERBOSE("file %s/%s, pos=%Ld, count=%d\n",
+	VERBOSE("file %s/%s, pos=%Ld, count=%lu\n",
 		DENTRY_PATH(dentry), *ppos, count);
 
 	status = smb_revalidate_inode(dentry);
@@ -363,7 +363,8 @@ smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 		VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
 			(long) file->f_pos, (long) dentry->d_inode->i_size,
-			dentry->d_inode->i_mtime, dentry->d_inode->i_atime);
+			dentry->d_inode->i_mtime.tv_sec,
+			dentry->d_inode->i_atime.tv_sec);
 	}
 out:
 	return result;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index ab517755ece0..9416ead0c7aa 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -536,7 +536,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 
 	/* Allocate the global temp buffer and some superblock helper structs */
 	/* FIXME: move these to the smb_sb_info struct */
-	VERBOSE("alloc chunk = %d\n", sizeof(struct smb_ops) +
+	VERBOSE("alloc chunk = %lu\n", sizeof(struct smb_ops) +
 		sizeof(struct smb_mount_data_kernel));
 	mem = kmalloc(sizeof(struct smb_ops) +
 		      sizeof(struct smb_mount_data_kernel), GFP_KERNEL);
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index feac46050619..d517a27b7f4b 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -2593,7 +2593,7 @@ smb_proc_getattr_ff(struct smb_sb_info *server, struct dentry *dentry,
 	fattr->f_mtime.tv_sec = date_dos2unix(server, date, time);
 	fattr->f_mtime.tv_nsec = 0;
 	VERBOSE("name=%s, date=%x, time=%x, mtime=%ld\n",
-		mask, date, time, fattr->f_mtime);
+		mask, date, time, fattr->f_mtime.tv_sec);
 	fattr->f_size = DVAL(req->rq_data, 12);
 	/* ULONG allocation size */
 	fattr->attr = WVAL(req->rq_data, 20);
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 283c5720c9de..fae8e85af0ed 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -227,7 +227,7 @@ int smbiod_retry(struct smb_sb_info *server)
 		printk(KERN_ERR "smb_retry: signal failed [%d]\n", result);
 		goto out;
 	}
-	VERBOSE("signalled pid %d\n", pid);
+	VERBOSE("signalled pid %d\n", pid_nr(pid));
 
 	/* FIXME: The retried requests should perhaps get a "time boost". */
 
-- 
cgit v1.2.3


From 7c06a8dc64a2d1884bd19b4c6353d9267ae4e3e1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 14 Nov 2007 17:00:19 -0800
Subject: Fix 64KB blocksize in ext3 directories

With 64KB blocksize, a directory entry can have size 64KB which does not
fit into the 16 bits we have for entry length.  So we store 0xffff instead and
convert value when read from / written to disk.  The patch also converts
some places to use ext3_next_entry() when we are changing them anyway.

[akpm@linux-foundation.org: coding-style cleanups]
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/dir.c   | 10 +++----
 fs/ext3/namei.c | 92 ++++++++++++++++++++++++++++-----------------------------
 2 files changed, 50 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index c8e4ee3af1d0..8ca3bfd72427 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -67,7 +67,7 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 			  unsigned long offset)
 {
 	const char * error_msg = NULL;
-	const int rlen = le16_to_cpu(de->rec_len);
+	const int rlen = ext3_rec_len_from_disk(de->rec_len);
 
 	if (rlen < EXT3_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -173,10 +173,10 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (le16_to_cpu(de->rec_len) <
+				if (ext3_rec_len_from_disk(de->rec_len) <
 						EXT3_DIR_REC_LEN(1))
 					break;
-				i += le16_to_cpu(de->rec_len);
+				i += ext3_rec_len_from_disk(de->rec_len);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -197,7 +197,7 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += le16_to_cpu(de->rec_len);
+			offset += ext3_rec_len_from_disk(de->rec_len);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -219,7 +219,7 @@ revalidate:
 					goto revalidate;
 				stored ++;
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
 		brelse (bh);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ec8170adac53..4ab6f76e63d0 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -176,6 +176,16 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext3_dir_entry_2 *
+ext3_next_entry(struct ext3_dir_entry_2 *p)
+{
+	return (struct ext3_dir_entry_2 *)((char *)p +
+		ext3_rec_len_from_disk(p->rec_len));
+}
+
 /*
  * Future: use high four bits of block for coalesce-on-delete flags
  * Mask them off for now.
@@ -280,7 +290,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
 			space += EXT3_DIR_REC_LEN(de->name_len);
 			names++;
 		}
-		de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+		de = ext3_next_entry(de);
 	}
 	printk("(%i)\n", names);
 	return (struct stats) { names, space, 1 };
@@ -546,14 +556,6 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 }
 
 
-/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
-{
-	return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
-}
-
 /*
  * This function fills a red-black tree with information from a
  * directory block.  It returns the number directory entries loaded
@@ -720,7 +722,7 @@ static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
 			cond_resched();
 		}
 		/* XXX: do we need to check rec_len == 0 case? -Chris */
-		de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+		de = ext3_next_entry(de);
 	}
 	return count;
 }
@@ -822,7 +824,7 @@ static inline int search_dirblock(struct buffer_head * bh,
 			return 1;
 		}
 		/* prevent looping on a bad block */
-		de_len = le16_to_cpu(de->rec_len);
+		de_len = ext3_rec_len_from_disk(de->rec_len);
 		if (de_len <= 0)
 			return -1;
 		offset += de_len;
@@ -1130,7 +1132,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 		rec_len = EXT3_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext3_dir_entry_2 *) to)->rec_len =
-				cpu_to_le16(rec_len);
+				ext3_rec_len_to_disk(rec_len);
 		de->inode = 0;
 		map++;
 		to += rec_len;
@@ -1149,13 +1151,12 @@ static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
 
 	prev = to = de;
 	while ((char*)de < base + size) {
-		next = (struct ext3_dir_entry_2 *) ((char *) de +
-						    le16_to_cpu(de->rec_len));
+		next = ext3_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT3_DIR_REC_LEN(de->name_len);
 			if (de > to)
 				memmove(to, de, rec_len);
-			to->rec_len = cpu_to_le16(rec_len);
+			to->rec_len = ext3_rec_len_to_disk(rec_len);
 			prev = to;
 			to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
 		}
@@ -1229,8 +1230,8 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	/* Fancy dance to stay within two buffers */
 	de2 = dx_move_dirents(data1, data2, map + split, count - split);
 	de = dx_pack_dirents(data1,blocksize);
-	de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
-	de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
+	de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
 	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
 	dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1300,7 +1301,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 				return -EEXIST;
 			}
 			nlen = EXT3_DIR_REC_LEN(de->name_len);
-			rlen = le16_to_cpu(de->rec_len);
+			rlen = ext3_rec_len_from_disk(de->rec_len);
 			if ((de->inode? rlen - nlen: rlen) >= reclen)
 				break;
 			de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
@@ -1319,11 +1320,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
 	/* By now the buffer is marked for journaling */
 	nlen = EXT3_DIR_REC_LEN(de->name_len);
-	rlen = le16_to_cpu(de->rec_len);
+	rlen = ext3_rec_len_from_disk(de->rec_len);
 	if (de->inode) {
 		struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
-		de1->rec_len = cpu_to_le16(rlen - nlen);
-		de->rec_len = cpu_to_le16(nlen);
+		de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
+		de->rec_len = ext3_rec_len_to_disk(nlen);
 		de = de1;
 	}
 	de->file_type = EXT3_FT_UNKNOWN;
@@ -1400,17 +1401,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
 	/* The 0th block becomes the root, move the dirents out */
 	fde = &root->dotdot;
-	de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+	de = (struct ext3_dir_entry_2 *)((char *)fde +
+			ext3_rec_len_from_disk(fde->rec_len));
 	len = ((char *) root) + blocksize - (char *) de;
 	memcpy (data1, de, len);
 	de = (struct ext3_dir_entry_2 *) data1;
 	top = data1 + len;
-	while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+	while ((char *)(de2 = ext3_next_entry(de)) < top)
 		de = de2;
-	de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+	de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
 	/* Initialize the root; the dot dirents already exist */
 	de = (struct ext3_dir_entry_2 *) (&root->dotdot);
-	de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
+	de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
 	memset (&root->info, 0, sizeof(root->info));
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
@@ -1490,7 +1492,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(blocksize);
+	de->rec_len = ext3_rec_len_to_disk(blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
@@ -1553,7 +1555,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
-		node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+		node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
 		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext3_journal_get_write_access(handle, frame->bh);
@@ -1651,9 +1653,9 @@ static int ext3_delete_entry (handle_t *handle,
 			BUFFER_TRACE(bh, "get_write_access");
 			ext3_journal_get_write_access(handle, bh);
 			if (pde)
-				pde->rec_len =
-					cpu_to_le16(le16_to_cpu(pde->rec_len) +
-						    le16_to_cpu(de->rec_len));
+				pde->rec_len = ext3_rec_len_to_disk(
+					ext3_rec_len_from_disk(pde->rec_len) +
+					ext3_rec_len_from_disk(de->rec_len));
 			else
 				de->inode = 0;
 			dir->i_version++;
@@ -1661,10 +1663,9 @@ static int ext3_delete_entry (handle_t *handle,
 			ext3_journal_dirty_metadata(handle, bh);
 			return 0;
 		}
-		i += le16_to_cpu(de->rec_len);
+		i += ext3_rec_len_from_disk(de->rec_len);
 		pde = de;
-		de = (struct ext3_dir_entry_2 *)
-			((char *) de + le16_to_cpu(de->rec_len));
+		de = ext3_next_entry(de);
 	}
 	return -ENOENT;
 }
@@ -1798,13 +1799,13 @@ retry:
 	de = (struct ext3_dir_entry_2 *) dir_block->b_data;
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
-	de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
+	de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
 	strcpy (de->name, ".");
 	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
-	de = (struct ext3_dir_entry_2 *)
-			((char *) de + le16_to_cpu(de->rec_len));
+	de = ext3_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
-	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
+	de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
+					EXT3_DIR_REC_LEN(1));
 	de->name_len = 2;
 	strcpy (de->name, "..");
 	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1856,8 +1857,7 @@ static int empty_dir (struct inode * inode)
 		return 1;
 	}
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
-	de1 = (struct ext3_dir_entry_2 *)
-			((char *) de + le16_to_cpu(de->rec_len));
+	de1 = ext3_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
 			strcmp (".", de->name) ||
@@ -1868,9 +1868,9 @@ static int empty_dir (struct inode * inode)
 		brelse (bh);
 		return 1;
 	}
-	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
-	de = (struct ext3_dir_entry_2 *)
-			((char *) de1 + le16_to_cpu(de1->rec_len));
+	offset = ext3_rec_len_from_disk(de->rec_len) +
+			ext3_rec_len_from_disk(de1->rec_len);
+	de = ext3_next_entry(de1);
 	while (offset < inode->i_size ) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1899,9 +1899,8 @@ static int empty_dir (struct inode * inode)
 			brelse (bh);
 			return 0;
 		}
-		offset += le16_to_cpu(de->rec_len);
-		de = (struct ext3_dir_entry_2 *)
-				((char *) de + le16_to_cpu(de->rec_len));
+		offset += ext3_rec_len_from_disk(de->rec_len);
+		de = ext3_next_entry(de);
 	}
 	brelse (bh);
 	return 1;
@@ -2255,8 +2254,7 @@ retry:
 }
 
 #define PARENT_INO(buffer) \
-	((struct ext3_dir_entry_2 *) ((char *) buffer + \
-	le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode
+	(ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
 
 /*
  * Anybody can rename anything with this: the permission checks are left to the
-- 
cgit v1.2.3


From 68bf728a225b7f2045bb501854d6e7695b9b015d Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 16 Nov 2007 18:32:52 +0000
Subject: [CIFS] add ver= prefix to upcall format version

Acked-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Igor Mammedov <niallan@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index ad54a3a6e434..d79eee41e9c5 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,6 +66,11 @@ struct key_type cifs_spnego_key_type = {
 	.describe	= user_describe,
 };
 
+#define MAX_VER_STR_LEN   9 /* length of longest version string e.g.
+				strlen(";ver=0xFF") */
+#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg
+			       in future could have strlen(";sec=ntlmsspi") */
+#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
@@ -75,11 +80,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
 	size_t desc_len;
 	struct key *spnego_key;
 
-
-	/* version + ;ip{4|6}= + address + ;host=hostname +
-		;sec= + ;uid= + NULL */
-	desc_len = 4 + 5 + 32 + 1 + 5 + strlen(hostname) +
-		   strlen(";sec=krb5") + 7 + sizeof(uid_t)*2 + 1;
+	/* BB: come up with better scheme for determining length */
+	/* length of fields (with semicolons): ver=0xyz ipv4= ipaddress host=
+	   hostname sec=mechanism uid=0x uid */
+	desc_len = MAX_VER_STR_LEN + 5 + MAX_IPV6_ADDR_LEN + 1 + 6 +
+		  strlen(hostname) + MAX_MECH_STR_LEN + 8 + (sizeof(uid_t) * 2);
 	spnego_key = ERR_PTR(-ENOMEM);
 	description = kzalloc(desc_len, GFP_KERNEL);
 	if (description == NULL)
@@ -88,7 +93,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
 	dp = description;
 	/* start with version and hostname portion of UNC string */
 	spnego_key = ERR_PTR(-EINVAL);
-	sprintf(dp, "0x%2.2x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+	sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
 		hostname);
 	dp = description + strlen(description);
 
-- 
cgit v1.2.3


From 70fe7dc05596a405ee6a83265f675a544e32f7d8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 16 Nov 2007 22:21:07 +0000
Subject: [CIFS] clean up error handling in cifs_mount

Move all of the kfree's sprinkled in the middle of the function to the
end, and have the code set rc and just goto there on error. Also zero
out the password string before freeing it. Looks like this should also
fix a potential memory leak of the prepath string if an error occurs
near the end of the function.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 88 +++++++++++++++++++------------------------------------
 1 file changed, 30 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 26e1087e081f..58c509e6ac6a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1781,11 +1781,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	memset(&volume_info, 0, sizeof(struct smb_vol));
 	if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
-		kfree(volume_info.UNC);
-		kfree(volume_info.password);
-		kfree(volume_info.prepath);
-		FreeXid(xid);
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	}
 
 	if (volume_info.nullauth) {
@@ -1798,11 +1795,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		cifserror("No username specified");
 	/* In userspace mount helper we can get user name from alternate
 	   locations such as env variables and files on disk */
-		kfree(volume_info.UNC);
-		kfree(volume_info.password);
-		kfree(volume_info.prepath);
-		FreeXid(xid);
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	}
 
 	if (volume_info.UNCip && volume_info.UNC) {
@@ -1821,11 +1815,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 		if (rc <= 0) {
 			/* we failed translating address */
-			kfree(volume_info.UNC);
-			kfree(volume_info.password);
-			kfree(volume_info.prepath);
-			FreeXid(xid);
-			return -EINVAL;
+			rc = -EINVAL;
+			goto out;
 		}
 
 		cFYI(1, ("UNC: %s ip: %s", volume_info.UNC, volume_info.UNCip));
@@ -1835,20 +1826,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		/* BB using ip addr as server name to connect to the
 		   DFS root below */
 		cERROR(1, ("Connecting to DFS root not implemented yet"));
-		kfree(volume_info.UNC);
-		kfree(volume_info.password);
-		kfree(volume_info.prepath);
-		FreeXid(xid);
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	} else /* which servers DFS root would we conect to */ {
 		cERROR(1,
 		       ("CIFS mount error: No UNC path (e.g. -o "
 			"unc=//192.168.1.100/public) specified"));
-		kfree(volume_info.UNC);
-		kfree(volume_info.password);
-		kfree(volume_info.prepath);
-		FreeXid(xid);
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	}
 
 	/* this is needed for ASCII cp to Unicode converts */
@@ -1860,11 +1845,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		if (cifs_sb->local_nls == NULL) {
 			cERROR(1, ("CIFS mount error: iocharset %s not found",
 				 volume_info.iocharset));
-			kfree(volume_info.UNC);
-			kfree(volume_info.password);
-			kfree(volume_info.prepath);
-			FreeXid(xid);
-			return -ELIBACC;
+			rc = -ELIBACC;
+			goto out;
 		}
 	}
 
@@ -1878,11 +1860,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			&sin_server6.sin6_addr,
 			volume_info.username, &srvTcp);
 	} else {
-		kfree(volume_info.UNC);
-		kfree(volume_info.password);
-		kfree(volume_info.prepath);
-		FreeXid(xid);
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	}
 
 	if (srvTcp) {
@@ -1906,22 +1885,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				   "Aborting operation"));
 			if (csocket != NULL)
 				sock_release(csocket);
-			kfree(volume_info.UNC);
-			kfree(volume_info.password);
-			kfree(volume_info.prepath);
-			FreeXid(xid);
-			return rc;
+			goto out;
 		}
 
 		srvTcp = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL);
 		if (!srvTcp) {
 			rc = -ENOMEM;
 			sock_release(csocket);
-			kfree(volume_info.UNC);
-			kfree(volume_info.password);
-			kfree(volume_info.prepath);
-			FreeXid(xid);
-			return rc;
+			goto out;
 		} else {
 			memcpy(&srvTcp->addr.sockAddr, &sin_server,
 				sizeof(struct sockaddr_in));
@@ -1943,11 +1914,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				cERROR(1, ("error %d create cifsd thread", rc));
 				srvTcp->tsk = NULL;
 				sock_release(csocket);
-				kfree(volume_info.UNC);
-				kfree(volume_info.password);
-				kfree(volume_info.prepath);
-				FreeXid(xid);
-				return rc;
+				goto out;
 			}
 			wait_for_completion(&cifsd_complete);
 			rc = 0;
@@ -1962,8 +1929,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (existingCifsSes) {
 		pSesInfo = existingCifsSes;
 		cFYI(1, ("Existing smb sess found"));
-		kfree(volume_info.password);
-		/* volume_info.UNC freed at end of function */
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
@@ -1977,8 +1942,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 		if (!rc) {
 			/* volume_info.password freed at unmount */
-			if (volume_info.password)
+			if (volume_info.password) {
 				pSesInfo->password = volume_info.password;
+				/* set to NULL to prevent freeing on exit */
+				volume_info.password = NULL;
+			}
 			if (volume_info.username)
 				strncpy(pSesInfo->userName,
 					volume_info.username,
@@ -2000,8 +1968,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			up(&pSesInfo->sesSem);
 			if (!rc)
 				atomic_inc(&srvTcp->socketUseCount);
-		} else
-			kfree(volume_info.password);
+		}
 	}
 
 	/* search for existing tcon to this server share */
@@ -2106,9 +2073,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 						"", cifs_sb->local_nls,
 						cifs_sb->mnt_cifs_flags &
 						  CIFS_MOUNT_MAP_SPECIAL_CHR);
-					kfree(volume_info.UNC);
-					FreeXid(xid);
-					return -ENODEV;
+					rc = -ENODEV;
+					goto out;
 				} else {
 					/* BB Do we need to wrap sesSem around
 					 * this TCon call and Unix SetFS as
@@ -2231,6 +2197,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	(in which case it is not needed anymore) but when new sesion is created
 	the password ptr is put in the new session structure (in which case the
 	password will be freed at unmount time) */
+out:
+	/* zero out password before freeing */
+	if (volume_info.password != NULL) {
+		memset(volume_info.password, 0, strlen(volume_info.password));
+		kfree(volume_info.password);
+	}
 	kfree(volume_info.UNC);
 	kfree(volume_info.prepath);
 	FreeXid(xid);
-- 
cgit v1.2.3


From c359cf3c61c6ea9f4f461a8bd22023a15d75d9b5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 16 Nov 2007 22:22:06 +0000
Subject: [CIFS] add hostname field to TCP_Server_Info struct

...and populate it with the hostname portion of the UNC string.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/connect.c  | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4ff8179df7ec..3525082f5e58 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -139,6 +139,7 @@ struct TCP_Server_Info {
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
+	char *hostname; /* hostname portion of UNC string */
 	struct socket *ssocket;
 	union {
 		struct sockaddr_in sockAddr;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 58c509e6ac6a..98ec57ff4d98 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -752,6 +752,7 @@ multi_t2_fnd:
 	}
 	write_unlock(&GlobalSMBSeslock);
 
+	kfree(server->hostname);
 	kfree(server);
 	if (length  > 0)
 		mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
@@ -760,6 +761,34 @@ multi_t2_fnd:
 	return 0;
 }
 
+/* extract the host portion of the UNC string */
+static char *
+extract_hostname(const char *unc)
+{
+	const char *src;
+	char *dst, *delim;
+	unsigned int len;
+
+	/* skip double chars at beginning of string */
+	/* BB: check validity of these bytes? */
+	src = unc + 2;
+
+	/* delimiter between hostname and sharename is always '\\' now */
+	delim = strchr(src, '\\');
+	if (!delim)
+		return ERR_PTR(-EINVAL);
+
+	len = delim - src;
+	dst = kmalloc((len + 1), GFP_KERNEL);
+	if (dst == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(dst, src, len);
+	dst[len] = '\0';
+
+	return dst;
+}
+
 static int
 cifs_parse_mount_options(char *options, const char *devname,
 			 struct smb_vol *vol)
@@ -1900,6 +1929,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			/* BB Add code for ipv6 case too */
 			srvTcp->ssocket = csocket;
 			srvTcp->protocolType = IPV4;
+			srvTcp->hostname = extract_hostname(volume_info.UNC);
+			if (IS_ERR(srvTcp->hostname)) {
+				rc = PTR_ERR(srvTcp->hostname);
+				sock_release(csocket);
+				goto out;
+			}
 			init_waitqueue_head(&srvTcp->response_q);
 			init_waitqueue_head(&srvTcp->request_q);
 			INIT_LIST_HEAD(&srvTcp->pending_mid_q);
@@ -1914,6 +1949,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				cERROR(1, ("error %d create cifsd thread", rc));
 				srvTcp->tsk = NULL;
 				sock_release(csocket);
+				kfree(srvTcp->hostname);
 				goto out;
 			}
 			wait_for_completion(&cifsd_complete);
-- 
cgit v1.2.3


From d6c2e4d02b72d8ae63784bdc57cfa285128de211 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 16 Nov 2007 22:23:17 +0000
Subject: [CIFS] have cifs_get_spnego_key get the hostname from TCP_Server_Info

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 3 ++-
 fs/cifs/cifsproto.h   | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index d79eee41e9c5..c466b56e0283 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -73,12 +73,13 @@ struct key_type cifs_spnego_key_type = {
 #define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
-cifs_get_spnego_key(struct cifsSesInfo *sesInfo, const char *hostname)
+cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 {
 	struct TCP_Server_Info *server = sesInfo->server;
 	char *description, *dp;
 	size_t desc_len;
 	struct key *spnego_key;
+	const char *hostname = server->hostname;
 
 	/* BB: come up with better scheme for determining length */
 	/* length of fields (with semicolons): ver=0xyz ipv4= ipaddress host=
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0c55dff2add8..3a37c6c67f6f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -77,8 +77,7 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
-extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo,
-					const char *hostname);
+extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			     const int stage,
 			     const struct nls_table *nls_cp);
-- 
cgit v1.2.3


From 8840dee9dc53883883c321d2811e9f87700d9350 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 16 Nov 2007 23:05:52 +0000
Subject: [CIFS] minor checkpatch cleanup

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h | 10 +++++-----
 fs/cifs/connect.c   |  2 +-
 fs/cifs/file.c      |  6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3a37c6c67f6f..3748104ddedb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -248,15 +248,15 @@ extern int CIFSSMBQueryReparseLinkInfo(const int xid,
 extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
-			__u16 * netfid, int *pOplock, FILE_ALL_INFO *,
+			__u16 *netfid, int *pOplock, FILE_ALL_INFO *,
 			const struct nls_table *nls_codepage, int remap);
 extern int SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
-			__u16 * netfid, int *pOplock, FILE_ALL_INFO *,
+			__u16 *netfid, int *pOplock, FILE_ALL_INFO *,
 			const struct nls_table *nls_codepage, int remap);
 extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
-			u32 posix_flags, __u64 mode, __u16 * netfid,
+			u32 posix_flags, __u64 mode, __u16 *netfid,
 			FILE_UNIX_BASIC_INFO *pRetData,
 			__u32 *pOplock, const char *name,
 			const struct nls_table *nls_codepage, int remap);
@@ -277,7 +277,7 @@ extern int CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 			const __u64 offset, unsigned int *nbytes,
 			struct kvec *iov, const int nvec, const int long_op);
 extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
-			const unsigned char *searchName, __u64 * inode_number,
+			const unsigned char *searchName, __u64 *inode_number,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 extern int cifs_convertUCSpath(char *target, const __le16 *source, int maxlen,
@@ -352,5 +352,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 		const char *local_acl, const int buflen, const int acl_type,
 		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
-			const int netfid, __u64 * pExtAttrBits, __u64 *pMask);
+			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 98ec57ff4d98..c4b32b7f4355 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1944,7 +1944,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			srvTcp->tcpStatus = CifsNew;
 			init_MUTEX(&srvTcp->tcpSem);
 			srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
-			if ( IS_ERR(srvTcp->tsk) ) {
+			if (IS_ERR(srvTcp->tsk)) {
 				rc = PTR_ERR(srvTcp->tsk);
 				cERROR(1, ("error %d create cifsd thread", rc));
 				srvTcp->tsk = NULL;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 82326d2142e7..802564196510 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1087,11 +1087,11 @@ refind_writable:
 				read_unlock(&GlobalSMBSeslock);
 				return open_file;
 			}
-	
+
 			read_unlock(&GlobalSMBSeslock);
 			/* Had to unlock since following call can block */
 			rc = cifs_reopen_file(open_file->pfile, FALSE);
-			if (!rc) { 
+			if (!rc) {
 				if (!open_file->closePend)
 					return open_file;
 				else { /* start over in case this was deleted */
@@ -1114,7 +1114,7 @@ refind_writable:
 			/* can not use this handle, no write
 			   pending on this one after all */
 			atomic_dec(&open_file->wrtPending);
-			
+
 			if (open_file->closePend) /* list could have changed */
 				goto refind_writable;
 			/* else we simply continue to the next entry. Thus
-- 
cgit v1.2.3


From 2442421b176420eca7cb68c575fc221332f488d8 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 16 Nov 2007 23:37:35 +0000
Subject: [CIFS] Have CIFS_SessSetup build correct SPNEGO SessionSetup request

Have CIFS_SessSetup call cifs_get_spnego_key when Kerberos is
negotiated. Use the info in the key payload to build a session
setup request packet. Also clean up how the request buffer in
the function is freed on error.

With appropriate user space helper (in samba/source/client). Kerberos
support (secure session establishment can be done now via Kerberos,
previously users would have to use NTLMv2 instead for more secure
session setup).

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES    |  1 +
 fs/cifs/TODO       |  2 +-
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/sess.c     | 91 ++++++++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 77 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 64dd22239b21..e31aa74f7d9e 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,6 +1,7 @@
 Version 1.52
 ------------
 Fix oops on second mount to server when null auth is used.
+Enable experimental Kerberos support
 
 Version 1.51
 ------------
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 29d4b2715254..a8852c200728 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -16,7 +16,7 @@ SecurityDescriptors
 c) Better pam/winbind integration (e.g. to handle uid mapping
 better)
 
-d) Kerberos/SPNEGO session setup support - (started)
+d) Verify that Kerberos signing works
 
 e) Cleanup now unneeded SessSetup code in
 fs/cifs/connect.c and add back in NTLMSSP code if any servers
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 3525082f5e58..1fde2197ad76 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -110,6 +110,7 @@ struct mac_key {
 	unsigned int len;
 	union {
 		char ntlm[CIFS_SESS_KEY_SIZE + 16];
+		char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */
 		struct {
 			char key[16];
 			struct ntlmv2_resp resp;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ed01ef382aa9..d0cb469daab7 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include <linux/utsname.h>
+#include "cifs_spnego.h"
 
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
@@ -340,11 +341,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	SESSION_SETUP_ANDX *pSMB;
 	__u32 capabilities;
 	int count;
-	int resp_buf_type = 0;
-	struct kvec iov[2];
+	int resp_buf_type;
+	struct kvec iov[3];
 	enum securityEnum type;
 	__u16 action;
 	int bytes_remaining;
+	struct key *spnego_key = NULL;
 
 	if (ses == NULL)
 		return -EINVAL;
@@ -377,24 +379,32 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 	capabilities = cifs_ssetup_hdr(ses, pSMB);
 
-	/* we will send the SMB in two pieces,
-	a fixed length beginning part, and a
-	second part which will include the strings
-	and rest of bcc area, in order to avoid having
-	to do a large buffer 17K allocation */
+	/* we will send the SMB in three pieces:
+	a fixed length beginning part, an optional
+	SPNEGO blob (which can be zero length), and a
+	last part which will include the strings
+	and rest of bcc area. This allows us to avoid
+	a large buffer 17K allocation */
 	iov[0].iov_base = (char *)pSMB;
 	iov[0].iov_len = smb_buf->smb_buf_length + 4;
 
+	/* setting this here allows the code at the end of the function
+	   to free the request buffer if there's an error */
+	resp_buf_type = CIFS_SMALL_BUFFER;
+
 	/* 2000 big enough to fit max user, domain, NOS name etc. */
 	str_area = kmalloc(2000, GFP_KERNEL);
 	if (str_area == NULL) {
-		cifs_small_buf_release(smb_buf);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto ssetup_exit;
 	}
 	bcc_ptr = str_area;
 
 	ses->flags &= ~CIFS_SES_LANMAN;
 
+	iov[1].iov_base = NULL;
+	iov[1].iov_len = 0;
+
 	if (type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
@@ -463,8 +473,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		   struct ntlmv2_resp */
 
 		if (v2_sess_key == NULL) {
-			cifs_small_buf_release(smb_buf);
-			return -ENOMEM;
+			rc = -ENOMEM;
+			goto ssetup_exit;
 		}
 
 		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
@@ -499,21 +509,66 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else /* NTLMSSP or SPNEGO */ {
+	} else if (type == Kerberos) {
+#ifdef CONFIG_CIFS_UPCALL
+		struct cifs_spnego_msg *msg;
+		spnego_key = cifs_get_spnego_key(ses);
+		if (IS_ERR(spnego_key)) {
+			rc = PTR_ERR(spnego_key);
+			spnego_key = NULL;
+			goto ssetup_exit;
+		}
+
+		msg = spnego_key->payload.data;
+		/* bail out if key is too long */
+		if (msg->sesskey_len >
+		    sizeof(ses->server->mac_signing_key.data.krb5)) {
+			cERROR(1, ("Kerberos signing key too long (%u bytes)",
+				msg->sesskey_len));
+			rc = -EOVERFLOW;
+			goto ssetup_exit;
+		}
+		ses->server->mac_signing_key.len = msg->sesskey_len;
+		memcpy(ses->server->mac_signing_key.data.krb5, msg->data,
+			msg->sesskey_len);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities = cpu_to_le32(capabilities);
-		/* BB set password lengths */
+		iov[1].iov_base = msg->data + msg->sesskey_len;
+		iov[1].iov_len = msg->secblob_len;
+		pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
+
+		if (ses->capabilities & CAP_UNICODE) {
+			/* unicode strings must be word aligned */
+			if (iov[0].iov_len % 2) {
+				*bcc_ptr = 0;
+				bcc_ptr++;
+			}
+			unicode_oslm_strings(&bcc_ptr, nls_cp);
+			unicode_domain_string(&bcc_ptr, ses, nls_cp);
+		} else
+		/* BB: is this right? */
+			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+#else /* ! CONFIG_CIFS_UPCALL */
+		cERROR(1, ("Kerberos negotiated but upcall support disabled!"));
+		rc = -ENOSYS;
+		goto ssetup_exit;
+#endif /* CONFIG_CIFS_UPCALL */
+	} else {
+		cERROR(1, ("secType %d not supported!", type));
+		rc = -ENOSYS;
+		goto ssetup_exit;
 	}
 
-	count = (long) bcc_ptr - (long) str_area;
+	iov[2].iov_base = str_area;
+	iov[2].iov_len = (long) bcc_ptr - (long) str_area;
+
+	count = iov[1].iov_len + iov[2].iov_len;
 	smb_buf->smb_buf_length += count;
 
 	BCC_LE(smb_buf) = cpu_to_le16(count);
 
-	iov[1].iov_base = str_area;
-	iov[1].iov_len = count;
-	rc = SendReceive2(xid, ses, iov, 2 /* num_iovecs */, &resp_buf_type,
+	rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
@@ -560,6 +615,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 					 ses, nls_cp);
 
 ssetup_exit:
+	if (spnego_key)
+		key_put(spnego_key);
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
-- 
cgit v1.2.3


From f7a44eadd5a03b8455c7caab402ce96811c6903d Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 17 Nov 2007 00:01:51 +0000
Subject: [CIFS] remove build warning

CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 2 +-
 fs/cifs/cifs_spnego.h | 1 +
 fs/cifs/cifsproto.h   | 1 -
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index c466b56e0283..1529d2b12e9c 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -67,7 +67,7 @@ struct key_type cifs_spnego_key_type = {
 };
 
 #define MAX_VER_STR_LEN   9 /* length of longest version string e.g.
-				strlen(";ver=0xFF) */
+				strlen(";ver=0xFF") */
 #define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg
 			       in future could have strlen(";sec=ntlmsspi") */
 #define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index f443f3b35134..05a34b17a1ab 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -41,6 +41,7 @@ struct cifs_spnego_msg {
 
 #ifdef __KERNEL__
 extern struct key_type cifs_spnego_key_type;
+extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo);
 #endif /* KERNEL */
 
 #endif /* _CIFS_SPNEGO_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3748104ddedb..8350eec49663 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -77,7 +77,6 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
-extern struct key *cifs_get_spnego_key(struct cifsSesInfo *sesInfo);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			     const int stage,
 			     const struct nls_table *nls_cp);
-- 
cgit v1.2.3


From b09b9417d074e01a4e4ab5c19358f1b3dc76c1b2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 Oct 2007 13:56:10 -0400
Subject: NFS: Fix the ustat() regression

Since 2.6.18, the superblock sb->s_root has been a dummy dentry with a
dummy inode. This breaks ustat(), which actually uses sb->s_root in a
vfstat() call.

Fix this by making the s_root a dummy alias to the directory inode that was
used when creating the superblock.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/getroot.c | 81 +++++++++++++++++++-------------------------------------
 1 file changed, 27 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 522e5ad4d8ad..0ee43843f4ec 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -42,6 +42,25 @@
 
 #define NFSDBG_FACILITY		NFSDBG_CLIENT
 
+/*
+ * Set the superblock root dentry.
+ * Note that this function frees the inode in case of error.
+ */
+static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode)
+{
+	/* The mntroot acts as the dummy root dentry for this superblock */
+	if (sb->s_root == NULL) {
+		sb->s_root = d_alloc_root(inode);
+		if (sb->s_root == NULL) {
+			iput(inode);
+			return -ENOMEM;
+		}
+		/* Circumvent igrab(): we know the inode is not being freed */
+		atomic_inc(&inode->i_count);
+	}
+	return 0;
+}
+
 /*
  * get an NFS2/NFS3 root dentry from the root filehandle
  */
@@ -54,33 +73,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	struct inode *inode;
 	int error;
 
-	/* create a dummy root dentry with dummy inode for this superblock */
-	if (!sb->s_root) {
-		struct nfs_fh dummyfh;
-		struct dentry *root;
-		struct inode *iroot;
-
-		memset(&dummyfh, 0, sizeof(dummyfh));
-		memset(&fattr, 0, sizeof(fattr));
-		nfs_fattr_init(&fattr);
-		fattr.valid = NFS_ATTR_FATTR;
-		fattr.type = NFDIR;
-		fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
-		fattr.nlink = 2;
-
-		iroot = nfs_fhget(sb, &dummyfh, &fattr);
-		if (IS_ERR(iroot))
-			return ERR_PTR(PTR_ERR(iroot));
-
-		root = d_alloc_root(iroot);
-		if (!root) {
-			iput(iroot);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		sb->s_root = root;
-	}
-
 	/* get the actual root for this mount */
 	fsinfo.fattr = &fattr;
 
@@ -96,6 +88,10 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 		return ERR_PTR(PTR_ERR(inode));
 	}
 
+	error = nfs_superblock_set_dummy_root(sb, inode);
+	if (error != 0)
+		return ERR_PTR(error);
+
 	/* root dentries normally start off anonymous and get spliced in later
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
@@ -241,33 +237,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 
 	dprintk("--> nfs4_get_root()\n");
 
-	/* create a dummy root dentry with dummy inode for this superblock */
-	if (!sb->s_root) {
-		struct nfs_fh dummyfh;
-		struct dentry *root;
-		struct inode *iroot;
-
-		memset(&dummyfh, 0, sizeof(dummyfh));
-		memset(&fattr, 0, sizeof(fattr));
-		nfs_fattr_init(&fattr);
-		fattr.valid = NFS_ATTR_FATTR;
-		fattr.type = NFDIR;
-		fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
-		fattr.nlink = 2;
-
-		iroot = nfs_fhget(sb, &dummyfh, &fattr);
-		if (IS_ERR(iroot))
-			return ERR_PTR(PTR_ERR(iroot));
-
-		root = d_alloc_root(iroot);
-		if (!root) {
-			iput(iroot);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		sb->s_root = root;
-	}
-
 	/* get the info about the server and filesystem */
 	error = nfs4_server_capabilities(server, mntfh);
 	if (error < 0) {
@@ -289,6 +258,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 		return ERR_PTR(PTR_ERR(inode));
 	}
 
+	error = nfs_superblock_set_dummy_root(sb, inode);
+	if (error != 0)
+		return ERR_PTR(error);
+
 	/* root dentries normally start off anonymous and get spliced in later
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
-- 
cgit v1.2.3


From 4c1fe2f78a08e2c514a39c91a0eb7b55bbd3c0d2 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Thu, 1 Nov 2007 16:50:20 +1100
Subject: kernel BUG at fs/nfs/namespace.c:108! - can be triggered by bad
 server

Hi Trond,

I have discovered that the BUG_ON in nfs_follow_mountpoint:

	BUG_ON(IS_ROOT(dentry));

can be triggered by a misbehaving server.

What happens is the client does a lookup and discovers that the named
directory has a different fsid, so it initiates a mount.
It then performs a GETATTR on the mounted directory and gets a
different fsid again (due to a bug in the NFS server).
This causes nfs_follow_mountpoint to be called on the newly mounted
root, which triggers the BUG_ON.

To duplicate this, have a directory which contains some mountpoints,
and export that directory with the "crossmnt" flag using nfs-utils
1.1.1 (or 1.1.0 I think)

The GETATTR on the root of the mounted filesystem will return the
information for the top exportpoint, while a lookup will return the
correct information.  This difference causes the NFS client to BUG.

I think the best way to fix this is to trap this possibility early, so
just before completing the mount in the NFS client, check that it isn't
going to use nfs_mountpoint_inode_operations.
As long as i_op will never change once set (is that true?), this
should be adequately safe.

The following patch shows a possible approach, and it works for me.
i.e. when the NFS server is misbehaving, I get ESTALE on those
mountpoints, while when the NFS server is working correctly, I get
correct behaviour on the client.

NeilBrown

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fa517ae9207f..71067d1ac9d9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1474,6 +1474,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
 	}
+	if (mntroot->d_inode->i_op != &nfs_dir_inode_operations) {
+		dput(mntroot);
+		error = -ESTALE;
+		goto error_splat_super;
+	}
 
 	s->s_flags |= MS_ACTIVE;
 	mnt->mnt_sb = s;
-- 
cgit v1.2.3


From 2a97468024fb5b6eccee2a67a7796485c829343a Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.cz>
Date: Tue, 20 Nov 2007 02:24:08 +0000
Subject: [CIFS] Fix spurious reconnect on 2nd peek from read of SMB length

When retrying kernel_recvmsg() because of a short read, check returned
length against the remaining length, not against total length. This
avoids unneeded session reconnects which would otherwise occur when
kernel_recvmsg() finally returns zero when asked to read zero bytes.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c4b32b7f4355..fd9147cdb5a9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -438,9 +438,9 @@ incomplete_rcv:
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
 			continue;
-		} else if (length < 4) {
-			cFYI(1, ("less than four bytes received (%d bytes)",
-			      length));
+		} else if (length < pdu_length) {
+			cFYI(1, ("requested %d bytes but only got %d bytes",
+				  pdu_length, length));
 			pdu_length -= length;
 			msleep(1);
 			goto incomplete_rcv;
-- 
cgit v1.2.3


From cea218054ad277d6c126890213afde07b4eb1602 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 20 Nov 2007 23:19:03 +0000
Subject: [CIFS] Fix potential data corruption when writing out cached dirty
 pages

Fix RedHat bug 329431

The idea here is separate "conscious" from "unconscious" flushes.
Conscious flushes are those due to a fsync() or close(). Unconscious
ones are flushes that occur as a side effect of some other operation or
due to memory pressure.

Currently, when an error occurs during an unconscious flush (ENOSPC or
EIO), we toss out the page and don't preserve that error to report to
the user when a conscious flush occurs. If after the unconscious flush,
there are no more dirty pages for the inode, the conscious flush will
simply return success even though there were previous errors when writing
out pages. This can lead to data corruption.

The easiest way to reproduce this is to mount up a CIFS share that's
very close to being full or where the user is very close to quota. mv
a file to the share that's slightly larger than the quota allows. The
writes will all succeed (since they go to pagecache). The mv will do a
setattr to set the new file's attributes. This calls
filemap_write_and_wait,
which will return an error since all of the pages can't be written out.
Then later, when the flush and release ops occur, there are no more
dirty pages in pagecache for the file and those operations return 0. mv
then assumes that the file was written out correctly and deletes the
original.

CIFS already has a write_behind_rc variable where it stores the results
from earlier flushes, but that value is only reported in cifs_close.
Since the VFS ignores the return value from the release operation, this
isn't helpful. We should be reporting this error during the flush
operation.

This patch does the following:

1) changes cifs_fsync to use filemap_write_and_wait, and has both cifs_fsync
and cifs_flush check the return code of the write-out. If it returns success,
they then check the value of write_behind_rc to see if an earlier flush had
reported any errors. If so, they return that error and clear write_behind_rc.

2) sets write_behind_rc in a few other places where pages are written
out as a side effect of other operations and the code waits on them.

3) changes cifs_setattr to only call filemap_write_and_wait for
ATTR_SIZE changes.

4) makes cifs_writepages accurately distinguish between EIO and ENOSPC
errors when writing out pages.

Some simple testing indicates that the patch works as expected and that
it fixes the known reproducible problem.

Acked-by: Dave Kleikamp <shaggy@austin.rr.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES  |  4 +++-
 fs/cifs/README   | 27 ++++++++++++---------------
 fs/cifs/cifsfs.c |  7 +++++--
 fs/cifs/file.c   | 24 ++++++++++++++++++------
 fs/cifs/inode.c  | 26 ++++++++++++++++++++------
 5 files changed, 58 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e31aa74f7d9e..a609599287aa 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,7 +1,9 @@
 Version 1.52
 ------------
 Fix oops on second mount to server when null auth is used.
-Enable experimental Kerberos support
+Enable experimental Kerberos support.  Return writebehind errors on flush
+and sync so that events like out of disk space get reported properly on
+cached files.
 
 Version 1.51
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index b806b11b5560..bf11329ac784 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -225,12 +225,9 @@ If no password is provided, mount.cifs will prompt for password entry
 
 Restrictions
 ============
-Servers must support the NTLM SMB dialect (which is the most recent, supported 
-by Samba and Windows NT version 4, 2000 and XP and many other SMB/CIFS servers) 
 Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC 
-1001/1002 support for "Netbios-Over-TCP/IP." Neither of these is likely to be a 
-problem as most servers support this.  IPv6 support is planned for the future,
-and is almost complete.
+1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a 
+problem as most servers support this.
 
 Valid filenames differ between Windows and Linux.  Windows typically restricts
 filenames which contain certain reserved characters (e.g.the character : 
@@ -458,6 +455,8 @@ A partial list of the supported mount options follows:
 		byte range locks).
  remount        remount the share (often used to change from ro to rw mounts
 	        or vice versa)
+ cifsacl        Report mode bits (e.g. on stat) based on the Windows ACL for
+	        the file. (EXPERIMENTAL)
  servern        Specify the server 's netbios name (RFC1001 name) to use
 		when attempting to setup a session to the server.  This is
 		This is needed for mounting to some older servers (such
@@ -584,8 +583,8 @@ Experimental            When set to 1 used to enable certain experimental
 			performance enhancement was disabled when
 			signing turned on in case buffer was modified
 			just before it was sent, also this flag will
-			be used to use the new experimental sessionsetup
-			code).
+			be used to use the new experimental directory change 
+			notification code).
 
 These experimental features and tracing can be enabled by changing flags in 
 /proc/fs/cifs (after the cifs module has been installed or built into the 
@@ -608,7 +607,8 @@ the start of smb requests and responses can be enabled via:
 Two other experimental features are under development. To test these
 requires enabling CONFIG_CIFS_EXPERIMENTAL
 
-	ipv6 enablement
+	cifsacl support needed to retrieve approximated mode bits based on
+		the contents on the CIFS ACL.
 
 	DNOTIFY fcntl: needed for support of directory change 
 			    notification and perhaps later for file leases)
@@ -625,10 +625,7 @@ that they represent all for that share, not just those for which the server
 returned success.
 	
 Also note that "cat /proc/fs/cifs/DebugData" will display information about 
-the active sessions and the shares that are mounted.  Note: NTLMv2 enablement 
-will not work since its implementation is not quite complete yet. Do not alter
-the ExtendedSecurity configuration value unless you are doing specific testing.
-Enabling extended security works to Windows 2000 Workstations and XP but not to 
-Windows 2000 server or Samba since it does not usually send "raw NTLMSSP" 
-(instead it sends NTLMSSP encapsulated in SPNEGO/GSSAPI, which support is not 
-complete in the CIFS VFS yet).  
+the active sessions and the shares that are mounted.
+Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is enabled
+but requires a user space helper (from the Samba project). NTLM and NTLMv2 and
+LANMAN support do not require this helpr.
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 416dc9fe8961..093beaa3900d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -266,6 +266,7 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->cifsAttrs = 0x20;	/* default */
 	atomic_set(&cifs_inode->inUse, 0);
 	cifs_inode->time = 0;
+	cifs_inode->write_behind_rc = 0;
 	/* Until the file is open and we have gotten oplock
 	info back from the server, can not assume caching of
 	file data or metadata */
@@ -852,7 +853,7 @@ static int cifs_oplock_thread(void *dummyarg)
 	struct cifsTconInfo *pTcon;
 	struct inode *inode;
 	__u16  netfid;
-	int rc;
+	int rc, waitrc = 0;
 
 	set_freezable();
 	do {
@@ -884,9 +885,11 @@ static int cifs_oplock_thread(void *dummyarg)
 					   filemap_fdatawrite(inode->i_mapping);
 					if (CIFS_I(inode)->clientCanCacheRead
 									 == 0) {
-						filemap_fdatawait(inode->i_mapping);
+						waitrc = filemap_fdatawait(inode->i_mapping);
 						invalidate_remote_inode(inode);
 					}
+					if (rc == 0)
+						rc = waitrc;
 				} else
 					rc = 0;
 				/* mutex_unlock(&inode->i_mutex);*/
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 802564196510..dd26e2759b17 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -130,7 +130,9 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 		if (file->f_path.dentry->d_inode->i_mapping) {
 		/* BB no need to lock inode until after invalidate
 		   since namei code should already have it locked? */
-			filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+			if (rc != 0)
+				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
 		}
 		cFYI(1, ("invalidating remote inode since open detected it "
 			 "changed"));
@@ -425,7 +427,9 @@ reopen_error_exit:
 		pCifsInode = CIFS_I(inode);
 		if (pCifsInode) {
 			if (can_flush) {
-				filemap_write_and_wait(inode->i_mapping);
+				rc = filemap_write_and_wait(inode->i_mapping);
+				if (rc != 0)
+					CIFS_I(inode)->write_behind_rc = rc;
 			/* temporarily disable caching while we
 			   go to server to get inode info */
 				pCifsInode->clientCanCacheAll = FALSE;
@@ -1367,7 +1371,10 @@ retry:
 						  rc, bytes_written));
 					/* BB what if continued retry is
 					   requested via mount flags? */
-					set_bit(AS_EIO, &mapping->flags);
+					if (rc == -ENOSPC)
+						set_bit(AS_ENOSPC, &mapping->flags);
+					else
+						set_bit(AS_EIO, &mapping->flags);
 				} else {
 					cifs_stats_bytes_written(cifs_sb->tcon,
 								 bytes_written);
@@ -1499,9 +1506,11 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	cFYI(1, ("Sync file - name: %s datasync: 0x%x",
 		dentry->d_name.name, datasync));
 
-	rc = filemap_fdatawrite(inode->i_mapping);
-	if (rc == 0)
+	rc = filemap_write_and_wait(inode->i_mapping);
+	if (rc == 0) {
+		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
+	}
 	FreeXid(xid);
 	return rc;
 }
@@ -1553,8 +1562,11 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	   filemapfdatawrite appears easier for the time being */
 
 	rc = filemap_fdatawrite(inode->i_mapping);
-	if (!rc) /* reset wb rc if we were able to write out dirty pages */
+	/* reset wb rc if we were able to write out dirty pages */
+	if (!rc) {
+		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
+	}
 
 	cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc));
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7d907e84e032..e915eb1d2e66 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1233,7 +1233,7 @@ cifs_rename_exit:
 int cifs_revalidate(struct dentry *direntry)
 {
 	int xid;
-	int rc = 0;
+	int rc = 0, wbrc = 0;
 	char *full_path;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsInodeInfo *cifsInode;
@@ -1333,7 +1333,9 @@ int cifs_revalidate(struct dentry *direntry)
 	if (direntry->d_inode->i_mapping) {
 		/* do we need to lock inode until after invalidate completes
 		   below? */
-		filemap_fdatawrite(direntry->d_inode->i_mapping);
+		wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
+		if (wbrc)
+			CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
 	}
 	if (invalidate_inode) {
 	/* shrink_dcache not necessary now that cifs dentry ops
@@ -1342,7 +1344,9 @@ int cifs_revalidate(struct dentry *direntry)
 			shrink_dcache_parent(direntry); */
 		if (S_ISREG(direntry->d_inode->i_mode)) {
 			if (direntry->d_inode->i_mapping)
-				filemap_fdatawait(direntry->d_inode->i_mapping);
+				wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
+				if (wbrc)
+					CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
 			/* may eventually have to do this for open files too */
 			if (list_empty(&(cifsInode->openFileList))) {
 				/* changed on server - flush read ahead pages */
@@ -1485,10 +1489,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 
 	/* BB check if we need to refresh inode from server now ? BB */
 
-	/* need to flush data before changing file size on server */
-	filemap_write_and_wait(direntry->d_inode->i_mapping);
-
 	if (attrs->ia_valid & ATTR_SIZE) {
+		/*
+		   Flush data before changing file size on server. If the
+		   flush returns error, store it to report later and continue.
+		   BB: This should be smarter. Why bother flushing pages that
+		   will be truncated anyway? Also, should we error out here if
+		   the flush returns error?
+		 */
+		rc = filemap_write_and_wait(direntry->d_inode->i_mapping);
+		if (rc != 0) {
+			CIFS_I(direntry->d_inode)->write_behind_rc = rc;
+			rc = 0;
+		}
+
 		/* To avoid spurious oplock breaks from server, in the case of
 		   inodes that we already have open, avoid doing path based
 		   setting of file size if we can do it by handle.
-- 
cgit v1.2.3


From 2b83457bded19cb57c5bdd59ebe16fe1a919c088 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 25 Nov 2007 10:01:00 +0000
Subject: [CIFS] Fix check after use error in ACL code

Spotted by the coverity scanner.

CC: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index dabbce00712b..f02fdef463a7 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -269,6 +269,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 	/* BB need to add parm so we can store the SID BB */
 
+	if (!pdacl) {
+		/* no DACL in the security descriptor, set
+		   all the permissions for user/group/other */
+		inode->i_mode |= S_IRWXUGO;
+		return;
+	}
+
 	/* validate that we do not go past end of acl */
 	if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
 		cERROR(1, ("ACL too small to parse DACL"));
@@ -286,12 +293,6 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 	   user/group/other have no permissions */
 	inode->i_mode &= ~(S_IRWXUGO);
 
-	if (!pdacl) {
-		/* no DACL in the security descriptor, set
-		   all the permissions for user/group/other */
-		inode->i_mode |= S_IRWXUGO;
-		return;
-	}
 	acl_base = (char *)pdacl;
 	acl_size = sizeof(struct cifs_acl);
 
-- 
cgit v1.2.3


From 08e4570a4a393bcc241f78dfc444cb0b07995fc0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 26 Nov 2007 21:21:49 +0100
Subject: sched: fix prev_stime calculation

Srivatsa Vaddagiri noticed occasionally incorrect CPU usage
values in top and tracked it down to stime going below 0 in
task_stime(). Negative values are possible there due to the
sampled nature of stime/utime.

Fix suggested by Balbir Singh.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
 fs/proc/array.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index eba339ecba27..65c62e1bfd6f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -374,7 +374,9 @@ static cputime_t task_stime(struct task_struct *p)
 	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
 			cputime_to_clock_t(task_utime(p));
 
-	p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+	if (stime >= 0)
+		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
 	return p->prev_stime;
 }
 #endif
-- 
cgit v1.2.3


From f16c960332b125491178fc2da7ea7893b0d65d05 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Fri, 16 Nov 2007 22:13:24 +0000
Subject: NFS: mount failure causes bad page state

While testing a kernel based upon ecd744eec3aa8bbc949ec04ed3fbf7ecb2958a0e
(with wrong boot arguments), I got the following bad page state entry while
NFS was trying to mount its rootfs:

IP-Config: Complete:
      device=eth0, addr=192.168.1.101, mask=255.255.255.0, gw=255.255.255.255,
     host=192.168.1.101, domain=, nis-domain=(none),
     bootserver=192.168.1.100, rootserver=192.168.1.100, rootpath=
Looking up port of RPC 100003/2 on 192.168.1.100
rpcbind: server 192.168.1.100 not responding, timed out
Root-NFS: Unable to get nfsd port number from server, using default
Looking up port of RPC 100005/1 on 192.168.1.100
rpcbind: server 192.168.1.100 not responding, timed out
Root-NFS: Unable to get mountd port number from server, using default
mount: server 192.168.1.100 not responding, timed out
Root-NFS: Server returned error -5 while mounting /nfs/rootfs/
VFS: Unable to mount root fs via NFS, trying floppy.
Bad page state in process 'swapper'
page:c02b1260 flags:0x00000400 mapping:00000000 mapcount:0 count:0
Trying to fix it up, but a reboot is needed
Backtrace:
[<c0023e34>] (dump_stack+0x0/0x14) from [<c0062570>] (bad_page+0x70/0xac)
[<c0062500>] (bad_page+0x0/0xac) from [<c0064914>] (free_hot_cold_page+0x80/0x178)
[<c0064894>] (free_hot_cold_page+0x0/0x178) from [<c0064a74>] (free_hot_page+0x14/0x18)
[<c0064a60>] (free_hot_page+0x0/0x18) from [<c0067078>] (put_page+0xf8/0x154)
[<c0066f80>] (put_page+0x0/0x154) from [<c007dbc8>] (kfree+0xc8/0xd0)
[<c007db00>] (kfree+0x0/0xd0) from [<c00cbb54>] (nfs_get_sb+0x230/0x710)
[<c00cb924>] (nfs_get_sb+0x0/0x710) from [<c0084334>] (vfs_kern_mount+0x58/0xac)
[<c00842dc>] (vfs_kern_mount+0x0/0xac) from [<c00843c0>] (do_kern_mount+0x38/0xf4)
[<c0084388>] (do_kern_mount+0x0/0xf4) from [<c0099c7c>] (do_mount+0x1e8/0x614)
...

This seems to be caused by use of an uninitialised structure due to NULL
options being passed to nfs_validate_mount_data().  Ensure that the
parsed mount data is always initialised.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
     (Trond: added fix for the same bug in nfs4_validate_mount_data()).
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 71067d1ac9d9..2426e713b77f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1054,10 +1054,11 @@ static int nfs_validate_mount_data(void *options,
 {
 	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
 
+	memset(args, 0, sizeof(*args));
+
 	if (data == NULL)
 		goto out_no_data;
 
-	memset(args, 0, sizeof(*args));
 	args->flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
 	args->rsize		= NFS_MAX_FILE_IO_SIZE;
 	args->wsize		= NFS_MAX_FILE_IO_SIZE;
@@ -1536,10 +1537,11 @@ static int nfs4_validate_mount_data(void *options,
 	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
 	char *c;
 
+	memset(args, 0, sizeof(*args));
+
 	if (data == NULL)
 		goto out_no_data;
 
-	memset(args, 0, sizeof(*args));
 	args->rsize		= NFS_MAX_FILE_IO_SIZE;
 	args->wsize		= NFS_MAX_FILE_IO_SIZE;
 	args->timeo		= 600;
-- 
cgit v1.2.3


From 5334eb13d455dd26b7064980b118e3c957929701 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 21 Nov 2007 15:04:31 -0800
Subject: NFS: make nfs_wb_page_priority() static

nfs_wb_page_priority() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 89527a487ed7..51cc1bd6a116 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1436,7 +1436,8 @@ out:
 	return ret;
 }
 
-int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
+static int nfs_wb_page_priority(struct inode *inode, struct page *page,
+				int how)
 {
 	loff_t range_start = page_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
-- 
cgit v1.2.3


From 4c30d56edcaaa0446370189e8ab5c5393dc20ca3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 21 Nov 2007 15:04:31 -0800
Subject: NFS: fs/nfs/dir.c should #include "internal.h"

Every file should include the headers containing the prototypes for its global
functions (in this case nfs_access_cache_shrinker()).

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 35334539d947..f697b5c74b7c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -38,6 +38,7 @@
 #include "nfs4_fs.h"
 #include "delegation.h"
 #include "iostat.h"
+#include "internal.h"
 
 /* #define NFS_DEBUG_VERBOSE 1 */
 
-- 
cgit v1.2.3


From 19f737879cc623c3aa73e655465faa3bff121768 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Nov 2007 12:16:47 -0500
Subject: NFS: Introduce iovec I/O helpers to fs/nfs/direct.c

Add helpers that iterate over multi-segment iovecs.  These will
be used to support multi-segment scatter/gather direct I/O in a
later patch.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index afcab007a22b..e30d9285a566 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -355,6 +355,41 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+					      const struct iovec *iov,
+					      unsigned long nr_segs,
+					      loff_t pos)
+{
+	ssize_t result = -EINVAL;
+	size_t requested_bytes = 0;
+	unsigned long seg;
+
+	get_dreq(dreq);
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *vec = &iov[seg];
+		result = nfs_direct_read_schedule(dreq,
+					(unsigned long)vec->iov_base,
+						  vec->iov_len, pos);
+		if (result < 0)
+			break;
+		requested_bytes += result;
+		if ((size_t)result < vec->iov_len)
+			break;
+		pos += vec->iov_len;
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_complete(dreq);
+
+	if (requested_bytes != 0)
+		return 0;
+
+	if (result < 0)
+		return result;
+	return -EIO;
+}
+
 static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result = 0;
@@ -697,6 +732,42 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
+static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
+					       const struct iovec *iov,
+					       unsigned long nr_segs,
+					       loff_t pos, int sync)
+{
+	ssize_t result = 0;
+	size_t requested_bytes = 0;
+	unsigned long seg;
+
+	get_dreq(dreq);
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *vec = &iov[seg];
+		result = nfs_direct_write_schedule(dreq,
+					(unsigned long)vec->iov_base,
+						   vec->iov_len,
+						   pos, sync);
+		if (result < 0)
+			break;
+		requested_bytes += result;
+		if ((size_t)result < vec->iov_len)
+			break;
+		pos += vec->iov_len;
+	}
+
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, dreq->inode);
+
+	if (requested_bytes != 0)
+		return 0;
+
+	if (result < 0)
+		return result;
+	return -EIO;
+}
+
 static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result = 0;
-- 
cgit v1.2.3


From c216fd708e1a97431925ecffd6d1896cff61df0a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Nov 2007 12:16:52 -0500
Subject: NFS: Support multiple segment iovecs in the NFS direct I/O path

Allow applications to perform asynchronous scatter-gather direct I/O
to NFS files.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 67 ++++++++++++++++++++-------------------------------------
 1 file changed, 23 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e30d9285a566..88d5d1c7f987 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,8 +272,6 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 	int result;
 	ssize_t started = 0;
 
-	get_dreq(dreq);
-
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
@@ -347,11 +345,8 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 		count -= bytes;
 	} while (count != 0);
 
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
-
 	if (started)
-		return 0;
+		return started;
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
@@ -390,7 +385,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	return -EIO;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
 {
 	ssize_t result = 0;
 	sigset_t oldset;
@@ -407,9 +403,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
+	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
@@ -645,8 +640,6 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 	int result;
 	ssize_t started = 0;
 
-	get_dreq(dreq);
-
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
@@ -724,11 +717,8 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 		count -= bytes;
 	} while (count != 0);
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
-
 	if (started)
-		return 0;
+		return started;
 	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
@@ -768,7 +758,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	return -EIO;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos,
+				size_t count)
 {
 	ssize_t result = 0;
 	sigset_t oldset;
@@ -791,10 +783,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);
-
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
 	if (!result)
 		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
@@ -830,21 +820,16 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t retval = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
-	/* XXX: temporary */
-	const char __user *buf = iov[0].iov_base;
-	size_t count = iov[0].iov_len;
+	size_t count;
+
+	count = iov_length(iov, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
-	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
+	dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
 		file->f_path.dentry->d_name.name,
-		(unsigned long) count, (long long) pos);
+		count, (long long) pos);
 
-	if (nr_segs != 1)
-		goto out;
-
-	retval = -EFAULT;
-	if (!access_ok(VERIFY_WRITE, buf, count))
-		goto out;
 	retval = 0;
 	if (!count)
 		goto out;
@@ -853,7 +838,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_direct_read(iocb, iov, nr_segs, pos);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -892,17 +877,15 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t retval = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
-	/* XXX: temporary */
-	const char __user *buf = iov[0].iov_base;
-	size_t count = iov[0].iov_len;
+	size_t count;
 
-	dprintk("nfs: direct write(%s/%s, %lu@%Ld)\n",
+	count = iov_length(iov, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
+
+	dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
 		file->f_path.dentry->d_name.name,
-		(unsigned long) count, (long long) pos);
-
-	if (nr_segs != 1)
-		goto out;
+		count, (long long) pos);
 
 	retval = generic_write_checks(file, &pos, &count, 0);
 	if (retval)
@@ -915,15 +898,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!count)
 		goto out;
 
-	retval = -EFAULT;
-	if (!access_ok(VERIFY_READ, buf, count))
-		goto out;
-
 	retval = nfs_sync_mapping(mapping);
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
 
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
-- 
cgit v1.2.3


From b9148c6b80d802dbc2a7530b29915a80432e50c7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Nov 2007 12:16:58 -0500
Subject: NFS: Ensure we return zero if applications attempt to write zero
 bytes

A zero byte count direct write request should be a successful no-op, not an
error.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 88d5d1c7f987..4d726e9db295 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,6 +890,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	retval = generic_write_checks(file, &pos, &count, 0);
 	if (retval)
 		goto out;
+	if (!count)
+		goto out;	/* return 0 */
 
 	retval = -EINVAL;
 	if ((ssize_t) count < 0)
-- 
cgit v1.2.3


From 02fe494619d525ea803ab1f4f671186dc8a52f7a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Nov 2007 12:17:03 -0500
Subject: NFS: Clean up new multi-segment direct I/O changes

Simplify calling sequence of nfs_direct_{read,write}_schedule(), and
rename them to reflect their new role.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4d726e9db295..5e8d82f6666b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -263,10 +263,14 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
  * handled automatically by nfs_direct_read_result().  Otherwise, if
  * no requests have been sent, just return an error.
  */
-static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
+						const struct iovec *iov,
+						loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->path.dentry->d_inode;
+	unsigned long user_addr = (unsigned long)iov->iov_base;
+	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
 	unsigned int pgbase;
 	int result;
@@ -363,9 +367,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_read_schedule(dreq,
-					(unsigned long)vec->iov_base,
-						  vec->iov_len, pos);
+		result = nfs_direct_read_schedule_segment(dreq, vec, pos);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -631,10 +633,14 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
  * handled automatically by nfs_direct_write_result().  Otherwise, if
  * no requests have been sent, just return an error.
  */
-static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
+						 const struct iovec *iov,
+						 loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->path.dentry->d_inode;
+	unsigned long user_addr = (unsigned long)iov->iov_base;
+	size_t count = iov->iov_len;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int pgbase;
 	int result;
@@ -735,10 +741,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_write_schedule(dreq,
-					(unsigned long)vec->iov_base,
-						   vec->iov_len,
-						   pos, sync);
+		result = nfs_direct_write_schedule_segment(dreq, vec,
+							   pos, sync);
 		if (result < 0)
 			break;
 		requested_bytes += result;
-- 
cgit v1.2.3


From e001e796e47d29c470de6c2cd36400e03c66118b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 7 Nov 2007 14:21:45 -0800
Subject: ocfs2: Reset journal parameters after s_mount_opt update

Right now we're just setting them from the existing parameters, not the
new ones that a remount specified.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index be562ac3e89c..5ee775420665 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -438,14 +438,14 @@ unlock_osb:
 	}
 
 	if (!ret) {
-		if (!ocfs2_is_hard_readonly(osb))
-			ocfs2_set_journal_params(osb);
-
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
+
+		if (!ocfs2_is_hard_readonly(osb))
+			ocfs2_set_journal_params(osb);
 	}
 out:
 	return ret;
-- 
cgit v1.2.3


From 2759236f8415ccc0f275b57bd8142c2c81fd2177 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 19 Nov 2007 17:53:34 -0800
Subject: [PATCH] fs/ocfs2: Add missing "space"

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 4 ++--
 fs/ocfs2/inode.c         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 62e4a7daa286..a54d33d95ada 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -908,7 +908,7 @@ lookup:
 		 * but they might own this lockres.  wait on them. */
 		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
 		if (bit < O2NM_MAX_NODES) {
-			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
 			     "recover before lock mastery can begin\n",
 			     dlm->name, namelen, (char *)lockid, bit);
 			wait_on_recovery = 1;
@@ -962,7 +962,7 @@ redo_request:
 		spin_lock(&dlm->spinlock);
 		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
 		if (bit < O2NM_MAX_NODES) {
-			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
+			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
 			     "recover before lock mastery can begin\n",
 			     dlm->name, namelen, (char *)lockid, bit);
 			wait_on_recovery = 1;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 1d5e0cb0fda1..9e3e7df02c7a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -863,7 +863,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 	status = ocfs2_try_open_lock(inode, 1);
 	if (status == -EAGAIN) {
 		status = 0;
-		mlog(0, "Skipping delete of %llu because it is in use on"
+		mlog(0, "Skipping delete of %llu because it is in use on "
 		     "other nodes\n", (unsigned long long)oi->ip_blkno);
 		goto bail;
 	}
-- 
cgit v1.2.3


From ef9f86ceb63f2803c9aada249986b84d2f99c635 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@ca-build8.us.oracle.com>
Date: Mon, 19 Nov 2007 18:31:17 -0800
Subject: ocfs2: Filter -ENOSPC in mlog_errno()

It's almost never worth printing in that situation and we keep forgetting to
manually filter it out.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/masklog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index cd046060114e..597e064bb94f 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -212,7 +212,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_errno(st) do {						\
 	int _st = (st);							\
 	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
-	    _st != AOP_TRUNCATED_PAGE)					\
+	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)		\
 		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
-- 
cgit v1.2.3


From a46043e08f300982c51df317e2f8fb919dedadcd Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@ca-build8.us.oracle.com>
Date: Mon, 19 Nov 2007 18:40:16 -0800
Subject: ocfs2: log valid inode # on bad inode

If the inode block isn't valid, we don't want to print the block number
stored in it. Instead, print the block number which was passed in (which
should always be correct). Also, turn this into a debug print for now - folks
who hit an actual problem always have other logs indicating what the source is.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 9e3e7df02c7a..ebb2bbe30f35 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -455,8 +455,8 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	status = -EINVAL;
 	fe = (struct ocfs2_dinode *) bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
+		     (unsigned long long)args->fi_blkno, 7,
 		     fe->i_signature);
 		goto bail;
 	}
-- 
cgit v1.2.3


From 5a58c3ef22d6e5b84ff651a7d27ae2cbea9f9870 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 13 Nov 2007 19:59:33 +0100
Subject: [PATCH] ocfs2: Remove expensive bitmap scanning

Enable the expensive bitmap scanning only if the OCFS2_DEBUG_FS option is
enabled. The bitmap scanning puts a considerable load on the CPU; on my
machine the write throughput of
dd if=/dev/zero of=/ocfs2/file bs=1M count=500 conv=sync
improves from 37 MB/s to 45.4 MB/s in local mode.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig            | 9 +++++++++
 fs/ocfs2/localalloc.c | 5 +++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 429a00228507..635f3e286ad8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -459,6 +459,15 @@ config OCFS2_DEBUG_MASKLOG
 	  This option will enlarge your kernel, but it allows debugging of
 	  ocfs2 filesystem issues.
 
+config OCFS2_DEBUG_FS
+	bool "OCFS2 expensive checks"
+	depends on OCFS2_FS
+	default n
+	help
+	  This option will enable expensive consistency checks. Enable
+	  this option for debugging only as it is likely to decrease
+	  performance of the filesystem.
+
 config MINIX_FS
 	tristate "Minix fs support"
 	help
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index d272847d5a07..58ea88b5af36 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -484,6 +484,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 
+#ifdef OCFS2_DEBUG_FS
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
 	    ocfs2_local_alloc_count_bits(alloc)) {
 		ocfs2_error(osb->sb, "local alloc inode %llu says it has "
@@ -494,6 +495,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		status = -EIO;
 		goto bail;
 	}
+#endif
 
 	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
 		le32_to_cpu(alloc->id1.bitmap1.i_used);
@@ -712,9 +714,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 	void *bitmap;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	mlog_entry("total = %u, COUNT = %u, used = %u\n",
+	mlog_entry("total = %u, used = %u\n",
 		   le32_to_cpu(alloc->id1.bitmap1.i_total),
-		   ocfs2_local_alloc_count_bits(alloc),
 		   le32_to_cpu(alloc->id1.bitmap1.i_used));
 
 	if (!alloc->id1.bitmap1.i_total) {
-- 
cgit v1.2.3


From bccb9dad8949cd0df1d3d2188a1fdb4b1936d537 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 7 Nov 2007 16:35:14 -0800
Subject: ocfs2: Remove bug statement in ocfs2_dentry_iput()

The existing bug statement didn't take into account unhashed dentries which
might not have a cluster lock on them. This could happen if a node exporting
the file system via NFS is rebooted, re-exported to nfs clients and then
unmounted. It's fine in this case to not have a dentry cluster lock.

Just remove the bug statement and replace it with an error print, which
does the proper checks. Though we want to know if something has happened
which might have prevented a cluster lock from being created, it's
definitely not necessary to panic the machine for this.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dcache.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 1957a5ed219e..9923278ea6d4 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -344,12 +344,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
 	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
 
-	mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED),
-			"dentry: %.*s\n", dentry->d_name.len,
-			dentry->d_name.name);
+	if (!dl) {
+		/*
+		 * No dentry lock is ok if we're disconnected or
+		 * unhashed.
+		 */
+		if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
+		    !d_unhashed(dentry)) {
+			unsigned long long ino = 0ULL;
+			if (inode)
+				ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
+			mlog(ML_ERROR, "Dentry is missing cluster lock. "
+			     "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
+			     ino, dentry->d_flags, dentry->d_name.len,
+			     dentry->d_name.name);
+		}
 
-	if (!dl)
 		goto out;
+	}
 
 	mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
 			dentry->d_name.len, dentry->d_name.name,
-- 
cgit v1.2.3


From 0d8a4e0cd688ad0de6430ce3425c7849cfec1c2d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 20 Nov 2007 11:48:41 -0800
Subject: ocfs2: Fix comparison in ocfs2_size_fits_inline_data()

This was causing us to prematurely push out inline data by one byte.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 556e34ccb005..56f7790cad46 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1514,7 +1514,7 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
 {
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	if (new_size < le16_to_cpu(di->id2.i_data.id_count))
+	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
 		return 1;
 	return 0;
 }
-- 
cgit v1.2.3


From b1967d0eddeef4869ee283e692735cb994f3745a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 20 Nov 2007 11:56:39 -0800
Subject: ocfs2: reverse inline-data truncate args

ocfs2_truncate() and ocfs2_remove_inode_range() had reversed their "set
i_size" arguments to ocfs2_truncate_inline(). Fix things so that truncate
sets i_size, and punching a hole ignores it.

This exposed a problem where punching a hole in an inline-data file wasn't
updating the page cache, so fix that too.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bbac7cd33e0b..b75b2e1f0e42 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -399,7 +399,7 @@ static int ocfs2_truncate_file(struct inode *inode,
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
-					       i_size_read(inode), 0);
+					       i_size_read(inode), 1);
 		if (status)
 			mlog_errno(status);
 
@@ -1521,6 +1521,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct address_space *mapping = inode->i_mapping;
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
@@ -1529,10 +1530,20 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
-					    byte_start + byte_len, 1);
-		if (ret)
+					    byte_start + byte_len, 0);
+		if (ret) {
 			mlog_errno(ret);
-		return ret;
+			goto out;
+		}
+		/*
+		 * There's no need to get fancy with the page cache
+		 * truncate of an inline-data inode. We're talking
+		 * about less than a page here, which will be cached
+		 * in the dinode buffer anyway.
+		 */
+		unmap_mapping_range(mapping, 0, 0, 0);
+		truncate_inode_pages(mapping, 0);
+		goto out;
 	}
 
 	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
-- 
cgit v1.2.3


From c46f739dd39db3b07ab5deb4e3ec81e1c04a91af Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 28 Nov 2007 13:59:18 +0100
Subject: vfs: coredumping fix

fix: http://bugzilla.kernel.org/show_bug.cgi?id=3043

only allow coredumping to the same uid that the coredumping
task runs under.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Alan Cox <alan@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 4ccaaa4b13b2..282240afe99e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1780,6 +1780,12 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	   but keep the previous behaviour for now. */
 	if (!ispipe && !S_ISREG(inode->i_mode))
 		goto close_fail;
+	/*
+	 * Dont allow local users get cute and trick others to coredump
+	 * into their pre-created files:
+	 */
+	if (inode->i_uid != current->fsuid)
+		goto close_fail;
 	if (!file->f_op)
 		goto close_fail;
 	if (!file->f_op->write)
-- 
cgit v1.2.3


From 8118a859dc7abd873193986c77a8d9bdb877adc8 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 21 Nov 2007 14:55:19 -0800
Subject: sysfs: fix off-by-one error in fill_read_buffer()

I found that there is an off-by-one problem in the following code.

Version:	2.6.24-rc2
File:		fs/sysfs/file.c:118-122
Function:	fill_read_buffer
--------------------------------------------------------------------
	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);

	sysfs_put_active_two(attr_sd);

	BUG_ON(count > (ssize_t)PAGE_SIZE);
--------------------------------------------------------------------

According to the sysfs specification and the implementation of the show
methods, a show method returns the number of bytes which would be generated
for the given input, excluding the trailing null.  So if the return value
of a show method equals PAGE_SIZE - 1, the buffer is in fact full.  And if
the return value equals PAGE_SIZE, the resulting string was already
truncated, or a buffer overflow occurred.

This patch fixes an off-by-one error in fill_read_buffer.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <teheo@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 27d1785b7644..4045bdcc4b33 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -119,7 +119,11 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 
 	sysfs_put_active_two(attr_sd);
 
-	BUG_ON(count > (ssize_t)PAGE_SIZE);
+	/*
+	 * The code works fine with PAGE_SIZE return but it's likely to
+	 * indicate truncated result or overflow in normal use cases.
+	 */
+	BUG_ON(count >= (ssize_t)PAGE_SIZE);
 	if (count >= 0) {
 		buffer->needs_read_fill = 0;
 		buffer->count = count;
-- 
cgit v1.2.3


From c2319540cd7330fa9066e5b9b84d357a2c8631a2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@sw.ru>
Date: Wed, 28 Nov 2007 16:21:23 -0800
Subject: proc: fix NULL ->i_fop oops

proc_kill_inodes() can clear ->i_fop in the middle of vfs_readdir resulting in
NULL dereference during "file->f_op->readdir(file, buf, filler)".

The solution is to remove proc_kill_inodes() completely:

a) we don't have tricky modules implementing their tricky readdir hooks which
   could justify keeping this revoke from hell.

b) In a situation when module is gone but PDE still alive, standard
   readdir will return only "." and "..", because pde->next was cleared by
   remove_proc_entry().

c) the race proc_kill_inodes() was meant to prevent is not completely
   fixed, the race window is just made smaller, because vfs_readdir() is run
   without sb_lock held and without file_list_lock held.  Effectively,
   ->i_fop is cleared at a random moment, which can't properly fix anything.

BUG: unable to handle kernel NULL pointer dereference at virtual address 00000018
printing eip: c1061205 *pdpt = 0000000005b22001 *pde = 0000000000000000
Oops: 0000 [#1] PREEMPT SMP
Modules linked in: foo af_packet ipv6 cpufreq_ondemand loop serio_raw sr_mod k8temp cdrom hwmon amd_rng
Pid: 2033, comm: find Not tainted (2.6.24-rc1-b1d08ac064268d0ae2281e98bf5e82627e0f0c56 #2)
EIP: 0060:[<c1061205>] EFLAGS: 00010246 CPU: 0
EIP is at vfs_readdir+0x47/0x74
EAX: c6b6a780 EBX: 00000000 ECX: c1061040 EDX: c5decf94
ESI: c6b6a780 EDI: fffffffe EBP: c9797c54 ESP: c5decf78
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process find (pid: 2033, ti=c5dec000 task=c64bba90 task.ti=c5dec000)
Stack: c5decf94 c1061040 fffffff7 0805ffbc 00000000 c6b6a780 c1061295 0805ffbc
       00000000 00000400 00000000 00000004 0805ffbc 4588eff4 c5dec000 c10026ba
       00000004 0805ffbc 00000400 0805ffbc 4588eff4 bfdc6c70 000000dc 0000007b
Call Trace:
 [<c1061040>] filldir64+0x0/0xc5
 [<c1061295>] sys_getdents64+0x63/0xa5
 [<c10026ba>] sysenter_past_esp+0x5f/0x85
 =======================
Code: 49 83 78 18 00 74 43 8d 6b 74 bf fe ff ff ff 89 e8 e8 b8 c0 12 00 f6 83 2c 01 00 00 10 75 22 8b 5e 10 8b 4c 24 04 89 f0 8b 14 24 <ff> 53 18 f6 46 1a 04 89 c7 75 0b 8b 56 0c 8b 46 08 e8 c8 66 00
EIP: [<c1061205>] vfs_readdir+0x47/0x74 SS:ESP 0068:c5decf78

hch: "Nice, getting rid of this is a very good step forwards.
      Unfortunately we have another copy of this junk in
      security/selinux/selinuxfs.c:sel_remove_entries() which would need the
      same treatment."

Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Acked-by: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c  | 37 -------------------------------------
 fs/proc/internal.h |  2 --
 fs/proc/root.c     |  2 +-
 3 files changed, 1 insertion(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a9806bc21ec3..39f3d6519035 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -555,41 +555,6 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 	return 0;
 }
 
-/*
- * Kill an inode that got unregistered..
- */
-static void proc_kill_inodes(struct proc_dir_entry *de)
-{
-	struct list_head *p;
-	struct super_block *sb;
-
-	/*
-	 * Actually it's a partial revoke().
-	 */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &proc_fs_type.fs_supers, s_instances) {
-		file_list_lock();
-		list_for_each(p, &sb->s_files) {
-			struct file *filp = list_entry(p, struct file,
-							f_u.fu_list);
-			struct dentry *dentry = filp->f_path.dentry;
-			struct inode *inode;
-			const struct file_operations *fops;
-
-			if (dentry->d_op != &proc_dentry_operations)
-				continue;
-			inode = dentry->d_inode;
-			if (PDE(inode) != de)
-				continue;
-			fops = filp->f_op;
-			filp->f_op = NULL;
-			fops_put(fops);
-		}
-		file_list_unlock();
-	}
-	spin_unlock(&sb_lock);
-}
-
 static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
 					  const char *name,
 					  mode_t mode,
@@ -764,8 +729,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 continue_removing:
 		if (S_ISDIR(de->mode))
 			parent->nlink--;
-		if (!S_ISREG(de->mode))
-			proc_kill_inodes(de);
 		de->nlink = 0;
 		WARN_ON(de->subdir);
 		if (!atomic_read(&de->count))
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1b2b6c6bb475..1820eb2ef762 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -78,5 +78,3 @@ static inline int proc_fd(struct inode *inode)
 {
 	return PROC_I(inode)->fd;
 }
-
-extern struct file_system_type proc_fs_type;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 1f86bb860e04..ec9cb3b6c93b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -98,7 +98,7 @@ static void proc_kill_sb(struct super_block *sb)
 	put_pid_ns(ns);
 }
 
-struct file_system_type proc_fs_type = {
+static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.get_sb		= proc_get_sb,
 	.kill_sb	= proc_kill_sb,
-- 
cgit v1.2.3


From 19fd4bb2a0cfede054e4904e0b167e0ca4f36cc7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 28 Nov 2007 16:21:26 -0800
Subject: proc: remove races from proc_pid_readdir()

Oleg noticed that the call of task_pid_nr_ns() in proc_pid_readdir
is racy with respect to tasks exiting.

After a bit of examination it also appears that the call itself
is completely unnecessary.

So to fix the problem this patch modifies next_tgid() to return
both a tgid and the task struct in question.

A structure is introduced to return these values because it is
slightly cleaner and easier to optimize, and the resulting code
is a little shorter.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 51 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a17c26859074..02a63ac04178 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2411,19 +2411,23 @@ out:
  * Find the first task with tgid >= tgid
  *
  */
-static struct task_struct *next_tgid(unsigned int tgid,
-		struct pid_namespace *ns)
-{
+struct tgid_iter {
+	unsigned int tgid;
 	struct task_struct *task;
+};
+static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
+{
 	struct pid *pid;
 
+	if (iter.task)
+		put_task_struct(iter.task);
 	rcu_read_lock();
 retry:
-	task = NULL;
-	pid = find_ge_pid(tgid, ns);
+	iter.task = NULL;
+	pid = find_ge_pid(iter.tgid, ns);
 	if (pid) {
-		tgid = pid_nr_ns(pid, ns) + 1;
-		task = pid_task(pid, PIDTYPE_PID);
+		iter.tgid = pid_nr_ns(pid, ns);
+		iter.task = pid_task(pid, PIDTYPE_PID);
 		/* What we to know is if the pid we have find is the
 		 * pid of a thread_group_leader.  Testing for task
 		 * being a thread_group_leader is the obvious thing
@@ -2436,23 +2440,25 @@ retry:
 		 * found doesn't happen to be a thread group leader.
 		 * As we don't care in the case of readdir.
 		 */
-		if (!task || !has_group_leader_pid(task))
+		if (!iter.task || !has_group_leader_pid(iter.task)) {
+			iter.tgid += 1;
 			goto retry;
-		get_task_struct(task);
+		}
+		get_task_struct(iter.task);
 	}
 	rcu_read_unlock();
-	return task;
+	return iter;
 }
 
 #define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
 
 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct task_struct *task, int tgid)
+	struct tgid_iter iter)
 {
 	char name[PROC_NUMBUF];
-	int len = snprintf(name, sizeof(name), "%d", tgid);
+	int len = snprintf(name, sizeof(name), "%d", iter.tgid);
 	return proc_fill_cache(filp, dirent, filldir, name, len,
-				proc_pid_instantiate, task, NULL);
+				proc_pid_instantiate, iter.task, NULL);
 }
 
 /* for the /proc/ directory itself, after non-process stuff has been done */
@@ -2460,8 +2466,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
 	struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
-	struct task_struct *task;
-	int tgid;
+	struct tgid_iter iter;
 	struct pid_namespace *ns;
 
 	if (!reaper)
@@ -2474,14 +2479,14 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	}
 
 	ns = filp->f_dentry->d_sb->s_fs_info;
-	tgid = filp->f_pos - TGID_OFFSET;
-	for (task = next_tgid(tgid, ns);
-	     task;
-	     put_task_struct(task), task = next_tgid(tgid + 1, ns)) {
-		tgid = task_pid_nr_ns(task, ns);
-		filp->f_pos = tgid + TGID_OFFSET;
-		if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
-			put_task_struct(task);
+	iter.task = NULL;
+	iter.tgid = filp->f_pos - TGID_OFFSET;
+	for (iter = next_tgid(ns, iter);
+	     iter.task;
+	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
+		filp->f_pos = iter.tgid + TGID_OFFSET;
+		if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+			put_task_struct(iter.task);
 			goto out;
 		}
 	}
-- 
cgit v1.2.3


From 6454d1f9038f708d7deef6270ed4ba5bb6e55869 Mon Sep 17 00:00:00 2001
From: Tobias Poschwatta <tp@fonz.de>
Date: Wed, 28 Nov 2007 16:21:45 -0800
Subject: fix up ext2_fs.h for userspace after reservations backport

In commit a686cd898bd999fd026a51e90fb0a3410d258ddb:

 "Val's cross-port of the ext3 reservations code into ext2."

include/linux/ext2_fs.h got a new function whose return value is only
defined if __KERNEL__ is defined. Putting #ifdef __KERNEL__ around the
function seems to help, patch below.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/ext2.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 7730388c4931..c87ae29c19cb 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -178,3 +178,10 @@ extern const struct inode_operations ext2_special_inode_operations;
 /* symlink.c */
 extern const struct inode_operations ext2_fast_symlink_inode_operations;
 extern const struct inode_operations ext2_symlink_inode_operations;
+
+static inline ext2_fsblk_t
+ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
+{
+	return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
+		le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
+}
-- 
cgit v1.2.3


From bcb4be809d2a804ff040d95db4a664113833e702 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 28 Nov 2007 16:21:59 -0800
Subject: fuse: fix reading past EOF

Currently reading a fuse file will stop at cached i_size and return
EOF, even though the file might have grown since the attributes were
last updated.

So detect if trying to read past EOF, and refresh the attributes
before continuing with the read.

Thanks to mpb for the report.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dir.c    | 48 +++++++++++++++++++++++++++++-------------------
 fs/fuse/file.c   | 21 ++++++++++++++++++++-
 fs/fuse/fuse_i.h |  3 +++
 3 files changed, 52 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3763757f9fe7..7b3df35cf196 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -775,6 +775,31 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 	return err;
 }
 
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+			   struct file *file, bool *refreshed)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	int err;
+	bool r;
+
+	if (fi->i_time < get_jiffies_64()) {
+		r = true;
+		err = fuse_do_getattr(inode, stat, file);
+	} else {
+		r = false;
+		err = 0;
+		if (stat) {
+			generic_fillattr(inode, stat);
+			stat->mode = fi->orig_i_mode;
+		}
+	}
+
+	if (refreshed != NULL)
+		*refreshed = r;
+
+	return err;
+}
+
 /*
  * Calling into a user-controlled filesystem gives the filesystem
  * daemon ptrace-like capabilities over the requester process.  This
@@ -862,14 +887,9 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 	 */
 	if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
 	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		if (fi->i_time < get_jiffies_64()) {
-			err = fuse_do_getattr(inode, NULL, NULL);
-			if (err)
-				return err;
-
-			refreshed = true;
-		}
+		err = fuse_update_attributes(inode, NULL, NULL, &refreshed);
+		if (err)
+			return err;
 	}
 
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -1173,22 +1193,12 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
 			struct kstat *stat)
 {
 	struct inode *inode = entry->d_inode;
-	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	int err;
 
 	if (!fuse_allow_task(fc, current))
 		return -EACCES;
 
-	if (fi->i_time < get_jiffies_64())
-		err = fuse_do_getattr(inode, stat, NULL);
-	else {
-		err = 0;
-		generic_fillattr(inode, stat);
-		stat->mode = fi->orig_i_mode;
-	}
-
-	return err;
+	return fuse_update_attributes(inode, stat, NULL, NULL);
 }
 
 static int fuse_setxattr(struct dentry *entry, const char *name,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 535b37399009..474968fbb555 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -453,6 +453,25 @@ out:
 	return err;
 }
 
+static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				  unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
+		int err;
+		/*
+		 * If trying to read past EOF, make sure the i_size
+		 * attribute is up-to-date.
+		 */
+		err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
+		if (err)
+			return err;
+	}
+
+	return generic_file_aio_read(iocb, iov, nr_segs, pos);
+}
+
 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
 			    struct inode *inode, loff_t pos, size_t count,
 			    int writepage)
@@ -887,7 +906,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.aio_read	= generic_file_aio_read,
+	.aio_read	= fuse_file_aio_read,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= fuse_file_mmap,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6c5461de1a5f..19b0129c9811 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -593,3 +593,6 @@ int fuse_valid_type(int m);
 int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task);
 
 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+			   struct file *file, bool *refreshed);
-- 
cgit v1.2.3


From 7dca9fd39f7d4605ac178a67bb1772381056917d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 28 Nov 2007 16:21:59 -0800
Subject: fuse: cleanup: add fuse_get_attr_version()

Extract repeated code into helper function, as suggested by Akpm.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dir.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7b3df35cf196..c84f825b57e7 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -132,6 +132,21 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 	req->out.args[0].value = outarg;
 }
 
+static u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+	u64 curr_version;
+
+	/*
+	 * The spin lock isn't actually needed on 64bit archs, but we
+	 * don't yet care too much about such optimizations.
+	 */
+	spin_lock(&fc->lock);
+	curr_version = fc->attr_version;
+	spin_unlock(&fc->lock);
+
+	return curr_version;
+}
+
 /*
  * Check whether the dentry is still valid
  *
@@ -171,9 +186,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 			return 0;
 		}
 
-		spin_lock(&fc->lock);
-		attr_version = fc->attr_version;
-		spin_unlock(&fc->lock);
+		attr_version = fuse_get_attr_version(fc);
 
 		parent = dget_parent(entry);
 		fuse_lookup_init(req, parent->d_inode, entry, &outarg);
@@ -264,9 +277,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 		return ERR_PTR(PTR_ERR(forget_req));
 	}
 
-	spin_lock(&fc->lock);
-	attr_version = fc->attr_version;
-	spin_unlock(&fc->lock);
+	attr_version = fuse_get_attr_version(fc);
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
@@ -733,9 +744,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	spin_lock(&fc->lock);
-	attr_version = fc->attr_version;
-	spin_unlock(&fc->lock);
+	attr_version = fuse_get_attr_version(fc);
 
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outarg, 0, sizeof(outarg));
-- 
cgit v1.2.3


From a6643094e73247c1ebd36816f494f631fa7be348 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 28 Nov 2007 16:22:00 -0800
Subject: fuse: pass open flags to read and write

Some open flags (O_APPEND, O_DIRECT) can be changed with fcntl(F_SETFL, ...)
after open, but fuse currently only sends the flags to userspace in open.

To make it possible to correctly handle changing flags, send the
current value to userspace in each read and write.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dir.c    |  3 +--
 fs/fuse/file.c   | 26 +++++++++++++++-----------
 fs/fuse/fuse_i.h |  2 +-
 3 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c84f825b57e7..dfc32dc97f7f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -964,7 +964,6 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	struct page *page;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_file *ff = file->private_data;
 	struct fuse_req *req;
 
 	if (is_bad_inode(inode))
@@ -981,7 +980,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	}
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_read_fill(req, ff, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
 	request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 474968fbb555..bb05d227cf30 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -289,14 +289,16 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
 	return fuse_fsync_common(file, de, datasync, 0);
 }
 
-void fuse_read_fill(struct fuse_req *req, struct fuse_file *ff,
+void fuse_read_fill(struct fuse_req *req, struct file *file,
 		    struct inode *inode, loff_t pos, size_t count, int opcode)
 {
 	struct fuse_read_in *inarg = &req->misc.read_in;
+	struct fuse_file *ff = file->private_data;
 
 	inarg->fh = ff->fh;
 	inarg->offset = pos;
 	inarg->size = count;
+	inarg->flags = file->f_flags;
 	req->in.h.opcode = opcode;
 	req->in.h.nodeid = get_node_id(inode);
 	req->in.numargs = 1;
@@ -313,9 +315,8 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
 			     fl_owner_t owner)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_file *ff = file->private_data;
 
-	fuse_read_fill(req, ff, inode, pos, count, FUSE_READ);
+	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	if (owner != NULL) {
 		struct fuse_read_in *inarg = &req->misc.read_in;
 
@@ -376,15 +377,16 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_put_request(fc, req);
 }
 
-static void fuse_send_readpages(struct fuse_req *req, struct fuse_file *ff,
+static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 				struct inode *inode)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
 	req->out.page_zeroing = 1;
-	fuse_read_fill(req, ff, inode, pos, count, FUSE_READ);
+	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	if (fc->async_read) {
+		struct fuse_file *ff = file->private_data;
 		req->ff = fuse_file_get(ff);
 		req->end = fuse_readpages_end;
 		request_send_background(fc, req);
@@ -396,7 +398,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct fuse_file *ff,
 
 struct fuse_fill_data {
 	struct fuse_req *req;
-	struct fuse_file *ff;
+	struct file *file;
 	struct inode *inode;
 };
 
@@ -411,7 +413,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
 	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
 	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
-		fuse_send_readpages(req, data->ff, inode);
+		fuse_send_readpages(req, data->file, inode);
 		data->req = req = fuse_get_req(fc);
 		if (IS_ERR(req)) {
 			unlock_page(page);
@@ -435,7 +437,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	if (is_bad_inode(inode))
 		goto out;
 
-	data.ff = file->private_data;
+	data.file = file;
 	data.inode = inode;
 	data.req = fuse_get_req(fc);
 	err = PTR_ERR(data.req);
@@ -445,7 +447,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
 	if (!err) {
 		if (data.req->num_pages)
-			fuse_send_readpages(data.req, data.ff, inode);
+			fuse_send_readpages(data.req, file, inode);
 		else
 			fuse_put_request(fc, data.req);
 	}
@@ -472,11 +474,12 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
 
-static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
+static void fuse_write_fill(struct fuse_req *req, struct file *file,
 			    struct inode *inode, loff_t pos, size_t count,
 			    int writepage)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
 	struct fuse_write_in *inarg = &req->misc.write.in;
 	struct fuse_write_out *outarg = &req->misc.write.out;
 
@@ -485,6 +488,7 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
 	inarg->offset = pos;
 	inarg->size = count;
 	inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
+	inarg->flags = file->f_flags;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
 	req->in.argpages = 1;
@@ -505,7 +509,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 			      fl_owner_t owner)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	fuse_write_fill(req, file->private_data, inode, pos, count, 0);
+	fuse_write_fill(req, file, inode, pos, count, 0);
 	if (owner != NULL) {
 		struct fuse_write_in *inarg = &req->misc.write.in;
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 19b0129c9811..3ab8a3048e8b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -447,7 +447,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 /**
  * Initialize READ or READDIR request
  */
-void fuse_read_fill(struct fuse_req *req, struct fuse_file *ff,
+void fuse_read_fill(struct fuse_req *req, struct file *file,
 		    struct inode *inode, loff_t pos, size_t count, int opcode);
 
 /**
-- 
cgit v1.2.3


From d0186b25e65d4d786727a03044b8aafe2ba118ee Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 28 Nov 2007 16:22:01 -0800
Subject: fuse: fix FUSE_FILE_OPS sending

FUSE_FILE_OPS is meant to signal that the kernel will send the open file to
the userspace filesystem for operations on open files, so that sillyrenaming
unlinked files becomes unnecessary.

However this needs VFS changes, which won't make it into 2.6.24.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a68d6970845..0cc95eeb9379 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -562,8 +562,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
-	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_FILE_OPS |
-		FUSE_ATOMIC_O_TRUNC;
+	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
-- 
cgit v1.2.3


From fbee36b92abc965d3fc2862aa60fd0dfcf779d0b Mon Sep 17 00:00:00 2001
From: John Muir <muirj@nortel.com>
Date: Wed, 28 Nov 2007 16:22:02 -0800
Subject: fuse: fix uninitialized field in fuse_inode

I found problems accessing (executing) previously existing files, until
I did chmod on them (or setattr).

If the fi->attr_version is not initialized, then it could be
larger than fc->attr_version until a setattr is executed, and as a
result the inode attributes would never be set.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0cc95eeb9379..84f9f7dfdf5b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -56,6 +56,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->i_time = 0;
 	fi->nodeid = 0;
 	fi->nlookup = 0;
+	fi->attr_version = 0;
 	INIT_LIST_HEAD(&fi->write_files);
 	fi->forget_req = fuse_request_alloc();
 	if (!fi->forget_req) {
-- 
cgit v1.2.3


From 08b633070ad5fa17a837428a601c32cf3db6aafd Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 28 Nov 2007 16:22:03 -0800
Subject: fuse: fix attribute caching after rename

Invalidate attributes on rename, since some filesystems may update
st_ctime.  Reported by Szabolcs Szakacsits

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dir.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dfc32dc97f7f..80d2f5292cf9 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -657,6 +657,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
+		/* ctime changes */
+		fuse_invalidate_attr(oldent->d_inode);
+
 		fuse_invalidate_attr(olddir);
 		if (olddir != newdir)
 			fuse_invalidate_attr(newdir);
-- 
cgit v1.2.3


From 81257def2ab8ae1680583ce1e5f018dc6c8ed98d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 28 Nov 2007 16:22:07 -0800
Subject: tty: add the new termios2 ioctls to the compatible list.

Make them depend on TCGETS2.  If that one is implemented the rest should be
there as well.

Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index bd26e4cbb994..e8b7c3a98a54 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1954,6 +1954,12 @@ ULONG_IOCTL(TIOCSCTTY)
 COMPATIBLE_IOCTL(TIOCGPTN)
 COMPATIBLE_IOCTL(TIOCSPTLCK)
 COMPATIBLE_IOCTL(TIOCSERGETLSR)
+#ifdef TCGETS2
+COMPATIBLE_IOCTL(TCGETS2)
+COMPATIBLE_IOCTL(TCSETS2)
+COMPATIBLE_IOCTL(TCSETSW2)
+COMPATIBLE_IOCTL(TCSETSF2)
+#endif
 /* Little f */
 COMPATIBLE_IOCTL(FIOCLEX)
 COMPATIBLE_IOCTL(FIONCLEX)
-- 
cgit v1.2.3


From 2b1e300a9dfc3196ccddf6f1d74b91b7af55e416 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 2 Dec 2007 00:33:17 +1100
Subject: [NETNS]: Fix /proc/net breakage

Well I clearly goofed when I added the initial network namespace support
for /proc/net.  Currently things work but there are odd details visible to
user space, even when we have a single network namespace.

Since we do not cache proc_dir_entry dentries at the moment we can just
modify ->lookup to return a different directory inode depending on the
network namespace of the process looking at /proc/net, replacing the
current technique of using a magic and fragile follow_link method.

To accomplish that this patch:
- introduces a shadow_proc method to allow different dentries to
  be returned from proc_lookup.
- Removes the old /proc/net follow_link magic
- Fixes a weakness in our not caching of proc generic dentries.

As shadow_proc uses a task struct to decide which dentry to return we can
go back later and fix the proc generic caching without modifying any code
that uses the shadow_proc method.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 fs/proc/generic.c  | 12 +++++++-
 fs/proc/proc_net.c | 86 ++++--------------------------------------------------
 2 files changed, 16 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a9806bc21ec3..c2b752341f89 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -374,9 +374,16 @@ static int proc_delete_dentry(struct dentry * dentry)
 	return 1;
 }
 
+static int proc_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
+{
+	d_drop(dentry);
+	return 0;
+}
+
 static struct dentry_operations proc_dentry_operations =
 {
 	.d_delete	= proc_delete_dentry,
+	.d_revalidate	= proc_revalidate_dentry,
 };
 
 /*
@@ -397,8 +404,11 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
 			if (de->namelen != dentry->d_name.len)
 				continue;
 			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
-				unsigned int ino = de->low_ino;
+				unsigned int ino;
 
+				if (de->shadow_proc)
+					de = de->shadow_proc(current, de);
+				ino = de->low_ino;
 				de_get(de);
 				spin_unlock(&proc_subdir_lock);
 				error = -EINVAL;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 131f9c68be5f..0afe21ee0607 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -50,89 +50,14 @@ struct net *get_proc_net(const struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(get_proc_net);
 
-static struct proc_dir_entry *proc_net_shadow;
+static struct proc_dir_entry *shadow_pde;
 
-static struct dentry *proc_net_shadow_dentry(struct dentry *parent,
+static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
 						struct proc_dir_entry *de)
 {
-	struct dentry *shadow = NULL;
-	struct inode *inode;
-	if (!de)
-		goto out;
-	de_get(de);
-	inode = proc_get_inode(parent->d_inode->i_sb, de->low_ino, de);
-	if (!inode)
-		goto out_de_put;
-	shadow = d_alloc_name(parent, de->name);
-	if (!shadow)
-		goto out_iput;
-	shadow->d_op = parent->d_op; /* proc_dentry_operations */
-	d_instantiate(shadow, inode);
-out:
-	return shadow;
-out_iput:
-	iput(inode);
-out_de_put:
-	de_put(de);
-	goto out;
-}
-
-static void *proc_net_follow_link(struct dentry *parent, struct nameidata *nd)
-{
-	struct net *net = current->nsproxy->net_ns;
-	struct dentry *shadow;
-	shadow = proc_net_shadow_dentry(parent, net->proc_net);
-	if (!shadow)
-		return ERR_PTR(-ENOENT);
-
-	dput(nd->dentry);
-	/* My dentry count is 1 and that should be enough as the
-	 * shadow dentry is thrown away immediately.
-	 */
-	nd->dentry = shadow;
-	return NULL;
+	return task->nsproxy->net_ns->proc_net;
 }
 
-static struct dentry *proc_net_lookup(struct inode *dir, struct dentry *dentry,
-				      struct nameidata *nd)
-{
-	struct net *net = current->nsproxy->net_ns;
-	struct dentry *shadow;
-
-	shadow = proc_net_shadow_dentry(nd->dentry, net->proc_net);
-	if (!shadow)
-		return ERR_PTR(-ENOENT);
-
-	dput(nd->dentry);
-	nd->dentry = shadow;
-
-	return shadow->d_inode->i_op->lookup(shadow->d_inode, dentry, nd);
-}
-
-static int proc_net_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-	struct net *net = current->nsproxy->net_ns;
-	struct dentry *shadow;
-	int ret;
-
-	shadow = proc_net_shadow_dentry(dentry->d_parent, net->proc_net);
-	if (!shadow)
-		return -ENOENT;
-	ret = shadow->d_inode->i_op->setattr(shadow, iattr);
-	dput(shadow);
-	return ret;
-}
-
-static const struct file_operations proc_net_dir_operations = {
-	.read			= generic_read_dir,
-};
-
-static struct inode_operations proc_net_dir_inode_operations = {
-	.follow_link	= proc_net_follow_link,
-	.lookup		= proc_net_lookup,
-	.setattr	= proc_net_setattr,
-};
-
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *root, *netd, *net_statd;
@@ -185,9 +110,8 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
 
 int __init proc_net_init(void)
 {
-	proc_net_shadow = proc_mkdir("net", NULL);
-	proc_net_shadow->proc_iops = &proc_net_dir_inode_operations;
-	proc_net_shadow->proc_fops = &proc_net_dir_operations;
+	shadow_pde = proc_mkdir("net", NULL);
+	shadow_pde->shadow_proc = proc_net_shadow;
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }
-- 
cgit v1.2.3


From e136e769d471e7f3d24a8f6bf9c91dcb372bd0ab Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 4 Dec 2007 01:11:09 +0100
Subject: Freezer: Fix JFFS2 garbage collector freezing issue (rev. 2)

Fix breakage caused by commit d5d8c5976d6adeddb8208c240460411e2198b393
"freezer: do not send signals to kernel threads" in
jffs2_garbage_collect_thread() that assumed it would be sent signals
by the freezer.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Pete MacKay <armlinux@architechnical.net>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 fs/jffs2/background.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index d568ae846741..8adebd3e43c6 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -105,7 +105,7 @@ static int jffs2_garbage_collect_thread(void *_c)
 
 		/* Put_super will send a SIGKILL and then wait on the sem.
 		 */
-		while (signal_pending(current)) {
+		while (signal_pending(current) || freezing(current)) {
 			siginfo_t info;
 			unsigned long signr;
 
-- 
cgit v1.2.3


From e00ba3dae077f54cfd2af42e939a618caa7a3bca Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 4 Dec 2007 23:45:02 -0800
Subject: aio: only account I/O wait time in read_events if there are active
 requests

On 2.6.24, top started showing 100% iowait on one CPU when a UML instance was
running (but completely idle).  The UML code sits in io_getevents waiting for
an event to be submitted and completed.

Fix this by checking ctx->reqs_active before scheduling to determine whether
or not we are waiting for I/O.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index f12db415c0f6..9dec7d2d546e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1161,7 +1161,12 @@ retry:
 			ret = 0;
 			if (to.timed_out)	/* Only check after read evt */
 				break;
-			io_schedule();
+			/* Try to only show up in io wait if there are ops
+			 *  in flight */
+			if (ctx->reqs_active)
+				io_schedule();
+			else
+				schedule();
 			if (signal_pending(tsk)) {
 				ret = -EINTR;
 				break;
-- 
cgit v1.2.3


From 0c664f974269bb4c3d38ba900c91a9a5d4cee5b1 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Tue, 4 Dec 2007 23:45:06 -0800
Subject: ufs: fix nextstep dir block size

This patch fixes a regression present since 2.6.16.  The NextStep variant of
UFS, like OpenStep, uses a directory block size of 1024.  Without this
change, ufs_check_page fails in many cases.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Cc: Dave Bailey <dsbailey@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ufs/dir.c   | 2 +-
 fs/ufs/super.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 30f8c2bb0c3e..aaf2878305ce 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -179,7 +179,7 @@ bad_entry:
 	goto fail;
 Eend:
 	p = (struct ufs_dir_entry *)(kaddr + offs);
-	ufs_error (sb, "ext2_check_page",
+	ufs_error(sb, __FUNCTION__,
 		   "entry in directory #%lu spans the page boundary"
 		   "offset=%lu",
 		   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c78c04fd993f..0072cb33ebec 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -755,13 +755,13 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP:
-		/*TODO: check may be we need set special dir block size?*/
 		UFSD("ufstype=nextstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
 		uspi->s_sbsize = super_block_size = 2048;
 		uspi->s_sbbase = 0;
+		uspi->s_dirblksize = 1024;
 		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
 		if (!(sb->s_flags & MS_RDONLY)) {
 			if (!silent)
@@ -771,13 +771,13 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
-		/*TODO: check may be we need set special dir block size?*/
 		UFSD("ufstype=nextstep-cd\n");
 		uspi->s_fsize = block_size = 2048;
 		uspi->s_fmask = ~(2048 - 1);
 		uspi->s_fshift = 11;
 		uspi->s_sbsize = super_block_size = 2048;
 		uspi->s_sbbase = 0;
+		uspi->s_dirblksize = 1024;
 		flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
 		if (!(sb->s_flags & MS_RDONLY)) {
 			if (!silent)
-- 
cgit v1.2.3


From d4beaf4ab5f89496f2bcf67db62ad95d99bfeff6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 4 Dec 2007 23:45:27 -0800
Subject: jbd: Fix assertion failure in fs/jbd/checkpoint.c

Before we start committing a transaction, we call
__journal_clean_checkpoint_list() to cleanup transaction's written-back
buffers.

If this call happens to remove all of them (and there were already some
buffers), __journal_remove_checkpoint() will decide to free the transaction
because it isn't (yet) a committing transaction and soon we fail some
assertion - the transaction really isn't ready to be freed :).

We change the check in __journal_remove_checkpoint() to free only a
transaction in T_FINISHED state.  The locking there is subtle though (as
everywhere in JBD ;().  We use j_list_lock to protect the check and a
subsequent call to __journal_drop_transaction() and do the same in the end
of journal_commit_transaction() which is the only place where a transaction
can get to T_FINISHED state.

Probably I'm too paranoid here and such locking is not really necessary -
checkpoint lists are processed only from log_do_checkpoint() where a
transaction must be already committed to be processed or from
__journal_clean_checkpoint_list() where kjournald itself calls it and thus
transaction cannot change state either.  Better be safe if something
changes in the future...

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/checkpoint.c | 12 ++++++------
 fs/jbd/commit.c     |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 47552d4a6324..0f69c416eebc 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -602,15 +602,15 @@ int __journal_remove_checkpoint(struct journal_head *jh)
 
 	/*
 	 * There is one special case to worry about: if we have just pulled the
-	 * buffer off a committing transaction's forget list, then even if the
-	 * checkpoint list is empty, the transaction obviously cannot be
-	 * dropped!
+	 * buffer off a running or committing transaction's checkpoint list,
+	 * then even if the checkpoint list is empty, the transaction obviously
+	 * cannot be dropped!
 	 *
-	 * The locking here around j_committing_transaction is a bit sleazy.
+	 * The locking here around t_state is a bit sleazy.
 	 * See the comment at the end of journal_commit_transaction().
 	 */
-	if (transaction == journal->j_committing_transaction) {
-		JBUFFER_TRACE(jh, "belongs to committing transaction");
+	if (transaction->t_state != T_FINISHED) {
+		JBUFFER_TRACE(jh, "belongs to running/committing transaction");
 		goto out;
 	}
 
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 8f1f2aa5fb39..610264b99a8e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -858,10 +858,10 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 	/*
-	 * This is a bit sleazy.  We borrow j_list_lock to protect
-	 * journal->j_committing_transaction in __journal_remove_checkpoint.
-	 * Really, __journal_remove_checkpoint should be using j_state_lock but
-	 * it's a bit hassle to hold that across __journal_remove_checkpoint
+	 * This is a bit sleazy.  We use j_list_lock to protect transition
+	 * of a transaction into T_FINISHED state and calling
+	 * __journal_drop_transaction(). Otherwise we could race with
+	 * other checkpointing code processing the transaction...
 	 */
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
-- 
cgit v1.2.3


From 5a622f2d0f86b316b07b55a4866ecb5518dd1cf7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@sw.ru>
Date: Tue, 4 Dec 2007 23:45:28 -0800
Subject: proc: fix proc_dir_entry refcounting

Creating PDEs with refcount 0 and "deleted" flag has problems (see below).
Switch to usual scheme:
* PDE is created with refcount 1
* every de_get does +1
* every de_put() and remove_proc_entry() do -1
* once refcount reaches 0, PDE is freed.

This elegantly fixes at least the following two races (both observed) without
introducing new locks, without abusing old locks, without spreading
lock_kernel():

1) PDE leak

remove_proc_entry			de_put
-----------------			------
			[refcnt = 1]
if (atomic_read(&de->count) == 0)
					if (atomic_dec_and_test(&de->count))
						if (de->deleted)
							/* also not taken! */
							free_proc_entry(de);
else
	de->deleted = 1;
		[refcount=0, deleted=1]

2) use after free

remove_proc_entry			de_put
-----------------			------
			[refcnt = 1]

					if (atomic_dec_and_test(&de->count))
if (atomic_read(&de->count) == 0)
	free_proc_entry(de);
						/* boom! */
						if (de->deleted)
							free_proc_entry(de);

BUG: unable to handle kernel paging request at virtual address 6b6b6b6b
printing eip: c10acdda *pdpt = 00000000338f8001 *pde = 0000000000000000
Oops: 0000 [#1] PREEMPT SMP
Modules linked in: af_packet ipv6 cpufreq_ondemand loop serio_raw psmouse k8temp hwmon sr_mod cdrom
Pid: 23161, comm: cat Not tainted (2.6.24-rc2-8c0863403f109a43d7000b4646da4818220d501f #4)
EIP: 0060:[<c10acdda>] EFLAGS: 00210097 CPU: 1
EIP is at strnlen+0x6/0x18
EAX: 6b6b6b6b EBX: 6b6b6b6b ECX: 6b6b6b6b EDX: fffffffe
ESI: c128fa3b EDI: f380bf34 EBP: ffffffff ESP: f380be44
 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process cat (pid: 23161, ti=f380b000 task=f38f2570 task.ti=f380b000)
Stack: c10ac4f0 00000278 c12ce000 f43cd2a8 00000163 00000000 7da86067 00000400
       c128fa20 00896b18 f38325a8 c128fe20 ffffffff 00000000 c11f291e 00000400
       f75be300 c128fa20 f769c9a0 c10ac779 f380bf34 f7bfee70 c1018e6b f380bf34
Call Trace:
 [<c10ac4f0>] vsnprintf+0x2ad/0x49b
 [<c10ac779>] vscnprintf+0x14/0x1f
 [<c1018e6b>] vprintk+0xc5/0x2f9
 [<c10379f1>] handle_fasteoi_irq+0x0/0xab
 [<c1004f44>] do_IRQ+0x9f/0xb7
 [<c117db3b>] preempt_schedule_irq+0x3f/0x5b
 [<c100264e>] need_resched+0x1f/0x21
 [<c10190ba>] printk+0x1b/0x1f
 [<c107c8ad>] de_put+0x3d/0x50
 [<c107c8f8>] proc_delete_inode+0x38/0x41
 [<c107c8c0>] proc_delete_inode+0x0/0x41
 [<c1066298>] generic_delete_inode+0x5e/0xc6
 [<c1065aa9>] iput+0x60/0x62
 [<c1063c8e>] d_kill+0x2d/0x46
 [<c1063fa9>] dput+0xdc/0xe4
 [<c10571a1>] __fput+0xb0/0xcd
 [<c1054e49>] filp_close+0x48/0x4f
 [<c1055ee9>] sys_close+0x67/0xa5
 [<c10026b6>] sysenter_past_esp+0x5f/0x85
=======================
Code: c9 74 0c f2 ae 74 05 bf 01 00 00 00 4f 89 fa 5f 89 d0 c3 85 c9 57 89 c7 89 d0 74 05 f2 ae 75 01 4f 89 f8 5f c3 89 c1 89 c8 eb 06 <80> 38 00 74 07 40 4a 83 fa ff 75 f4 29 c8 c3 90 90 90 57 83 c9
EIP: [<c10acdda>] strnlen+0x6/0x18 SS:ESP 0068:f380be44

Also, remove broken usage of ->deleted from reiserfs: if sget() succeeds,
module is already pinned and remove_proc_entry() can't happen => nobody
can mark PDE deleted.

Dummy proc root in netns code is not marked with refcount 1. AFAICS, we
never get it, it's just for proper /proc/net removal. I double checked
CLONE_NETNS continues to work.

Patch survives many hours of modprobe/rmmod/cat loops without new bugs
which can be attributed to refcounting.

Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c    | 9 ++-------
 fs/proc/inode.c      | 9 ++-------
 fs/proc/root.c       | 1 +
 fs/reiserfs/procfs.c | 6 ------
 4 files changed, 5 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5fccfe222a63..8d49838e5554 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -595,6 +595,7 @@ static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
 	ent->namelen = len;
 	ent->mode = mode;
 	ent->nlink = nlink;
+	atomic_set(&ent->count, 1);
 	ent->pde_users = 0;
 	spin_lock_init(&ent->pde_unload_lock);
 	ent->pde_unload_completion = NULL;
@@ -692,7 +693,6 @@ void free_proc_entry(struct proc_dir_entry *de)
 
 /*
  * Remove a /proc entry and free it if it's not currently in use.
- * If it is in use, we set the 'deleted' flag.
  */
 void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 {
@@ -741,13 +741,8 @@ continue_removing:
 			parent->nlink--;
 		de->nlink = 0;
 		WARN_ON(de->subdir);
-		if (!atomic_read(&de->count))
+		if (atomic_dec_and_test(&de->count))
 			free_proc_entry(de);
-		else {
-			de->deleted = 1;
-			printk("remove_proc_entry: %s/%s busy, count=%d\n",
-				parent->name, de->name, atomic_read(&de->count));
-		}
 		break;
 	}
 	spin_unlock(&proc_subdir_lock);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index abe6a3f04368..1a551d92e1d8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,13 +43,8 @@ void de_put(struct proc_dir_entry *de)
 			return;
 		}
 
-		if (atomic_dec_and_test(&de->count)) {
-			if (de->deleted) {
-				printk("de_put: deferred delete of %s\n",
-					de->name);
-				free_proc_entry(de);
-			}
-		}		
+		if (atomic_dec_and_test(&de->count))
+			free_proc_entry(de);
 		unlock_kernel();
 	}
 }
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ec9cb3b6c93b..81f99e691f99 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -207,6 +207,7 @@ struct proc_dir_entry proc_root = {
 	.name		= "/proc",
 	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
 	.nlink		= 2, 
+	.count		= ATOMIC_INIT(1),
 	.proc_iops	= &proc_root_inode_operations, 
 	.proc_fops	= &proc_root_operations,
 	.parent		= &proc_root,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9aa7a06e093f..001144621672 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -420,12 +420,6 @@ static void *r_start(struct seq_file *m, loff_t * pos)
 		return NULL;
 
 	up_write(&s->s_umount);
-
-	if (de->deleted) {
-		deactivate_super(s);
-		return NULL;
-	}
-
 	return s;
 }
 
-- 
cgit v1.2.3


From 9b5e6857b3f3acc8ab434e565b7ec87bf9f9b53c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 5 Dec 2007 08:24:38 +0000
Subject: regression: cifs endianness bug

access_flags_to_mode() gets on-the-wire data (little-endian) and treats
it as host-endian.

Introduced in commit e01b64001359034d04c695388870936ed3d1b56b ("[CIFS]
enable get mode from ACL when cifsacl mount option specified")

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cifs/cifsacl.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index f02fdef463a7..c312adcba4fc 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -134,9 +134,10 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
    pmode is the existing mode (we only want to overwrite part of this
    bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007
 */
-static void access_flags_to_mode(__u32 ace_flags, int type, umode_t *pmode,
+static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 				 umode_t *pbits_to_set)
 {
+	__u32 flags = le32_to_cpu(ace_flags);
 	/* the order of ACEs is important.  The canonical order is to begin with
 	   DENY entries followed by ALLOW, otherwise an allow entry could be
 	   encountered first, making the subsequent deny entry like "dead code"
@@ -146,17 +147,17 @@ static void access_flags_to_mode(__u32 ace_flags, int type, umode_t *pmode,
 	/* For deny ACEs we change the mask so that subsequent allow access
 	   control entries do not turn on the bits we are denying */
 	if (type == ACCESS_DENIED) {
-		if (ace_flags & GENERIC_ALL) {
+		if (flags & GENERIC_ALL) {
 			*pbits_to_set &= ~S_IRWXUGO;
 		}
-		if ((ace_flags & GENERIC_WRITE) ||
-			((ace_flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
+		if ((flags & GENERIC_WRITE) ||
+			((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
 			*pbits_to_set &= ~S_IWUGO;
-		if ((ace_flags & GENERIC_READ) ||
-			((ace_flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
+		if ((flags & GENERIC_READ) ||
+			((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
 			*pbits_to_set &= ~S_IRUGO;
-		if ((ace_flags & GENERIC_EXECUTE) ||
-			((ace_flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
+		if ((flags & GENERIC_EXECUTE) ||
+			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 			*pbits_to_set &= ~S_IXUGO;
 		return;
 	} else if (type != ACCESS_ALLOWED) {
@@ -165,25 +166,25 @@ static void access_flags_to_mode(__u32 ace_flags, int type, umode_t *pmode,
 	}
 	/* else ACCESS_ALLOWED type */
 
-	if (ace_flags & GENERIC_ALL) {
+	if (flags & GENERIC_ALL) {
 		*pmode |= (S_IRWXUGO & (*pbits_to_set));
 #ifdef CONFIG_CIFS_DEBUG2
 		cFYI(1, ("all perms"));
 #endif
 		return;
 	}
-	if ((ace_flags & GENERIC_WRITE) ||
-			((ace_flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
+	if ((flags & GENERIC_WRITE) ||
+			((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS))
 		*pmode |= (S_IWUGO & (*pbits_to_set));
-	if ((ace_flags & GENERIC_READ) ||
-			((ace_flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
+	if ((flags & GENERIC_READ) ||
+			((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS))
 		*pmode |= (S_IRUGO & (*pbits_to_set));
-	if ((ace_flags & GENERIC_EXECUTE) ||
-			((ace_flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
+	if ((flags & GENERIC_EXECUTE) ||
+			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 		*pmode |= (S_IXUGO & (*pbits_to_set));
 
 #ifdef CONFIG_CIFS_DEBUG2
-	cFYI(1, ("access flags 0x%x mode now 0x%x", ace_flags, *pmode));
+	cFYI(1, ("access flags 0x%x mode now 0x%x", flags, *pmode));
 #endif
 	return;
 }
-- 
cgit v1.2.3


From 7e46aa5c8cb1347853de9ec86f3fa440f9dc9d77 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 5 Dec 2007 08:32:52 +0000
Subject: regression: bfs endianness bug

BFS_FILEBLOCKS() expects struct bfs_inode * (on-disk data, with little-
endian fields), not struct bfs_inode_info * (in-core stuff, with host-
endian ones).

It's a macro and fields with the right names are present in
bfs_inode_info, so it compiles, but on big-endian host it gives bogus
results.

Introduced in commit f433dc56344cb72cc3de5ba0819021cec3aef807 ("Fixes to
the BFS filesystem driver").

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 294c41baef6e..a64a71d444f5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -178,7 +178,8 @@ static void bfs_delete_inode(struct inode *inode)
 	brelse(bh);
 
         if (bi->i_dsk_ino) {
-		info->si_freeb += BFS_FILEBLOCKS(bi);
+		if (bi->i_sblock)
+			info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
 		info->si_freei++;
 		clear_bit(ino, info->si_imap);
 		dump_imap("delete_inode", s);
-- 
cgit v1.2.3


From 97bd7919e2c1445dabbcc2686795dbb52316b923 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 5 Dec 2007 08:46:47 +0000
Subject: remove nonsense force-casts from ocfs2

endianness annotations in networking code had been in place for quite a
while; in particular, sin_port and s_addr are annotated as big-endian.

Code in ocfs2 had __force casts added apparently to shut the sparse
warnings up; of course, these days they only serve to *produce* warnings
for no reason whatsoever...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/tcp.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d84bd155997b..ee50c9610e7f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,14 +72,6 @@
 
 #include "tcp_internal.h"
 
-/* 
- * The linux network stack isn't sparse endian clean.. It has macros like
- * ntohs() which perform the endian checks and structs like sockaddr_in
- * which aren't annotated.  So __force is found here to get the build
- * clean.  When they emerge from the dark ages and annotate the code
- * we can remove these.
- */
-
 #define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
 			  NIPQUAD(sc->sc_node->nd_ipv4_address),	\
@@ -1500,7 +1492,7 @@ static void o2net_start_connect(struct work_struct *work)
 
 	myaddr.sin_family = AF_INET;
 	myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
-	myaddr.sin_port = (__force u16)htons(0); /* any port */
+	myaddr.sin_port = htons(0); /* any port */
 
 	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
 			      sizeof(myaddr));
@@ -1701,11 +1693,11 @@ static int o2net_accept_one(struct socket *sock)
 	if (ret < 0)
 		goto out;
 
-	node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+	node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
 	if (node == NULL) {
 		mlog(ML_NOTICE, "attempt to connect from unknown node at "
 		     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
-		     ntohs((__force __be16)sin.sin_port));
+		     ntohs(sin.sin_port));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1714,7 +1706,7 @@ static int o2net_accept_one(struct socket *sock)
 		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
 		     "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
 		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-		     ntohs((__force __be16)sin.sin_port), node->nd_num);
+		     ntohs(sin.sin_port), node->nd_num);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1725,7 +1717,7 @@ static int o2net_accept_one(struct socket *sock)
 		mlog(ML_CONN, "attempt to connect from node '%s' at "
 		     "%u.%u.%u.%u:%d but it isn't heartbeating\n",
 		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-		     ntohs((__force __be16)sin.sin_port));
+		     ntohs(sin.sin_port));
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1742,7 +1734,7 @@ static int o2net_accept_one(struct socket *sock)
 		mlog(ML_NOTICE, "attempt to connect from node '%s' at "
 		     "%u.%u.%u.%u:%d but it already has an open connection\n",
 		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
-		     ntohs((__force __be16)sin.sin_port));
+		     ntohs(sin.sin_port));
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 4a6e9e2ce822c9f597b3036887f6cf5fa3a79375 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 30 Aug 2007 16:10:22 -0400
Subject: Use wake_up_locked() in eventpoll

Replace the uses of __wake_up_locked with wake_up_locked

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
---
 fs/eventpoll.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 34f68f3a069a..81c04abfb1aa 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -656,8 +656,7 @@ is_linked:
 	 * wait list.
 	 */
 	if (waitqueue_active(&ep->wq))
-		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-				 TASK_INTERRUPTIBLE);
+		wake_up_locked(&ep->wq);
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
@@ -780,7 +779,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
-			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
+			wake_up_locked(&ep->wq);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -854,8 +853,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
 			/* Notify waiting tasks that events are available */
 			if (waitqueue_active(&ep->wq))
-				__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-						 TASK_INTERRUPTIBLE);
+				wake_up_locked(&ep->wq);
 			if (waitqueue_active(&ep->poll_wait))
 				pwake++;
 		}
@@ -978,8 +976,7 @@ errxit:
 		 * wait list (delayed after we release the lock).
 		 */
 		if (waitqueue_active(&ep->wq))
-			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-					 TASK_INTERRUPTIBLE);
+			wake_up_locked(&ep->wq);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
-- 
cgit v1.2.3


From 1587e2b1880632d959db6ac9e79cb1d99a73c656 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 6 Dec 2007 11:03:36 -0500
Subject: proc/array.c: Use TASK_REPORT

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 fs/proc/array.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65c62e1bfd6f..5be663e5dad1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -141,12 +141,7 @@ static const char *task_state_array[] = {
 
 static inline const char *get_task_state(struct task_struct *tsk)
 {
-	unsigned int state = (tsk->state & (TASK_RUNNING |
-					    TASK_INTERRUPTIBLE |
-					    TASK_UNINTERRUPTIBLE |
-					    TASK_STOPPED |
-					    TASK_TRACED)) |
-					   tsk->exit_state;
+	unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
 	const char **p = &task_state_array[0];
 
 	while (state) {
-- 
cgit v1.2.3


From 6d8982d9b8f4b771754335f1398e406cc72003c3 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 6 Dec 2007 11:04:01 -0500
Subject: proc/base.c: Use task_is_*

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 02a63ac04178..e88ee1a0323a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -199,7 +199,7 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	(task == current || \
 	(task->parent == current && \
 	(task->ptrace & PT_PTRACED) && \
-	 (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
+	 (task_is_stopped_or_traced(task)) && \
 	 security_ptrace(current,task) == 0))
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
-- 
cgit v1.2.3


From da78451190bdaae0e67d6c96b1ec3366abc45474 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <howlett@gmail.com>
Date: Thu, 6 Dec 2007 17:39:54 -0500
Subject: Use mutex_lock_killable in vfs_readdir

Signed-off-by: Liam R. Howlett <howlett@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 fs/readdir.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/readdir.c b/fs/readdir.c
index efe52e676577..4e026e5407fb 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -30,7 +30,10 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
 	if (res)
 		goto out;
 
-	mutex_lock(&inode->i_mutex);
+	res = mutex_lock_killable(&inode->i_mutex);
+	if (res)
+		goto out;
+
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
 		res = file->f_op->readdir(file, buf, filler);
-- 
cgit v1.2.3


From 150030b78a454ba50d5e267b0dcf01b162809192 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 6 Dec 2007 16:24:39 -0500
Subject: NFS: Switch from intr mount option to TASK_KILLABLE

By using the TASK_KILLABLE infrastructure, we can get rid of the 'intr'
mount option.  We have to use _killable everywhere instead of _interruptible
as we get rid of rpc_clnt_sigmask/sigunmask.

Signed-off-by: Liam R. Howlett <howlett@gmail.com>
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 fs/nfs/client.c     |  6 +-----
 fs/nfs/direct.c     | 10 +---------
 fs/nfs/inode.c      |  6 +-----
 fs/nfs/mount_clnt.c |  2 +-
 fs/nfs/nfs3proc.c   |  7 ++-----
 fs/nfs/nfs4proc.c   | 27 +++++++--------------------
 fs/nfs/nfsroot.c    |  3 ---
 fs/nfs/pagelist.c   | 18 +++++-------------
 fs/nfs/read.c       |  5 -----
 fs/nfs/super.c      |  4 ----
 fs/nfs/write.c      |  7 +------
 11 files changed, 19 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 70587f383f10..310fa2f4cbb8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -302,7 +302,7 @@ found_client:
 	if (new)
 		nfs_free_client(new);
 
-	error = wait_event_interruptible(nfs_client_active_wq,
+	error = wait_event_killable(nfs_client_active_wq,
 				clp->cl_cons_state != NFS_CS_INITING);
 	if (error < 0) {
 		nfs_put_client(clp);
@@ -494,10 +494,6 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
 	if (server->flags & NFS_MOUNT_SOFT)
 		server->client->cl_softrtry = 1;
 
-	server->client->cl_intr = 0;
-	if (server->flags & NFS4_MOUNT_INTR)
-		server->client->cl_intr = 1;
-
 	return 0;
 }
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 5e8d82f6666b..7b994b2fa593 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -193,7 +193,7 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
 	if (dreq->iocb)
 		goto out;
 
-	result = wait_for_completion_interruptible(&dreq->completion);
+	result = wait_for_completion_killable(&dreq->completion);
 
 	if (!result)
 		result = dreq->error;
@@ -391,9 +391,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
 	ssize_t result = 0;
-	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 
 	dreq = nfs_direct_req_alloc();
@@ -405,11 +403,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	rpc_clnt_sigmask(clnt, &oldset);
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
-	rpc_clnt_sigunmask(clnt, &oldset);
 	nfs_direct_req_release(dreq);
 
 	return result;
@@ -767,9 +763,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				size_t count)
 {
 	ssize_t result = 0;
-	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	int sync = 0;
@@ -787,11 +781,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	rpc_clnt_sigmask(clnt, &oldset);
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
 	if (!result)
 		result = nfs_direct_wait(dreq);
-	rpc_clnt_sigunmask(clnt, &oldset);
 	nfs_direct_req_release(dreq);
 
 	return result;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index db5d96dc6107..f68c22215b14 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -433,15 +433,11 @@ static int nfs_wait_schedule(void *word)
  */
 static int nfs_wait_on_inode(struct inode *inode)
 {
-	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
-	sigset_t oldmask;
 	int error;
 
-	rpc_clnt_sigmask(clnt, &oldmask);
 	error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
-					nfs_wait_schedule, TASK_INTERRUPTIBLE);
-	rpc_clnt_sigunmask(clnt, &oldmask);
+					nfs_wait_schedule, TASK_KILLABLE);
 
 	return error;
 }
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 8afd9f7e7a97..49c7cd0502cc 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -56,7 +56,7 @@ int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
 		.program	= &mnt_program,
 		.version	= version,
 		.authflavor	= RPC_AUTH_UNIX,
-		.flags		= RPC_CLNT_CREATE_INTR,
+		.flags		= 0,
 	};
 	struct rpc_clnt		*mnt_clnt;
 	int			status;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4cdc2361a669..5ae96340f2c2 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -27,17 +27,14 @@
 static int
 nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 {
-	sigset_t oldset;
 	int res;
-	rpc_clnt_sigmask(clnt, &oldset);
 	do {
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		schedule_timeout_interruptible(NFS_JUKEBOX_RETRY_TIME);
+		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
-	} while (!signalled());
-	rpc_clnt_sigunmask(clnt, &oldset);
+	} while (!fatal_signal_pending(current));
 	return res;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f03d9d5f5ba4..c4faa43b36de 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -316,12 +316,9 @@ static void nfs4_opendata_put(struct nfs4_opendata *p)
 
 static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
 {
-	sigset_t oldset;
 	int ret;
 
-	rpc_clnt_sigmask(task->tk_client, &oldset);
 	ret = rpc_wait_for_completion_task(task);
-	rpc_clnt_sigunmask(task->tk_client, &oldset);
 	return ret;
 }
 
@@ -2806,9 +2803,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 	return 0;
 }
 
-static int nfs4_wait_bit_interruptible(void *word)
+static int nfs4_wait_bit_killable(void *word)
 {
-	if (signal_pending(current))
+	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
 	schedule();
 	return 0;
@@ -2816,18 +2813,14 @@ static int nfs4_wait_bit_interruptible(void *word)
 
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
 {
-	sigset_t oldset;
 	int res;
 
 	might_sleep();
 
 	rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
 
-	rpc_clnt_sigmask(clnt, &oldset);
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
-			nfs4_wait_bit_interruptible,
-			TASK_INTERRUPTIBLE);
-	rpc_clnt_sigunmask(clnt, &oldset);
+			nfs4_wait_bit_killable, TASK_KILLABLE);
 
 	rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
 	return res;
@@ -2835,7 +2828,6 @@ static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
 
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
-	sigset_t oldset;
 	int res = 0;
 
 	might_sleep();
@@ -2844,14 +2836,9 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MIN;
 	if (*timeout > NFS4_POLL_RETRY_MAX)
 		*timeout = NFS4_POLL_RETRY_MAX;
-	rpc_clnt_sigmask(clnt, &oldset);
-	if (clnt->cl_intr) {
-		schedule_timeout_interruptible(*timeout);
-		if (signalled())
-			res = -ERESTARTSYS;
-	} else
-		schedule_timeout_uninterruptible(*timeout);
-	rpc_clnt_sigunmask(clnt, &oldset);
+	schedule_timeout_killable(*timeout);
+	if (fatal_signal_pending(current))
+		res = -ERESTARTSYS;
 	*timeout <<= 1;
 	return res;
 }
@@ -3085,7 +3072,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	schedule_timeout_interruptible(timeout);
+	schedule_timeout_killable(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 4b0334590ee5..531379d36823 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -228,10 +228,7 @@ static int __init root_nfs_parse(char *name, char *buf)
 				nfs_data.flags &= ~NFS_MOUNT_SOFT;
 				break;
 			case Opt_intr:
-				nfs_data.flags |= NFS_MOUNT_INTR;
-				break;
 			case Opt_nointr:
-				nfs_data.flags &= ~NFS_MOUNT_INTR;
 				break;
 			case Opt_posix:
 				nfs_data.flags |= NFS_MOUNT_POSIX;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 345bb9b4765b..2dff469f04fe 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -67,7 +67,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 		if (req != NULL)
 			break;
 
-		if (signalled() && (server->flags & NFS_MOUNT_INTR))
+		if (fatal_signal_pending(current))
 			return ERR_PTR(-ERESTARTSYS);
 		yield();
 	}
@@ -175,11 +175,11 @@ void nfs_release_request(struct nfs_page *req)
 	kref_put(&req->wb_kref, nfs_free_request);
 }
 
-static int nfs_wait_bit_interruptible(void *word)
+static int nfs_wait_bit_killable(void *word)
 {
 	int ret = 0;
 
-	if (signal_pending(current))
+	if (fatal_signal_pending(current))
 		ret = -ERESTARTSYS;
 	else
 		schedule();
@@ -190,26 +190,18 @@ static int nfs_wait_bit_interruptible(void *word)
  * nfs_wait_on_request - Wait for a request to complete.
  * @req: request to wait upon.
  *
- * Interruptible by signals only if mounted with intr flag.
+ * Interruptible by fatal signals only.
  * The user is responsible for holding a count on the request.
  */
 int
 nfs_wait_on_request(struct nfs_page *req)
 {
-	struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->path.dentry->d_inode);
-	sigset_t oldmask;
 	int ret = 0;
 
 	if (!test_bit(PG_BUSY, &req->wb_flags))
 		goto out;
-	/*
-	 * Note: the call to rpc_clnt_sigmask() suffices to ensure that we
-	 *	 are not interrupted if intr flag is not set
-	 */
-	rpc_clnt_sigmask(clnt, &oldmask);
 	ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
-			nfs_wait_bit_interruptible, TASK_INTERRUPTIBLE);
-	rpc_clnt_sigunmask(clnt, &oldmask);
+			nfs_wait_bit_killable, TASK_KILLABLE);
 out:
 	return ret;
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4587a86adaac..3dcaa6a73261 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -212,12 +212,7 @@ nfs_async_read_error(struct list_head *head)
  */
 static void nfs_execute_read(struct nfs_read_data *data)
 {
-	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-	sigset_t oldset;
-
-	rpc_clnt_sigmask(clnt, &oldset);
 	rpc_execute(&data->task);
-	rpc_clnt_sigunmask(clnt, &oldset);
 }
 
 /*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2426e713b77f..5b6339f70a4c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -424,7 +424,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", ",nointr" },
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
@@ -624,10 +623,7 @@ static int nfs_parse_mount_options(char *raw,
 			mnt->flags &= ~NFS_MOUNT_SOFT;
 			break;
 		case Opt_intr:
-			mnt->flags |= NFS_MOUNT_INTR;
-			break;
 		case Opt_nointr:
-			mnt->flags &= ~NFS_MOUNT_INTR;
 			break;
 		case Opt_posix:
 			mnt->flags |= NFS_MOUNT_POSIX;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 51cc1bd6a116..60e3e870ada4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -490,7 +490,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
 /*
  * Wait for a request to complete.
  *
- * Interruptible by signals only if mounted with intr flag.
+ * Interruptible by fatal signals only.
  */
 static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
 {
@@ -816,12 +816,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 
 static void nfs_execute_write(struct nfs_write_data *data)
 {
-	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-	sigset_t oldset;
-
-	rpc_clnt_sigmask(clnt, &oldset);
 	rpc_execute(&data->task);
-	rpc_clnt_sigunmask(clnt, &oldset);
 }
 
 /*
-- 
cgit v1.2.3


From 2dfe485a2c8afa54cb069fcf48476f6c90ea3fdf Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 6 Dec 2007 16:25:30 -0500
Subject: Remove commented-out code copied from NFS

This is a false positive when grepping ... change it to be what the NFS
code looks like now.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
---
 fs/smbfs/request.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index ca4b2d59c0ca..45f45933e862 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -105,7 +105,7 @@ struct smb_request *smb_alloc_request(struct smb_sb_info *server, int bufsize)
                 if (nfs_try_to_free_pages(server))
 			continue;
 
-		if (signalled() && (server->flags & NFS_MOUNT_INTR))
+		if (fatal_signal_pending(current))
 			return ERR_PTR(-ERESTARTSYS);
 		current->policy = SCHED_YIELD;
 		schedule();
-- 
cgit v1.2.3


From d757762bf2f6aea954745c76b4d767067b85be9d Mon Sep 17 00:00:00 2001
From: Donald Douwsma <donaldd@sgi.com>
Date: Fri, 23 Nov 2007 16:27:42 +1100
Subject: [XFS] Fix dbflush panic in xfs_qm_sync.

The recent behaviour layer removal dropped the check for quotas that have
been requested at mount time but have subsequently been turned off. This
results in a panic when accessing m_quotainfo which has been freed.

This patch adds the check originally made by xfs_qm_syncall() to
xfs_qm_sync().

SGI-PV: 969769
SGI-Modid: xfs-linux-melb:xfs-kern:29908a

Signed-off-by: Donald Douwsma <donaldd@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index b5f91281b707..d488645f833d 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1008,6 +1008,9 @@ xfs_qm_sync(
 	boolean_t	nowait;
 	int		error;
 
+	if (! XFS_IS_QUOTA_ON(mp))
+		return 0;
+
 	restarts = 0;
 	/*
 	 * We won't block unless we are asked to.
-- 
cgit v1.2.3


From cd57e594adc624dd9ee4c0ded3949da21ec24b2f Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Fri, 23 Nov 2007 16:30:32 +1100
Subject: [XFS] 971064 Various fixups for xfs_bulkstat().

- sanity check for NULL user buffer in xfs_ioc_bulkstat[_compat]()
- remove the special case for XFS_IOC_FSBULKSTAT with count == 1. This
  special case causes bulkstat to fail because the special case uses
  xfs_bulkstat_single() instead of xfs_bulkstat() and the two functions
  have different semantics.  xfs_bulkstat() will return the next inode
  after the one supplied while skipping internal inodes (ie quota inodes).
  xfs_bulkstat_single() will only look up the inode supplied and return
  an error if it is an internal inode.
- in xfs_bulkstat(), need to initialise 'lastino' to the inode supplied
  so in cases where we return without examining any inodes the scan won't
  restart back at zero.
- sanity check for valid *ubcountp values. Cannot sanity check for valid
  ubuffer here because some users of xfs_bulkstat() don't supply a buffer.
- checks against 'ubleft' (the space left in the user's buffer) should be
  against 'statstruct_size' which is the supplied minimum object size.
  The mixture of checks against statstruct_size and 0 was one of the
  reasons we were skipping inodes.
- if the formatter function returns BULKSTAT_RV_NOTHING and an error and
  the error is not ENOENT or EINVAL then we need to abort the scan. ENOENT
  is for inodes that are no longer valid and we just skip them. EINVAL is
  returned if we try to look up an internal inode so we skip them too. For
  a DMF scan if the inode and DMF attribute cannot fit into the space left
  in the user's buffer it would return ERANGE. We didn't handle this error
  and skipped the inode. We would continue to skip inodes until one fitted
  into the user's buffer or we completed the scan.
- put back the recalculation of agino (that got removed with the last fix)
  at the end of the while loop. This is because the code at the start of
  the loop expects agino to be the last inode examined if it is non-zero.
- if we found some inodes but then encountered an error, return success
  this time and the error next time. If the formatter aborted with ENOMEM
  we will now return this error but only if we couldn't read any inodes.
  Previously if we encountered ENOMEM without reading any inodes we
  returned a zero count and no error which falsely indicated the scan was
  complete.

SGI-PV: 973431
SGI-Modid: xfs-linux-melb:xfs-kern:30089a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: David Chinner <dgc@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c   | 20 ++++++++------------
 fs/xfs/linux-2.6/xfs_ioctl32.c |  3 +++
 fs/xfs/xfs_itable.c            | 43 ++++++++++++++++++++++++++++--------------
 3 files changed, 40 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2b34bad48b07..98a56568bb24 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1047,24 +1047,20 @@ xfs_ioc_bulkstat(
 	if ((count = bulkreq.icount) <= 0)
 		return -XFS_ERROR(EINVAL);
 
+	if (bulkreq.ubuffer == NULL)
+		return -XFS_ERROR(EINVAL);
+
 	if (cmd == XFS_IOC_FSINUMBERS)
 		error = xfs_inumbers(mp, &inlast, &count,
 					bulkreq.ubuffer, xfs_inumbers_fmt);
 	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
 		error = xfs_bulkstat_single(mp, &inlast,
 						bulkreq.ubuffer, &done);
-	else {	/* XFS_IOC_FSBULKSTAT */
-		if (count == 1 && inlast != 0) {
-			inlast++;
-			error = xfs_bulkstat_single(mp, &inlast,
-					bulkreq.ubuffer, &done);
-		} else {
-			error = xfs_bulkstat(mp, &inlast, &count,
-				(bulkstat_one_pf)xfs_bulkstat_one, NULL,
-				sizeof(xfs_bstat_t), bulkreq.ubuffer,
-				BULKSTAT_FG_QUICK, &done);
-		}
-	}
+	else	/* XFS_IOC_FSBULKSTAT */
+		error = xfs_bulkstat(mp, &inlast, &count,
+			(bulkstat_one_pf)xfs_bulkstat_one, NULL,
+			sizeof(xfs_bstat_t), bulkreq.ubuffer,
+			BULKSTAT_FG_QUICK, &done);
 
 	if (error)
 		return -error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0046bdd5b7f1..bf2a956b63c2 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -291,6 +291,9 @@ xfs_ioc_bulkstat_compat(
 	if ((count = bulkreq.icount) <= 0)
 		return -XFS_ERROR(EINVAL);
 
+	if (bulkreq.ubuffer == NULL)
+		return -XFS_ERROR(EINVAL);
+
 	if (cmd == XFS_IOC_FSINUMBERS)
 		error = xfs_inumbers(mp, &inlast, &count,
 				bulkreq.ubuffer, xfs_inumbers_fmt_compat);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 9972992fd3c3..9fc4c2886529 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -316,6 +316,8 @@ xfs_bulkstat_use_dinode(
 	return 1;
 }
 
+#define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
+
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
@@ -353,7 +355,7 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbp;	/* current irec buffer pointer */
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
-	xfs_ino_t		lastino=0; /* last inode number returned */
+	xfs_ino_t		lastino; /* last inode number returned */
 	int			nbcluster; /* # of blocks in a cluster */
 	int			nicluster; /* # of inodes in a cluster */
 	int			nimask;	/* mask for inode clusters */
@@ -373,6 +375,7 @@ xfs_bulkstat(
 	 * Get the last inode value, see if there's nothing to do.
 	 */
 	ino = (xfs_ino_t)*lastinop;
+	lastino = ino;
 	dip = NULL;
 	agno = XFS_INO_TO_AGNO(mp, ino);
 	agino = XFS_INO_TO_AGINO(mp, ino);
@@ -382,6 +385,9 @@ xfs_bulkstat(
 		*ubcountp = 0;
 		return 0;
 	}
+	if (!ubcountp || *ubcountp <= 0) {
+		return EINVAL;
+	}
 	ubcount = *ubcountp; /* statstruct's */
 	ubleft = ubcount * statstruct_size; /* bytes */
 	*ubcountp = ubelem = 0;
@@ -402,7 +408,8 @@ xfs_bulkstat(
 	 * inode returned; 0 means start of the allocation group.
 	 */
 	rval = 0;
-	while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) {
+	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
+		cond_resched();
 		bp = NULL;
 		down_read(&mp->m_peraglock);
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
@@ -499,6 +506,7 @@ xfs_bulkstat(
 					break;
 				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
 							    &tmp);
+				cond_resched();
 			}
 			/*
 			 * If ran off the end of the ag either with an error,
@@ -542,6 +550,7 @@ xfs_bulkstat(
 			 */
 			agino = gino + XFS_INODES_PER_CHUNK;
 			error = xfs_inobt_increment(cur, 0, &tmp);
+			cond_resched();
 		}
 		/*
 		 * Drop the btree buffers and the agi buffer.
@@ -555,12 +564,12 @@ xfs_bulkstat(
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
-		     irbp < irbufend && ubleft >= statstruct_size; irbp++) {
+		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
 			/*
 			 * Now process this chunk of inodes.
 			 */
 			for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
-			     ubleft > 0 &&
+			     XFS_BULKSTAT_UBLEFT(ubleft) &&
 				irbp->ir_freecount < XFS_INODES_PER_CHUNK;
 			     chunkidx++, clustidx++, agino++) {
 				ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
@@ -663,15 +672,13 @@ xfs_bulkstat(
 						ubleft, private_data,
 						bno, &ubused, dip, &fmterror);
 				if (fmterror == BULKSTAT_RV_NOTHING) {
-                                        if (error == EFAULT) {
-                                                ubleft = 0;
-                                                rval = error;
-                                                break;
-                                        }
-					else if (error == ENOMEM)
+					if (error && error != ENOENT &&
+						error != EINVAL) {
 						ubleft = 0;
-					else
-						lastino = ino;
+						rval = error;
+						break;
+					}
+					lastino = ino;
 					continue;
 				}
 				if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -686,6 +693,8 @@ xfs_bulkstat(
 				ubelem++;
 				lastino = ino;
 			}
+
+			cond_resched();
 		}
 
 		if (bp)
@@ -694,11 +703,12 @@ xfs_bulkstat(
 		/*
 		 * Set up for the next loop iteration.
 		 */
-		if (ubleft > 0) {
+		if (XFS_BULKSTAT_UBLEFT(ubleft)) {
 			if (end_of_ag) {
 				agno++;
 				agino = 0;
-			}
+			} else
+				agino = XFS_INO_TO_AGINO(mp, lastino);
 		} else
 			break;
 	}
@@ -707,6 +717,11 @@ xfs_bulkstat(
 	 */
 	kmem_free(irbuf, irbsize);
 	*ubcountp = ubelem;
+	/*
+	 * Found some inodes, return them now and return the error next time.
+	 */
+	if (ubelem)
+		rval = 0;
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
 		 * If we ran out of filesystem, mark lastino as off
-- 
cgit v1.2.3


From d1afb678ce77b930334a8a640a05b8e68178a377 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 27 Nov 2007 17:01:24 +1100
Subject: [XFS] Fixed a few bugs in xfs_buf_associate_memory()

- calculation of 'page_count' was incorrect as it did not
  consider the offset of 'mem' into the first page. The
  logic to bump 'page_count' didn't work if 'len' was <=
  PAGE_CACHE_SIZE (ie offset = 3k, len = 2k).
- setting b_buffer_length to 'len' is incorrect if 'offset'
  is > 0. Set it to the total length of the buffer.
- I suspect that passing a non-aligned address into
  mem_to_page() for the first page may have been causing
  issues - don't know but just tidy up that code anyway.

SGI-PV: 971596
SGI-Modid: xfs-linux-melb:xfs-kern:30143a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/linux-2.6/xfs_buf.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b9c8589e05c2..48bf477cbca5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -725,15 +725,15 @@ xfs_buf_associate_memory(
 {
 	int			rval;
 	int			i = 0;
-	size_t			ptr;
-	size_t			end, end_cur;
-	off_t			offset;
+	unsigned long		pageaddr;
+	unsigned long		offset;
+	size_t			buflen;
 	int			page_count;
 
-	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
-	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
-	if (offset && (len > PAGE_CACHE_SIZE))
-		page_count++;
+	pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
+	offset = (unsigned long)mem - pageaddr;
+	buflen = PAGE_CACHE_ALIGN(len + offset);
+	page_count = buflen >> PAGE_CACHE_SHIFT;
 
 	/* Free any previous set of page pointers */
 	if (bp->b_pages)
@@ -747,22 +747,15 @@ xfs_buf_associate_memory(
 		return rval;
 
 	bp->b_offset = offset;
-	ptr = (size_t) mem & PAGE_CACHE_MASK;
-	end = PAGE_CACHE_ALIGN((size_t) mem + len);
-	end_cur = end;
-	/* set up first page */
-	bp->b_pages[0] = mem_to_page(mem);
-
-	ptr += PAGE_CACHE_SIZE;
-	bp->b_page_count = ++i;
-	while (ptr < end) {
-		bp->b_pages[i] = mem_to_page((void *)ptr);
-		bp->b_page_count = ++i;
-		ptr += PAGE_CACHE_SIZE;
+
+	for (i = 0; i < bp->b_page_count; i++) {
+		bp->b_pages[i] = mem_to_page((void *)pageaddr);
+		pageaddr += PAGE_CACHE_SIZE;
 	}
 	bp->b_locked = 0;
 
-	bp->b_count_desired = bp->b_buffer_length = len;
+	bp->b_count_desired = len;
+	bp->b_buffer_length = buflen;
 	bp->b_flags |= XBF_MAPPED;
 
 	return 0;
-- 
cgit v1.2.3


From 77be55a5a13d9c7ddf780a93861f2fba33f8be1a Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Fri, 23 Nov 2007 16:31:00 +1100
Subject: [XFS] Clear XBF_READ_AHEAD flag on I/O completion.

SGI-PV: 972554
SGI-Modid: xfs-linux-melb:xfs-kern:30128a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 48bf477cbca5..43d6c7a290e2 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1025,7 +1025,7 @@ xfs_buf_ioend(
 	xfs_buf_t		*bp,
 	int			schedule)
 {
-	bp->b_flags &= ~(XBF_READ | XBF_WRITE);
+	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 	if (bp->b_error == 0)
 		bp->b_flags |= XBF_DONE;
 
-- 
cgit v1.2.3


From a7430847fcb19297d6db833f35b9c9645c4a6395 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Fri, 23 Nov 2007 16:30:23 +1100
Subject: [XFS] Fix broken inode cluster setup.

The radix tree based inode caches did away with the inode cluster hashes,
replacing them with a bunch of masking and gang lookups on the radix tree.

This masking got broken when moving the code to per-ag radix trees and
indexing by agino # rather than straight inode number. The result is
clustered inode writeback does not cluster and things can go extremely
slowly when there are lots of inodes to write.

Fix it up by comparing the agino # of the inode we just looked up to the
index of the cluster we are looking for.

Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>

SGI-PV: 972915
SGI-Modid: xfs-linux-melb:xfs-kern:30033a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_iget.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 488836e204a3..fb69ef180b27 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -267,7 +267,7 @@ finish_inode:
 	icl = NULL;
 	if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
 							first_index, 1)) {
-		if ((iq->i_ino & mask) == first_index)
+		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
 			icl = iq->i_cluster;
 	}
 
-- 
cgit v1.2.3


From e89bc612d61edbcefaeb6f2244f86c0f3ec89d23 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 7 Dec 2007 14:07:53 +1100
Subject: [XFS] revert to double-buffering readdir

The current readdir implementation deadlocks on btree buffer locks
because nfsd calls back into ->lookup from the filldir callback. The only
short-term fix for this is to revert to the old inefficient
double-buffering scheme.

SGI-PV: 973377
SGI-Modid: xfs-linux-melb:xfs-kern:30201a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 124 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index fb8dd34041eb..54c564693d93 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -218,6 +218,15 @@ xfs_vm_fault(
 }
 #endif /* CONFIG_XFS_DMAPI */
 
+/*
+ * Unfortunately we can't just use the clean and simple readdir implementation
+ * below, because nfs might call back into ->lookup from the filldir callback
+ * and that will deadlock the low-level btree code.
+ *
+ * Hopefully we'll find a better workaround that allows to use the optimal
+ * version at least for local readdirs for 2.6.25.
+ */
+#if 0
 STATIC int
 xfs_file_readdir(
 	struct file	*filp,
@@ -249,6 +258,121 @@ xfs_file_readdir(
 		return -error;
 	return 0;
 }
+#else
+
+struct hack_dirent {
+	int		namlen;
+	loff_t		offset;
+	u64		ino;
+	unsigned int	d_type;
+	char		name[];
+};
+
+struct hack_callback {
+	char		*dirent;
+	size_t		len;
+	size_t		used;
+};
+
+STATIC int
+xfs_hack_filldir(
+	void		*__buf,
+	const char	*name,
+	int		namlen,
+	loff_t		offset,
+	u64		ino,
+	unsigned int	d_type)
+{
+	struct hack_callback *buf = __buf;
+	struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used);
+
+	if (buf->used + sizeof(struct hack_dirent) + namlen > buf->len)
+		return -EINVAL;
+
+	de->namlen = namlen;
+	de->offset = offset;
+	de->ino = ino;
+	de->d_type = d_type;
+	memcpy(de->name, name, namlen);
+	buf->used += sizeof(struct hack_dirent) + namlen;
+	return 0;
+}
+
+STATIC int
+xfs_file_readdir(
+	struct file	*filp,
+	void		*dirent,
+	filldir_t	filldir)
+{
+	struct inode	*inode = filp->f_path.dentry->d_inode;
+	xfs_inode_t	*ip = XFS_I(inode);
+	struct hack_callback buf;
+	struct hack_dirent *de;
+	int		error;
+	loff_t		size;
+	int		eof = 0;
+	xfs_off_t       start_offset, curr_offset, offset;
+
+	/*
+	 * Try fairly hard to get memory
+	 */
+	buf.len = PAGE_CACHE_SIZE;
+	do {
+		buf.dirent = kmalloc(buf.len, GFP_KERNEL);
+		if (buf.dirent)
+			break;
+		buf.len >>= 1;
+	} while (buf.len >= 1024);
+
+	if (!buf.dirent)
+		return -ENOMEM;
+
+	curr_offset = filp->f_pos;
+	if (curr_offset == 0x7fffffff)
+		offset = 0xffffffff;
+	else
+		offset = filp->f_pos;
+
+	while (!eof) {
+		int reclen;
+		start_offset = offset;
+
+		buf.used = 0;
+		error = -xfs_readdir(ip, &buf, buf.len, &offset,
+				     xfs_hack_filldir);
+		if (error || offset == start_offset) {
+			size = 0;
+			break;
+		}
+
+		size = buf.used;
+		de = (struct hack_dirent *)buf.dirent;
+		while (size > 0) {
+			if (filldir(dirent, de->name, de->namlen,
+					curr_offset & 0x7fffffff,
+					de->ino, de->d_type)) {
+				goto done;
+			}
+
+			reclen = sizeof(struct hack_dirent) + de->namlen;
+			size -= reclen;
+			curr_offset = de->offset /* & 0x7fffffff */;
+			de = (struct hack_dirent *)((char *)de + reclen);
+		}
+	}
+
+ done:
+ 	if (!error) {
+		if (size == 0)
+			filp->f_pos = offset & 0x7fffffff;
+		else if (de)
+			filp->f_pos = curr_offset;
+	}
+
+	kfree(buf.dirent);
+	return error;
+}
+#endif
 
 STATIC int
 xfs_file_mmap(
-- 
cgit v1.2.3


From 978c7b2ff49597ab76ff7529a933bd366941ac25 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 7 Dec 2007 14:09:02 +1100
Subject: [XFS] Make xfsbufd threads freezable

Fix breakage caused by commit 831441862956fffa17b9801db37e6ea1650b0f69
that did not introduce the necessary call to set_freezable() in
xfs/linux-2.6/xfs_buf.c .

SGI-PV: 974224
SGI-Modid: xfs-linux-melb:xfs-kern:30203a

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 43d6c7a290e2..a49dd8d4b069 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1743,6 +1743,8 @@ xfsbufd(
 
 	current->flags |= PF_MEMALLOC;
 
+	set_freezable();
+
 	do {
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-- 
cgit v1.2.3


From cf10e82bdc0d38d09dfaf46d0daf56136138ef3f Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Fri, 7 Dec 2007 14:09:11 +1100
Subject: [XFS] Fix xfs_ichgtime()s broken usage of I_SYNC

The recent I_LOCK->I_SYNC changes mistakenly changed xfs_ichgtime to look
at I_SYNC instead of I_LOCK. This was incorrect and prevents newly created
inodes from moving to the dirty list. Change this to the correct check
which is for I_NEW, not I_LOCK or I_SYNC so that behaviour is correct.

SGI-PV: 974225
SGI-Modid: xfs-linux-melb:xfs-kern:30204a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ac50f8a37582..37e116779eb1 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -117,7 +117,7 @@ xfs_ichgtime(
 	 */
 	SYNCHRONIZE();
 	ip->i_update_core = 1;
-	if (!(inode->i_state & I_SYNC))
+	if (!(inode->i_state & I_NEW))
 		mark_inode_dirty_sync(inode);
 }
 
@@ -169,7 +169,7 @@ xfs_ichgtime_fast(
 	 */
 	SYNCHRONIZE();
 	ip->i_update_core = 1;
-	if (!(inode->i_state & I_SYNC))
+	if (!(inode->i_state & I_NEW))
 		mark_inode_dirty_sync(inode);
 }
 
-- 
cgit v1.2.3


From 3790ee4bd86396558eedd86faac1052cb782e4e1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 10 Dec 2007 15:49:36 -0800
Subject: proc: remove/Fix proc generic d_revalidate

Ultimately to implement /proc perfectly we need an implementation of
d_revalidate because files and directories can be removed behind the back
of the VFS, and d_revalidate is the only way we can let the VFS know that
this has happened.

Unfortunately the linux VFS can not cope with anything in the path to a
mount point going away.  So a proper d_revalidate method that calls d_drop
also needs to call have_submounts which is moderately expensive, so you
really don't want a d_revalidate method that unconditionally calls it, but
instead only calls it when the backing object has really gone away.

proc generic entries only disappear on module_unload (when not counting the
fledgling network namespace) so it is quite rare that we actually encounter
that case and it has not actually caused us real world trouble yet.

So until we get a proper test for keeping dentries in the dcache, fix the
current d_revalidate method by completely removing it.  This returns us to
the current status quo.

So with CONFIG_NETNS=n things should look as they have always looked.

For CONFIG_NETNS=y things work most of the time but there are a few rare
corner cases that don't behave properly.  As the network namespace is
barely present in 2.6.24 this should not be a problem.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "Denis V. Lunev" <den@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8d49838e5554..6a2fe5187b62 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -374,16 +374,9 @@ static int proc_delete_dentry(struct dentry * dentry)
 	return 1;
 }
 
-static int proc_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
-{
-	d_drop(dentry);
-	return 0;
-}
-
 static struct dentry_operations proc_dentry_operations =
 {
 	.d_delete	= proc_delete_dentry,
-	.d_revalidate	= proc_revalidate_dentry,
 };
 
 /*
-- 
cgit v1.2.3


From 4584f520e1f773082ef44ff4f8969a5d992b16ec Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 11 Dec 2007 19:01:45 -0500
Subject: NFS: Fix NFS mountpoint crossing...

The check that was added to nfs_xdev_get_sb() to work around broken
servers, works fine for NFSv2, but causes mountpoint crossing on NFSv3 to
always return ESTALE.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2426e713b77f..ea929207f274 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1475,7 +1475,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
 	}
-	if (mntroot->d_inode->i_op != &nfs_dir_inode_operations) {
+	if (mntroot->d_inode->i_op != server->nfs_client->rpc_ops->dir_inode_ops) {
 		dput(mntroot);
 		error = -ESTALE;
 		goto error_splat_super;
-- 
cgit v1.2.3


From 5cef338b30c110daf547fb13d99f0c77f2a79fbc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 11 Dec 2007 22:01:56 -0500
Subject: NFSv2/v3: Fix a memory leak when using -onolock

Neil Brown said:
> Hi Trond,
>
> We found that a machine which made moderately heavy use of
> 'automount' was leaking some nfs data structures - particularly the
> 4K allocated by rpc_alloc_iostats.
> It turns out that this only happens with filesystems with -onolock
> set.

> The problem is that if NFS_MOUNT_NONLM is set, nfs_start_lockd doesn't
> set server->destroy, so when the filesystem is unmounted, the
> ->client_acl is not shutdown, and so several resources are still
> held.  Multiple mount/umount cycles will slowly eat away memory
> several pages at a time.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: NeilBrown <neilb@suse.de>
---
 fs/nfs/client.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 70587f383f10..a6f625497612 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -410,9 +410,6 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
  */
 static void nfs_destroy_server(struct nfs_server *server)
 {
-	if (!IS_ERR(server->client_acl))
-		rpc_shutdown_client(server->client_acl);
-
 	if (!(server->flags & NFS_MOUNT_NONLM))
 		lockd_down();	/* release rpc.lockd */
 }
@@ -755,6 +752,9 @@ void nfs_free_server(struct nfs_server *server)
 
 	if (server->destroy != NULL)
 		server->destroy(server);
+
+	if (!IS_ERR(server->client_acl))
+		rpc_shutdown_client(server->client_acl);
 	if (!IS_ERR(server->client))
 		rpc_shutdown_client(server->client);
 
-- 
cgit v1.2.3


From a5576cfa5cd8d8aa874bd4ee500dc8a2e7cbad18 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Dec 2007 11:08:33 -0500
Subject: Revert "NFS: Ensure we return zero if applications attempt to write
 zero bytes"

This reverts commit b9148c6b80d802dbc2a7530b29915a80432e50c7.

On Wed, 12 Dec 2007 10:57:30 -0500, Chuck Lever wrote
> commit b9148c6b should be reverted.  It was recently forward-ported
> from some years-old patches, and is clearly not needed now.
>
> On Dec 11, 2007, at 5:21 PM, Adrian Bunk wrote:
>
>> This code became dead after commit
>> b9148c6b80d802dbc2a7530b29915a80432e50c7
>> (which BTW doesn't seem to have changed any behaviour) and can
>> therefore
>> be removed.
>>
>> Spotted by the Coverity checker.
>>
>> Signed-off-by: Adrian Bunk <bunk@kernel.org>
>>
>> ---
>> --- linux-2.6/fs/nfs/direct.c.old     2007-12-02 21:54:53.000000000 +0100
>> +++ linux-2.6/fs/nfs/direct.c 2007-12-02 21:55:10.000000000 +0100
>> @@ -897,15 +897,12 @@ ssize_t nfs_file_direct_write(struct kio
>>       if (!count)
>>               goto out;       /* return 0 */
>>
>>       retval = -EINVAL;
>>       if ((ssize_t) count < 0)
>>               goto out;
>> -     retval = 0;
>> -     if (!count)
>> -             goto out;
>>
>>       retval = nfs_sync_mapping(mapping);
>>       if (retval)
>>               goto out;
>>
>>       retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
>>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 5e8d82f6666b..3c9d16b4f80c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -894,8 +894,6 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	retval = generic_write_checks(file, &pos, &count, 0);
 	if (retval)
 		goto out;
-	if (!count)
-		goto out;	/* return 0 */
 
 	retval = -EINVAL;
 	if ((ssize_t) count < 0)
-- 
cgit v1.2.3


From a10db50a4ae813fcb2f431f2fb039933c109a925 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Dec 2007 11:12:15 -0500
Subject: NFS: Fix an Oops in NFS unmount

Ensure that the dummy 'root dentry' is invisible to d_find_alias(). If not,
then it may be spliced into the tree if a parent directory from the same
filesystem gets mounted at a later time.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/getroot.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 0ee43843f4ec..e6242cdbaf91 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -57,6 +57,17 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 		}
 		/* Circumvent igrab(): we know the inode is not being freed */
 		atomic_inc(&inode->i_count);
+		/*
+		 * Ensure that this dentry is invisible to d_find_alias().
+		 * Otherwise, it may be spliced into the tree by
+		 * d_materialise_unique if a parent directory from the same
+		 * filesystem gets mounted at a later time.
+		 * This again causes shrink_dcache_for_umount_subtree() to
+		 * Oops, since the test for IS_ROOT() will fail.
+		 */
+		spin_lock(&dcache_lock);
+		list_del_init(&sb->s_root->d_alias);
+		spin_unlock(&dcache_lock);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From a86370fbb65a0a2cb21d28bf25a748f6cc04385b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 3 Dec 2007 14:06:23 -0800
Subject: ocfs2: fix exit-while-locked bug in ocfs2_queue_orphans()

We're holding the cluster lock when a failure might happen in
ocfs2_dir_foreach() so it needs to be released.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f9d01e25298d..7e5f7ce4761b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1277,11 +1277,12 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 				   ocfs2_orphan_filldir);
 	if (status) {
 		mlog_errno(status);
-		goto out;
+		goto out_cluster;
 	}
 
 	*head = priv.head;
 
+out_cluster:
 	ocfs2_meta_unlock(orphan_dir_inode, 0);
 out:
 	mutex_unlock(&orphan_dir_inode->i_mutex);
-- 
cgit v1.2.3


From 92295d8054289eff0d52b4d12349f9b9df0f58e4 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 3 Dec 2007 15:02:10 -0800
Subject: ocfs2: Don't panic when truncating an empty extent

This BUG_ON() was unintentionally left in after the sparse file support was
written.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ce62c152823d..97f0db5167c4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6093,8 +6093,6 @@ start:
 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
 
-	BUG_ON(clusters_to_del == 0);
-
 	mutex_lock(&tl_inode->i_mutex);
 	tl_sem = 1;
 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
-- 
cgit v1.2.3


From 0879c584ffcccd50a8d0f72cab3a51702613f901 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 3 Dec 2007 16:42:19 -0800
Subject: ocfs2: Allow for debugging of transaction extends

The nastiest cases of transaction extends are also the rarest. We can expose
them more quickly at the expense of performance by going straight to the
journal_restart() in ocfs2_extend_trans(). Wrap things in OCFS2_DEBUG_FS so
that we only do this when "expensive debugging" is turned on.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7e5f7ce4761b..0e1250c2ef44 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -193,11 +193,15 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 
 	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
 
+#ifdef OCFS2_DEBUG_FS
+	status = 1;
+#else
 	status = journal_extend(handle, nblocks);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
+#endif
 
 	if (status > 0) {
 		mlog(0, "journal_extend failed, trying journal_restart\n");
-- 
cgit v1.2.3


From e8aed3450c0afd6fdb79ec233f806e3e69454dfe Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 3 Dec 2007 16:43:01 -0800
Subject: ocfs2: Re-journal buffers after transaction extend

ocfs2_extend_trans() might call journal_restart() which will commit dirty
buffers and then restart the transaction. This means that any buffers which
still need changes should be passed to journal_access() again. Some paths
during extend weren't doing this right.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c   | 66 +++++++++++++++++++++++++++++++++++++-----------------
 fs/ocfs2/journal.c |  6 +++++
 2 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 97f0db5167c4..23c8cda43f19 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2389,6 +2389,18 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 			goto out;
 		}
 
+		/*
+		 * Caller might still want to make changes to the
+		 * tree root, so re-add it to the journal here.
+		 */
+		ret = ocfs2_journal_access(handle, inode,
+					   path_root_bh(left_path),
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
 						right_path, subtree_root,
 						dealloc, &deleted);
@@ -3289,16 +3301,6 @@ static int ocfs2_insert_path(struct inode *inode,
 	int ret, subtree_index;
 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
 
-	/*
-	 * Pass both paths to the journal. The majority of inserts
-	 * will be touching all components anyway.
-	 */
-	ret = ocfs2_journal_access_path(inode, handle, right_path);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	if (left_path) {
 		int credits = handle->h_buffer_credits;
 
@@ -3323,6 +3325,16 @@ static int ocfs2_insert_path(struct inode *inode,
 		}
 	}
 
+	/*
+	 * Pass both paths to the journal. The majority of inserts
+	 * will be touching all components anyway.
+	 */
+	ret = ocfs2_journal_access_path(inode, handle, right_path);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	if (insert->ins_split != SPLIT_NONE) {
 		/*
 		 * We could call ocfs2_insert_at_leaf() for some types
@@ -3331,6 +3343,17 @@ static int ocfs2_insert_path(struct inode *inode,
 		 */
 		ocfs2_split_record(inode, left_path, right_path,
 				   insert_rec, insert->ins_split);
+
+		/*
+		 * Split might have modified either leaf and we don't
+		 * have a guarantee that the later edge insert will
+		 * dirty this for us.
+		 */
+		if (left_path)
+			ret = ocfs2_journal_dirty(handle,
+						  path_leaf_bh(left_path));
+			if (ret)
+				mlog_errno(ret);
 	} else
 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
 				     insert, inode);
@@ -3430,6 +3453,17 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 			mlog_errno(ret);
 			goto out;
 		}
+
+		/*
+		 * ocfs2_rotate_tree_right() might have extended the
+		 * transaction without re-journaling our tree root.
+		 */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 	} else if (type->ins_appending == APPEND_TAIL
 		   && type->ins_contig != CONTIG_LEFT) {
 		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
@@ -3941,7 +3975,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 {
 	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(path);
-	struct buffer_head *eb_bh, *last_eb_bh = NULL;
+	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
 	struct ocfs2_merge_ctxt ctxt;
 	struct ocfs2_extent_list *rightmost_el;
@@ -3960,14 +3994,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		goto out;
 	}
 
-	eb_bh = path_leaf_bh(path);
-	ret = ocfs2_journal_access(handle, inode, eb_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
 							    split_index,
 							    split_rec);
@@ -4029,8 +4055,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 			mlog_errno(ret);
 	}
 
-	ocfs2_journal_dirty(handle, eb_bh);
-
 out:
 	brelse(last_eb_bh);
 	return ret;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 0e1250c2ef44..8d81f6c1b877 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -174,6 +174,12 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
  * transaction. extend_trans will either extend the current handle by
  * nblocks, or commit it and start a new one with nblocks credits.
  *
+ * This might call journal_restart() which will commit dirty buffers
+ * and then restart the transaction. Before calling
+ * ocfs2_extend_trans(), any changed blocks should have been
+ * dirtied. After calling it, all blocks which need to be changed must
+ * go through another set of journal_access/journal_dirty calls.
+ *
  * WARNING: This will not release any semaphores or disk locks taken
  * during the transaction, so make sure they were taken *before*
  * start_trans or we'll have ordering deadlocks.
-- 
cgit v1.2.3


From 459e216429a04779216b61f0fb61938a459fd1ca Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Dec 2007 16:19:52 -0800
Subject: ecryptfs: initialize new auth_tokens before teardown

ecryptfs_destroy_mount_crypt_stat() checks whether each
auth_tok->global_auth_tok_key is nonzero and if so puts that key.  However,
in some early mount error paths nothing has initialized the pointer, and we
try to key_put() garbage.  Running the bad cipher tests in the testsuite
exposes this, and it's happy with the following change.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 263fed88c0ca..f458c1f35565 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1860,7 +1860,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 	struct ecryptfs_global_auth_tok *new_auth_tok;
 	int rc = 0;
 
-	new_auth_tok = kmem_cache_alloc(ecryptfs_global_auth_tok_cache,
+	new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache,
 					GFP_KERNEL);
 	if (!new_auth_tok) {
 		rc = -ENOMEM;
-- 
cgit v1.2.3


From 9e2de407bec98fb07040f658f55fb71ba1b594f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <Uwe.Kleine-Koenig@digi.com>
Date: Mon, 17 Dec 2007 16:19:54 -0800
Subject: fs/Kconfig: grammar fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was introduced in 4af8e944c22d8af92a7548354a9567250cc1a782

Signed-off-by: Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 635f3e286ad8..487236c65837 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1305,7 +1305,7 @@ config JFFS2_COMPRESSION_OPTIONS
 	help
 	  Enabling this option allows you to explicitly choose which
 	  compression modules, if any, are enabled in JFFS2. Removing
-	  compressors and mean you cannot read existing file systems,
+	  compressors can mean you cannot read existing file systems,
 	  and enabling experimental compressors can mean that you
 	  write a file system which cannot be read by a standard kernel.
 
-- 
cgit v1.2.3


From b47b6f38e5202c924bfe7632dce5dda4e3d40731 Mon Sep 17 00:00:00 2001
From: "Andries E. Brouwer" <Andries.Brouwer@cwi.nl>
Date: Mon, 17 Dec 2007 16:19:55 -0800
Subject: ext3, ext4: avoid divide by zero

As it turns out, the kernel divides by EXT3_INODES_PER_GROUP(s) when
mounting an ext3 filesystem.  If that number is zero, a crash follows.
Below a patch.

This crash was reported by Joeri de Ruiter, Carst Tankink and Pim Vullers.

Cc: <linux-ext4@vger.kernel.org>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 2 +-
 fs/ext4/super.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index de55da9e28ba..cb14de1502c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1676,7 +1676,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-	if (EXT3_INODE_SIZE(sb) == 0)
+	if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
 		goto cantfind_ext3;
 	sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8031dc0e24e5..1ca0f546c466 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1797,7 +1797,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-	if (EXT4_INODE_SIZE(sb) == 0)
+	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0)
-- 
cgit v1.2.3


From 7c9e70efbfc3186674d93451e0fbf18365347b4d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Dec 2007 16:20:07 -0800
Subject: ecryptfs: set s_blocksize from lower fs in sb

eCryptfs wasn't setting s_blocksize in its superblock; just pick it up
from the lower FS.  Having an s_blocksize of 0 made things like "filefrag"
which call FIGETBSZ unhappy.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Mike Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b83a512b7e08..a277754da171 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -523,6 +523,7 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
 	lower_mnt = nd.mnt;
 	ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
 	sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
+	sb->s_blocksize = lower_root->d_sb->s_blocksize;
 	ecryptfs_set_dentry_lower(sb->s_root, lower_root);
 	ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
 	rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0);
-- 
cgit v1.2.3


From 7a3f595cc8298df14a7c71b0d876bafd8e9e1cbf Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Dec 2007 16:20:10 -0800
Subject: ecryptfs: fix fsx data corruption problems

ecryptfs in 2.6.24-rc3 wasn't surviving fsx for me at all, dying after 4
ops.  Generally, it was encountering problems with stale data and improperly
zeroed pages.  An extending truncate + write for example would expose stale
data.

With the changes below I got to a million ops and beyond with all mmap ops
disabled - mmap still needs work.  (A version of this patch on a RHEL5
kernel ran for over 110 million fsx ops)

I added a few comments as well, to the best of my understanding
as I read through the code.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/mmap.c       | 31 ++++++++++++++++++++-----------
 fs/ecryptfs/read_write.c | 27 +++++++++++++++++++++------
 2 files changed, 41 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 16a7a555f392..32c5711d79a3 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -263,14 +263,13 @@ out:
 	return 0;
 }
 
+/* This function must zero any hole we create */
 static int ecryptfs_prepare_write(struct file *file, struct page *page,
 				  unsigned from, unsigned to)
 {
 	int rc = 0;
+	loff_t prev_page_end_size;
 
-	if (from == 0 && to == PAGE_CACHE_SIZE)
-		goto out;	/* If we are writing a full page, it will be
-				   up to date. */
 	if (!PageUptodate(page)) {
 		rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
 						      PAGE_CACHE_SIZE,
@@ -283,22 +282,32 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 		} else
 			SetPageUptodate(page);
 	}
-	if (page->index != 0) {
-		loff_t end_of_prev_pg_pos =
-			(((loff_t)page->index << PAGE_CACHE_SHIFT) - 1);
 
-		if (end_of_prev_pg_pos > i_size_read(page->mapping->host)) {
+	prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT);
+
+	/*
+	 * If creating a page or more of holes, zero them out via truncate.
+	 * Note, this will increase i_size.
+	 */
+	if (page->index != 0) {
+		if (prev_page_end_size > i_size_read(page->mapping->host)) {
 			rc = ecryptfs_truncate(file->f_path.dentry,
-					       end_of_prev_pg_pos);
+					       prev_page_end_size);
 			if (rc) {
 				printk(KERN_ERR "Error on attempt to "
 				       "truncate to (higher) offset [%lld];"
-				       " rc = [%d]\n", end_of_prev_pg_pos, rc);
+				       " rc = [%d]\n", prev_page_end_size, rc);
 				goto out;
 			}
 		}
-		if (end_of_prev_pg_pos + 1 > i_size_read(page->mapping->host))
-			zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+	}
+	/*
+	 * Writing to a new page, and creating a small hole from start of page?
+	 * Zero it out.
+	 */
+	if ((i_size_read(page->mapping->host) == prev_page_end_size) &&
+	    (from != 0)) {
+		zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
 	}
 out:
 	return rc;
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 6b7474a4336a..948f57624c05 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -124,6 +124,10 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 	loff_t pos;
 	int rc = 0;
 
+	/*
+	 * if we are writing beyond current size, then start pos
+	 * at the current size - we'll fill in zeros from there.
+	 */
 	if (offset > ecryptfs_file_size)
 		pos = ecryptfs_file_size;
 	else
@@ -137,6 +141,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 		if (num_bytes > total_remaining_bytes)
 			num_bytes = total_remaining_bytes;
 		if (pos < offset) {
+			/* remaining zeros to write, up to destination offset */
 			size_t total_remaining_zeros = (offset - pos);
 
 			if (num_bytes > total_remaining_zeros)
@@ -167,17 +172,27 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 			}
 		}
 		ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
+
+		/*
+		 * pos: where we're now writing, offset: where the request was
+		 * If current pos is before request, we are filling zeros
+		 * If we are at or beyond request, we are writing the *data*
+		 * If we're in a fresh page beyond eof, zero it in either case
+		 */
+		if (pos < offset || !start_offset_in_page) {
+			/* We are extending past the previous end of the file.
+			 * Fill in zero values to the end of the page */
+			memset(((char *)ecryptfs_page_virt
+				+ start_offset_in_page), 0,
+				PAGE_CACHE_SIZE - start_offset_in_page);
+		}
+
+		/* pos >= offset, we are now writing the data request */
 		if (pos >= offset) {
 			memcpy(((char *)ecryptfs_page_virt
 				+ start_offset_in_page),
 			       (data + data_offset), num_bytes);
 			data_offset += num_bytes;
-		} else {
-			/* We are extending past the previous end of the file.
-			 * Fill in zero values up to the start of where we
-			 * will be writing data. */
-			memset(((char *)ecryptfs_page_virt
-				+ start_offset_in_page), 0, num_bytes);
 		}
 		kunmap_atomic(ecryptfs_page_virt, KM_USER0);
 		flush_dcache_page(ecryptfs_page);
-- 
cgit v1.2.3


From 087ee8d5bec1aa6d0a1dfe3067c7298375462ceb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Dec 2007 16:20:26 -0800
Subject: Fix compilation warning in dquot.c

Fix compilation warning about discarded const.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dquot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 2809768d9c41..686ab63a7c6c 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -965,7 +965,7 @@ err_out:
 }
 #endif
 
-static inline void flush_warnings(struct dquot **dquots, char *warntype)
+static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
 {
 	int i;
 
@@ -1216,7 +1216,7 @@ warn_put_all:
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 			if (inode->i_dquot[cnt])
 				mark_dquot_dirty(inode->i_dquot[cnt]);
-	flush_warnings((struct dquot **)inode->i_dquot, warntype);
+	flush_warnings(inode->i_dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return ret;
 }
-- 
cgit v1.2.3


From c734c79bc397eace039bea406997efa89f879c14 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Tue, 18 Dec 2007 16:17:41 +1100
Subject: [XFS] Don't wait for pending I/Os when purging blocks beyond eof.

On last close of a file we purge blocks beyond eof. The same code is used
when we truncate the file size down. In this case we need to wait for any
pending I/Os for dirty pages beyond the new eof. For the last close case
we are not changing the file size and therefore do not need to wait for
any I/Os to complete. This fixes a performance bottleneck where writes
into the page cache and cache flushes can become mutually exclusive.

SGI-PV: 964002
SGI-Modid: xfs-linux-melb:xfs-kern:30220a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Peter Leckie <pleckie@sgi.com>
---
 fs/xfs/xfs_inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index abf509a88915..344948082819 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1459,8 +1459,10 @@ xfs_itruncate_start(
 	mp = ip->i_mount;
 	vp = XFS_ITOV(ip);
 
-	vn_iowait(ip);  /* wait for the completion of any pending DIOs */
-	
+	/* wait for the completion of any pending DIOs */
+	if (new_size < ip->i_size)
+		vn_iowait(ip);
+
 	/*
 	 * Call toss_pages or flushinval_pages to get rid of pages
 	 * overlapping the region being removed.  We have to use
-- 
cgit v1.2.3


From 041388b54ed95cd169546bd83bacd08ee32bd7ea Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Tue, 18 Dec 2007 16:19:34 +1100
Subject: [XFS] Put the correct offset in dirent d_off

The recent filldir regression fix was not putting the correct d_off in
each dirent. This was resulting in incorrect cookies being passed to dmapi
ioctls and the wrong offset appearing in the dirents. readdir was
unaffected as the filp->f_pos was being updated with the correct offset
and this was being written into the last dirent in each buffer. Fix the
XFS code to do the right thing.

SGI-PV: 973746
SGI-Modid: xfs-linux-melb:xfs-kern:30240a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 4 ++--
 fs/xfs/xfs_dir2_block.c     | 6 ++----
 fs/xfs/xfs_dir2_leaf.c      | 2 +-
 fs/xfs/xfs_dir2_sf.c        | 9 +++------
 4 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 54c564693d93..e1fcef2eb928 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -356,13 +356,13 @@ xfs_file_readdir(
 
 			reclen = sizeof(struct hack_dirent) + de->namlen;
 			size -= reclen;
-			curr_offset = de->offset /* & 0x7fffffff */;
 			de = (struct hack_dirent *)((char *)de + reclen);
+			curr_offset = de->offset /* & 0x7fffffff */;
 		}
 	}
 
  done:
- 	if (!error) {
+	if (!error) {
 		if (size == 0)
 			filp->f_pos = offset & 0x7fffffff;
 		else if (de)
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index c171767e242a..a5f4f4fb8868 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -508,7 +508,7 @@ xfs_dir2_block_getdents(
 			continue;
 
 		cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						    ptr - (char *)block);
+					    (char *)dep - (char *)block);
 		ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
@@ -519,9 +519,7 @@ xfs_dir2_block_getdents(
 		 */
 		if (filldir(dirent, dep->name, dep->namelen, cook,
 			    ino, DT_UNKNOWN)) {
-			*offset = xfs_dir2_db_off_to_dataptr(mp,
-					mp->m_dirdatablk,
-					(char *)dep - (char *)block);
+			*offset = cook;
 			xfs_da_brelse(NULL, bp);
 			return 0;
 		}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e7c12fa1303e..0ca0020ba09f 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1091,7 +1091,7 @@ xfs_dir2_leaf_getdents(
 		 * Won't fit.  Return to caller.
 		 */
 		if (filldir(dirent, dep->name, dep->namelen,
-			    xfs_dir2_byte_to_dataptr(mp, curoff + length),
+			    xfs_dir2_byte_to_dataptr(mp, curoff),
 			    ino, DT_UNKNOWN))
 			break;
 
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 182c70315ad1..919d275a1cef 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,7 +752,7 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
 #endif
-		if (filldir(dirent, ".", 1, dotdot_offset, ino, DT_DIR)) {
+		if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) {
 			*offset = dot_offset;
 			return 0;
 		}
@@ -762,13 +762,11 @@ xfs_dir2_sf_getdents(
 	 * Put .. entry unless we're starting past it.
 	 */
 	if (*offset <= dotdot_offset) {
-		off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						  XFS_DIR2_DATA_FIRST_OFFSET);
 		ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
 #endif
-		if (filldir(dirent, "..", 2, off, ino, DT_DIR)) {
+		if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) {
 			*offset = dotdot_offset;
 			return 0;
 		}
@@ -793,8 +791,7 @@ xfs_dir2_sf_getdents(
 #endif
 
 		if (filldir(dirent, sfep->name, sfep->namelen,
-			    off + xfs_dir2_data_entsize(sfep->namelen),
-			    ino, DT_UNKNOWN)) {
+					    off, ino, DT_UNKNOWN)) {
 			*offset = off;
 			return 0;
 		}
-- 
cgit v1.2.3


From 3c378158d4cd2125b42fe2b8bb23d512fdff6fe6 Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Thu, 20 Dec 2007 11:47:07 +0300
Subject: mm: fix exit_mmap BUG() on a.out binary exit

The problem was introduced by commit "mm: variable length argument
support" (b6a2fea39318e43fee84fa7b0b90d68bed92d2ba)
as it didn't update fs/binfmt_aout.c like other binfmt's.

I noticed that on alpha when I accidentally launched an old OSF/1
Acrobat Reader binary. Obviously, other architectures are affected
as well.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Ollie Wild <aaw@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_aout.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index e176d195e7e5..7596e1e94cde 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -319,7 +319,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 #ifdef __sparc__
-- 
cgit v1.2.3


From bad60fdd14df32459e31cc75ab681e4458bf25cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 21 Dec 2007 10:58:56 +1100
Subject: [XFS] Fix mknod regression

This was broken by my '[XFS] simplify xfs_create/mknod/symlink prototype',
which assigned the re-shuffled ondisk dev_t back to the rdev variable in
xfs_vn_mknod. Because of that i_rdev is set to the ondisk dev_t instead of
the linux dev_t later down the function.

Fortunately the fix for it is trivial: we can just remove the assignment
because xfs_revalidate_inode has done the proper job before unlocking the
inode.

SGI-PV: 974873
SGI-Modid: xfs-linux-melb:xfs-kern:30273a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 37e116779eb1..5e8bb7f71b5a 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -332,9 +332,7 @@ xfs_vn_mknod(
 		ASSERT(vp);
 		ip = vn_to_inode(vp);
 
-		if (S_ISCHR(mode) || S_ISBLK(mode))
-			ip->i_rdev = rdev;
-		else if (S_ISDIR(mode))
+		if (S_ISDIR(mode))
 			xfs_validate_fields(ip);
 		d_instantiate(dentry, ip);
 		xfs_validate_fields(dir);
-- 
cgit v1.2.3


From 4743e0ec1217fd00f57461ebdd7979d31af18700 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com>
Date: Fri, 21 Dec 2007 11:00:23 +1100
Subject: [XFS] Initialise current offset in xfs_file_readdir correctly

After reading the directory contents into the temporary buffer, we grab
each dirent and pass it to filldir with the current offset of the dirent.
The current offset was not being set for the first dirent in the temporary
buffer, which could result in bad offsets being set in the f_pos field,
resulting in looping and duplicate entries being returned from readdir.

SGI-PV: 974905
SGI-Modid: xfs-linux-melb:xfs-kern:30282a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e1fcef2eb928..4847eb83fc18 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -347,6 +347,7 @@ xfs_file_readdir(
 
 		size = buf.used;
 		de = (struct hack_dirent *)buf.dirent;
+		curr_offset = de->offset /* & 0x7fffffff */;
 		while (size > 0) {
 			if (filldir(dirent, de->name, de->namlen,
 					curr_offset & 0x7fffffff,
-- 
cgit v1.2.3


From b88629060b03adc58639f818fe0968bf5fe81b5d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 22 Dec 2007 14:03:24 -0800
Subject: ecryptfs: fix string overflow on long cipher names

Passing a cipher name > 32 chars on mount results in an overflow when the
cipher name is printed, because the last character in the struct
ecryptfs_key_tfm's cipher_name string was never zeroed.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index bbed2fd40fdc..67e8b16f7bcb 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1847,6 +1847,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
 	mutex_init(&tmp_tfm->key_tfm_mutex);
 	strncpy(tmp_tfm->cipher_name, cipher_name,
 		ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+	tmp_tfm->cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
 	tmp_tfm->key_size = key_size;
 	rc = ecryptfs_process_key_cipher(&tmp_tfm->key_tfm,
 					 tmp_tfm->cipher_name,
-- 
cgit v1.2.3


From 22dd483721939b4ea22d5d3925e69112f63c42bc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 22 Dec 2007 14:03:25 -0800
Subject: Fix computation of SKB size for quota messages

Fix computation of size of skb needed for quota message.  We should use
netlink provided functions and not just an ad-hoc number.  Also don't print
the return value from nla_put_foo() as it is always -1.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dquot.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 686ab63a7c6c..b2592abaa713 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -895,9 +895,6 @@ out_lock:
 
 #ifdef CONFIG_QUOTA_NETLINK_INTERFACE
 
-/* Size of quota netlink message - actually an upperbound for buffer size */
-#define QUOTA_NL_MSG_SIZE 32
-
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
 	.id = GENL_ID_GENERATE,
@@ -914,11 +911,13 @@ static void send_warning(const struct dquot *dquot, const char warntype)
 	struct sk_buff *skb;
 	void *msg_head;
 	int ret;
+	int msg_size = 4 * nla_total_size(sizeof(u32)) +
+		       2 * nla_total_size(sizeof(u64));
 
 	/* We have to allocate using GFP_NOFS as we are called from a
 	 * filesystem performing write and thus further recursion into
 	 * the fs to free some data could cause deadlocks. */
-	skb = genlmsg_new(QUOTA_NL_MSG_SIZE, GFP_NOFS);
+	skb = genlmsg_new(msg_size, GFP_NOFS);
 	if (!skb) {
 		printk(KERN_ERR
 		  "VFS: Not enough memory to send quota warning.\n");
@@ -959,7 +958,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
 			"VFS: Failed to send notification message: %d\n", ret);
 	return;
 attr_err_out:
-	printk(KERN_ERR "VFS: Failed to compose quota message: %d\n", ret);
+	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
 err_out:
 	kfree_skb(skb);
 }
-- 
cgit v1.2.3


From c525460e2754dbb33abe2b37d3d941126b2ea830 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 22 Dec 2007 14:03:25 -0800
Subject: Don't send quota messages repeatedly when hardlimit reached

We should send quota message to netlink only once when hardlimit is
reached.  Otherwise user could easily make the system busy by trying to
exceed the hardlimit (and also the messages could be annoying if you cannot
stop writing just now).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dquot.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index b2592abaa713..cee7c6f428f0 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -827,6 +827,18 @@ static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 	clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
 
+static int warning_issued(struct dquot *dquot, const int warntype)
+{
+	int flag = (warntype == QUOTA_NL_BHARDWARN ||
+		warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
+		((warntype == QUOTA_NL_IHARDWARN ||
+		warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
+
+	if (!flag)
+		return 0;
+	return test_and_set_bit(flag, &dquot->dq_flags);
+}
+
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 static int flag_print_warnings = 1;
 
@@ -845,16 +857,12 @@ static inline int need_print_warning(struct dquot *dquot)
 }
 
 /* Print warning to user which exceeded quota */
-static void print_warning(struct dquot *dquot, const char warntype)
+static void print_warning(struct dquot *dquot, const int warntype)
 {
 	char *msg = NULL;
 	struct tty_struct *tty;
-	int flag = (warntype == QUOTA_NL_BHARDWARN ||
-		warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B :
-		((warntype == QUOTA_NL_IHARDWARN ||
-		warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0);
 
-	if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
+	if (!need_print_warning(dquot))
 		return;
 
 	mutex_lock(&tty_mutex);
@@ -969,7 +977,8 @@ static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
 	int i;
 
 	for (i = 0; i < MAXQUOTAS; i++)
-		if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN) {
+		if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN &&
+		    !warning_issued(dquots[i], warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 			print_warning(dquots[i], warntype[i]);
 #endif
-- 
cgit v1.2.3


From c8161f64ccdcc3ac05c7bbfebc031e7ad5ca6412 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 22 Dec 2007 14:03:26 -0800
Subject: ecryptfs: fix unlocking in error paths

Thanks to Josef Bacik for finding these.

A couple of ecryptfs error paths don't properly unlock things they locked.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Josef Bacik <jbacik@redhat.com>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c    | 8 ++++----
 fs/ecryptfs/messaging.c | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 67e8b16f7bcb..f8ef0af919e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -799,7 +799,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 	rc = ecryptfs_crypto_api_algify_cipher_name(&full_alg_name,
 						    crypt_stat->cipher, "cbc");
 	if (rc)
-		goto out;
+		goto out_unlock;
 	crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0,
 						 CRYPTO_ALG_ASYNC);
 	kfree(full_alg_name);
@@ -808,12 +808,12 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 		ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
 				"Error initializing cipher [%s]\n",
 				crypt_stat->cipher);
-		mutex_unlock(&crypt_stat->cs_tfm_mutex);
-		goto out;
+		goto out_unlock;
 	}
 	crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
-	mutex_unlock(&crypt_stat->cs_tfm_mutex);
 	rc = 0;
+out_unlock:
+	mutex_unlock(&crypt_stat->cs_tfm_mutex);
 out:
 	return rc;
 }
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index a96d341d154d..9cc2aec27b0d 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -427,6 +427,7 @@ int ecryptfs_init_messaging(unsigned int transport)
 	if (!ecryptfs_daemon_id_hash) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n");
+		mutex_unlock(&ecryptfs_daemon_id_hash_mux);
 		goto out;
 	}
 	for (i = 0; i < ecryptfs_hash_buckets; i++)
-- 
cgit v1.2.3


From 16317ec2e5a85884fea680d24c1b228a5602159f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 22 Dec 2007 14:03:26 -0800
Subject: ecryptfs: redo dget,mntget on dentry_open failure

Thanks to Jeff Moyer for pointing this out.

If the RDWR dentry_open() in ecryptfs_init_persistent_file fails,
it will do a dput/mntput.  Need to re-take references if we
retry as RDONLY.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Mike Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a277754da171..e5580bcb923a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -138,11 +138,14 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 		inode_info->lower_file = dentry_open(lower_dentry,
 						     lower_mnt,
 						     (O_RDWR | O_LARGEFILE));
-		if (IS_ERR(inode_info->lower_file))
+		if (IS_ERR(inode_info->lower_file)) {
+			dget(lower_dentry);
+			mntget(lower_mnt);
 			inode_info->lower_file = dentry_open(lower_dentry,
 							     lower_mnt,
 							     (O_RDONLY
 							      | O_LARGEFILE));
+		}
 		if (IS_ERR(inode_info->lower_file)) {
 			printk(KERN_ERR "Error opening lower persistent file "
 			       "for lower_dentry [0x%p] and lower_mnt [0x%p]\n",
-- 
cgit v1.2.3


From dae5dbdbd786798ad2249e54df1156d524da30aa Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 30 Dec 2007 23:49:57 +0000
Subject: [CIFS] fix SetEA failure to some Samba versions

Thanks to Oleg Gvozdev for noticing the problem.

CC: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   | 2 +-
 fs/cifs/cifssmb.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index a609599287aa..13d788f9e5f0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,7 @@ Version 1.52
 Fix oops on second mount to server when null auth is used.
 Enable experimental Kerberos support.  Return writebehind errors on flush
 and sync so that events like out of disk space get reported properly on
-cached files.
+cached files. Fix setxattr failure to certain Samba versions.
 
 Version 1.51
 ------------
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9e8a6bef029a..618542b8ce0b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5499,7 +5499,7 @@ SetEARetry:
 	else
 		name_len = strnlen(ea_name, 255);
 
-	count = sizeof(*parm_data) + ea_value_len + name_len + 1;
+	count = sizeof(*parm_data) + ea_value_len + name_len;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	pSMB->MaxDataCount = cpu_to_le16(1000);	/* BB find max SMB size from sess */
 	pSMB->MaxSetupCount = 0;
-- 
cgit v1.2.3


From 05b3de63da2abe804f5dbe0174298bf48949079f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Dec 2007 00:51:45 +0000
Subject: [CIFS] Only dump SPNEGO key if CONFIG_CIFS_DEBUG2 is set

The SPNEGO key data is not terribly interesting except in certain
debugging situations. Only dump it to the ring buffer if needed.

Signed-off-by: Jeff Layton <jlayton@tupile.poochiereds.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 1529d2b12e9c..d543accc10dd 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -122,11 +122,13 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	cFYI(1, ("key description = %s", description));
 	spnego_key = request_key(&cifs_spnego_key_type, description, "");
 
+#ifdef CONFIG_CIFS_DEBUG2
 	if (cifsFYI && !IS_ERR(spnego_key)) {
 		struct cifs_spnego_msg *msg = spnego_key->payload.data;
-		cifs_dump_mem("SPNEGO reply blob:", msg->data,
-				msg->secblob_len + msg->sesskey_len);
+		cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024,
+				msg->secblob_len + msg->sesskey_len));
 	}
+#endif /* CONFIG_CIFS_DEBUG2 */
 
 out:
 	kfree(description);
-- 
cgit v1.2.3


From 1d9a8852c365fb7f8db0f8364210138985f457b8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Dec 2007 01:37:11 +0000
Subject: [CIFS] redo existing session setup if needed in cifs_mount

When cifs_mount finds an existing SMB session that it can use for a new
mount, it does not check to see whether that session is in need of being
reconnected. An easy way to reproduce:

1) mount //server/share1
2) watch /proc/fs/cifs/DebugData for the share to go DISCONNECTED
3) mount //server/share2 with same creds as in step 1.

The second mount will fail because CIFSTCon returned -EAGAIN. If you do
an operation in share1 and then reattempt the mount it will work (since
the session is reestablished).

The following patch fixes this by having cifs_mount check the status
of the session when it picks an existing session and calling
cifs_setup_session on it again if it's in need of reconnection.

Thanks to Wojciech Pilorz for the initial bug report.

Signed-off-by: Jeff Layton <jlayton@tupile.poochiereds.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  3 ++-
 fs/cifs/connect.c | 10 +++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 13d788f9e5f0..0c778765bd79 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,8 @@ Version 1.52
 Fix oops on second mount to server when null auth is used.
 Enable experimental Kerberos support.  Return writebehind errors on flush
 and sync so that events like out of disk space get reported properly on
-cached files. Fix setxattr failure to certain Samba versions.
+cached files. Fix setxattr failure to certain Samba versions. Fix mount
+of second share to disconnected server session (autoreconnect on this).
 
 Version 1.51
 ------------
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index fd9147cdb5a9..658f58b99e6f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1964,7 +1964,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	if (existingCifsSes) {
 		pSesInfo = existingCifsSes;
-		cFYI(1, ("Existing smb sess found"));
+		cFYI(1, ("Existing smb sess found (status=%d)",
+			pSesInfo->status));
+		if (pSesInfo->status == CifsNeedReconnect) {
+			cFYI(1, ("Session needs reconnect"));
+			down(&pSesInfo->sesSem);
+			rc = cifs_setup_session(xid, pSesInfo,
+						cifs_sb->local_nls);
+			up(&pSesInfo->sesSem);
+		}
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
-- 
cgit v1.2.3


From 1a67570c76402b36695cd0725e28649ee8fe830d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Dec 2007 04:03:02 +0000
Subject: [CIFS]  use krb5 session key from first SMB session after a NegProt

Currently, any new kerberos SMB session overwrites the server's session
key. The session key should only be set by the first SMB session set up
on the socket.

Signed-off-by: Jeff Layton <jlayton@tupile.poochiereds.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d0cb469daab7..ce698d5f6107 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -528,9 +528,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
-		ses->server->mac_signing_key.len = msg->sesskey_len;
-		memcpy(ses->server->mac_signing_key.data.krb5, msg->data,
-			msg->sesskey_len);
+		if (first_time) {
+			ses->server->mac_signing_key.len = msg->sesskey_len;
+			memcpy(ses->server->mac_signing_key.data.krb5,
+				msg->data, msg->sesskey_len);
+		}
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities = cpu_to_le32(capabilities);
-- 
cgit v1.2.3


From bb5a9a04d4cab4b13d63ac5cd3e1fb35f9583607 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 31 Dec 2007 04:21:29 +0000
Subject: [CIFS] cifs_partialpagewrite() cleanup

rc cannot be -EBADF now and condition is always true

Signed-off-by: Vasily Averin <vvs@sw.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index dd26e2759b17..5f7c374ae89c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1179,12 +1179,10 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 		atomic_dec(&open_file->wrtPending);
 		/* Does mm or vfs already set times? */
 		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
-		if ((bytes_written > 0) && (offset)) {
+		if ((bytes_written > 0) && (offset))
 			rc = 0;
-		} else if (bytes_written < 0) {
-			if (rc != -EBADF)
-				rc = bytes_written;
-		}
+		else if (bytes_written < 0)
+			rc = bytes_written;
 	} else {
 		cFYI(1, ("No writeable filehandles for inode"));
 		rc = -EIO;
-- 
cgit v1.2.3


From 28c5a02a11f70bb1fd8dd3b633206e2db3220308 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Dec 2007 04:56:21 +0000
Subject: [CIFS]  fix unicode string alignment in SPNEGO setup

Unicode strings need to be word aligned, but the code that handles that
is currently not taking the length of the SPNEGO blob into account. Fix
it to do so.

Signed-off-by: Jeff Layton <jlayton@tupile.poochiereds.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ce698d5f6107..d2153abcba6d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -542,7 +542,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 		if (ses->capabilities & CAP_UNICODE) {
 			/* unicode strings must be word aligned */
-			if (iov[0].iov_len % 2) {
+			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
 				*bcc_ptr = 0;
 				bcc_ptr++;
 			}
-- 
cgit v1.2.3


From 97837582bc1e191d2792af74c1f3762ed01243b9 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 31 Dec 2007 07:47:21 +0000
Subject: [CIFS] Allow setting mode via cifs acl

Requires cifsacl mount flag to be on and CIFS_EXPERIMENTAL enabled

CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES     |   2 +
 fs/cifs/cifsacl.c   | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/cifs/cifspdu.h   |   3 +
 fs/cifs/cifsproto.h |   4 +-
 fs/cifs/cifssmb.c   |  65 ++++++++++++++
 fs/cifs/inode.c     |  14 ++-
 6 files changed, 315 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 0c778765bd79..edd248367b36 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,6 +5,8 @@ Enable experimental Kerberos support.  Return writebehind errors on flush
 and sync so that events like out of disk space get reported properly on
 cached files. Fix setxattr failure to certain Samba versions. Fix mount
 of second share to disconnected server session (autoreconnect on this).
+Add ability to modify cifs acls for handling chmod (when mounted with
+cifsacl flag).
 
 Version 1.51
 ------------
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c312adcba4fc..a7035bd18e4e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -129,6 +129,54 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 	return (1); /* sids compare/match */
 }
 
+
+/*
+ * Copy control fields, owner SID and group SID from pntsd into pnntsd.
+ * The owner SID lands at byte offset sidsoffset, the group SID right after.
+ */
+static void copy_sec_desc(const struct cifs_ntsd *pntsd,
+				struct cifs_ntsd *pnntsd, __u32 sidsoffset)
+{
+	int i;
+	struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
+	struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+
+	/* copy security descriptor control portion */
+	pnntsd->revision = pntsd->revision;
+	pnntsd->type = pntsd->type;
+	pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd));
+	pnntsd->sacloffset = 0;
+	pnntsd->osidoffset = cpu_to_le32(sidsoffset);
+	pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid));
+
+	/* copy owner sid */
+	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->osidoffset));
+	nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
+
+	nowner_sid_ptr->revision = owner_sid_ptr->revision;
+	nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
+	for (i = 0; i < 6; i++)
+		nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
+	for (i = 0; i < 5; i++)
+		nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
+
+	/* copy group sid */
+	group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->gsidoffset));
+	ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
+					sizeof(struct cifs_sid));
+
+	ngroup_sid_ptr->revision = group_sid_ptr->revision;
+	ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
+	for (i = 0; i < 6; i++)
+		ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
+	/* sub_auth is copied as-is, like the owner's: no byte swapping */
+	for (i = 0; i < 5; i++)
+		ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
+}
+
+
 /*
    change posix mode to reflect permissions
    pmode is the existing mode (we only want to overwrite part of this
@@ -220,6 +268,33 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
 	return;
 }
 
+/* fill in an allow ACE for psid from mode; returns ACE size (cpu order) */
+static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
+			const struct cifs_sid *psid, __u64 nmode, umode_t bits)
+{
+	int i;
+	__u16 size;
+	__u32 access_req = 0;
+
+	pntace->type = ACCESS_ALLOWED;
+	pntace->flags = 0x0;
+	mode_to_access_flags(nmode, bits, &access_req);
+	if (!access_req)
+		access_req = SET_MINIMUM_RIGHTS;
+	pntace->access_req = cpu_to_le32(access_req);
+
+	pntace->sid.revision = psid->revision;
+	pntace->sid.num_subauth = psid->num_subauth;
+	for (i = 0; i < 6; i++)
+		pntace->sid.authority[i] = psid->authority[i];
+	for (i = 0; i < psid->num_subauth; i++)
+		pntace->sid.sub_auth[i] = psid->sub_auth[i];
+	/* type + flags + size + access_req, then the variable length sid */
+	size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
+	pntace->size = cpu_to_le16(size);
+	return size;
+}
+
 
 #ifdef CONFIG_CIFS_DEBUG2
 static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
@@ -243,7 +318,7 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 		int i;
 		cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
 			pace->sid.revision, pace->sid.num_subauth, pace->type,
-			pace->flags, pace->size));
+			pace->flags, le16_to_cpu(pace->size)));
 		for (i = 0; i < num_subauth; ++i) {
 			cFYI(1, ("ACE sub_auth[%d]: 0x%x", i,
 				le32_to_cpu(pace->sid.sub_auth[i])));
@@ -346,6 +421,28 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 }
 
 
+/* Write owner, group and everyone ACEs for nmode into the DACL at pndacl */
+static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
+			struct cifs_sid *pgrpsid, __u64 nmode)
+{
+	__u16 size = 0;	/* bytes of ACEs written so far, cpu byte order */
+	struct cifs_acl *pnndacl;
+
+	/* the ACEs start right after the ACL header */
+	pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
+	size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size),
+					pownersid, nmode, S_IRWXU);
+	size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
+					pgrpsid, nmode, S_IRWXG);
+	size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
+					 &sid_everyone, nmode, S_IRWXO);
+
+	pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
+	pndacl->num_aces = cpu_to_le32(3);	/* on the wire, like size */
+	return (0);
+}
+
+
 static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 {
 	/* BB need to add parm so we can store the SID BB */
@@ -432,6 +529,46 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 }
 
 
+/* Build in pnntsd a security descriptor whose DACL encodes the mode nmode,
+   reusing the owner and group SIDs of the existing descriptor pntsd */
+static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
+				int acl_len, struct inode *inode, __u64 nmode)
+{
+	int rc;
+	__u32 sidsoffset;
+	const __u32 ndacloffset = sizeof(struct cifs_ntsd);
+	struct cifs_sid *osid, *gsid;
+	struct cifs_acl *old_dacl, *new_dacl;	/* no need for SACL ptrs */
+
+	if (!inode || !pntsd || !pnntsd)
+		return (-EIO);
+
+	/* locate the owner sid, group sid and DACL of the old descriptor */
+	osid = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->osidoffset));
+	gsid = (struct cifs_sid *)((char *)pntsd +
+				le32_to_cpu(pntsd->gsidoffset));
+	old_dacl = (struct cifs_acl *)((char *)pntsd +
+				le32_to_cpu(pntsd->dacloffset));
+
+	/* the new DACL sits immediately after the new descriptor header */
+	new_dacl = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+	new_dacl->revision = old_dacl->revision;
+	new_dacl->size = 0;
+	new_dacl->num_aces = 0;
+
+	rc = set_chmod_dacl(new_dacl, osid, gsid, nmode);
+
+	/* the owner and group sids are placed right behind the new DACL */
+	sidsoffset = ndacloffset + le16_to_cpu(new_dacl->size);
+
+	/* copy security descriptor control portion and owner and group sid */
+	copy_sec_desc(pntsd, pnntsd, sidsoffset);
+
+	return (rc);
+}
+
+
 /* Retrieve an ACL from the server */
 static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
 				       const char *path)
@@ -487,6 +624,64 @@ static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
 	return pntsd;
 }
 
+/* Set an ACL on the server: send security descriptor pnntsd (acllen bytes)
+   for inode, using an existing open handle or else opening path */
+static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+				struct inode *inode, const char *path)
+{
+	struct cifsFileInfo *open_file;
+	int unlock_file = FALSE;
+	int xid;
+	int rc = -EIO;
+	__u16 fid;
+	struct super_block *sb;
+	struct cifs_sb_info *cifs_sb;
+
+	if (!inode)
+		return (rc);
+
+	/* only touch inode->i_mode once inode is known to be non-NULL */
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+#endif
+	sb = inode->i_sb;
+	if (sb == NULL)
+		return (rc);
+
+	cifs_sb = CIFS_SB(sb);
+	xid = GetXid();
+
+	open_file = find_readable_file(CIFS_I(inode));
+	if (open_file) {
+		unlock_file = TRUE;
+		fid = open_file->netfid;
+	} else {
+		int oplock = FALSE;
+		/* open file */
+		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
+				WRITE_DAC, 0, &fid, &oplock, NULL,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc != 0) {
+			cERROR(1, ("Unable to open file to set ACL"));
+			FreeXid(xid);
+			return (rc);
+		}
+	}
+
+	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+#ifdef CONFIG_CIFS_DEBUG2
+	cFYI(1, ("SetCIFSACL rc = %d", rc));
+#endif
+	if (unlock_file == TRUE)
+		atomic_dec(&open_file->wrtPending);
+	else
+		CIFSSMBClose(xid, cifs_sb->tcon, fid);
+
+	FreeXid(xid);
+	return (rc);
+}
+
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
 void acl_to_uid_mode(struct inode *inode, const char *path)
 {
@@ -510,24 +705,53 @@ void acl_to_uid_mode(struct inode *inode, const char *path)
 }
 
 /* Convert mode bits to an ACL so we can update the ACL on the server */
-int mode_to_acl(struct inode *inode, const char *path)
+int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 {
 	int rc = 0;
 	__u32 acllen = 0;
-	struct cifs_ntsd *pntsd = NULL;
+	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
+	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
 
+#ifdef CONFIG_CIFS_DEBUG2
 	cFYI(1, ("set ACL from mode for %s", path));
+#endif
 
 	/* Get the security descriptor */
 	pntsd = get_cifs_acl(&acllen, inode, path);
 
-	/* Add/Modify the three ACEs for owner, group, everyone
-	   while retaining the other ACEs */
+	/* Add three ACEs for owner, group, everyone getting rid of
+	   other ACEs as chmod disables ACEs and set the security descriptor */
 
-	/* Set the security descriptor */
+	if (pntsd) {
+		/* allocate memory for the smb header,
+		   set security descriptor request security descriptor
+		   parameters, and security descriptor itself */
 
+		pnntsd = kmalloc(acllen, GFP_KERNEL);
+		if (!pnntsd) {
+			cERROR(1, ("Unable to allocate security descriptor"));
+			kfree(pntsd);
+			return (-ENOMEM);
+		}
 
-	kfree(pntsd);
-	return rc;
+		rc = build_sec_desc(pntsd, pnntsd, acllen, inode, nmode);
+
+#ifdef CONFIG_CIFS_DEBUG2
+		cFYI(1, ("build_sec_desc rc: %d", rc));
+#endif
+
+		if (!rc) {
+			/* Set the security descriptor */
+			rc = set_cifs_acl(pnntsd, acllen, inode, path);
+#ifdef CONFIG_CIFS_DEBUG2
+			cFYI(1, ("set_cifs_acl rc: %d", rc));
+#endif
+		}
+
+		kfree(pnntsd);
+		kfree(pntsd);
+	}
+
+	return (rc);
 }
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index dbe6b846f37f..47f79504f57b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -237,6 +237,9 @@
 				| DELETE | READ_CONTROL | WRITE_DAC \
 				| WRITE_OWNER | SYNCHRONIZE)
 
+#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \
+				| READ_CONTROL | SYNCHRONIZE)
+
 
 /*
  * Invalid readdir handle
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8350eec49663..7093cb4b0212 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -97,7 +97,7 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
 extern void acl_to_uid_mode(struct inode *inode, const char *search_path);
-extern int mode_to_acl(struct inode *inode, const char *path);
+extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
@@ -342,6 +342,8 @@ extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
 		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon,
 			__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
+extern int CIFSSMBSetCIFSACL(const int, struct cifsTconInfo *, __u16,
+			struct cifs_ntsd *, __u32);
 extern int CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
 		const unsigned char *searchName,
 		char *acl_inf, const int buflen, const int acl_type,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 618542b8ce0b..9409524e4bf8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3156,6 +3156,71 @@ qsec_out:
 /*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
 	return rc;
 }
+
+/* Send an NT TRANSACT set security descriptor request that sets the DACL of
+   the file open as fid from the acllen byte security descriptor at pntsd */
+int
+CIFSSMBSetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
+			struct cifs_ntsd *pntsd, __u32 acllen)
+{
+	__u16 byte_count, param_count, data_count, param_offset, data_offset;
+	int rc = 0;
+	int bytes_returned = 0;
+	SET_SEC_DESC_REQ *pSMB = NULL;
+	NTRANSACT_RSP *pSMBr = NULL;
+
+setCifsAclRetry:
+	rc = smb_init(SMB_COM_NT_TRANSACT, 19, tcon, (void **) &pSMB,
+			(void **) &pSMBr);
+	if (rc)
+		return (rc);
+
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+
+	param_count = 8;
+	param_offset = offsetof(struct smb_com_transaction_ssec_req, Fid) - 4;
+	data_count = acllen;
+	data_offset = param_offset + param_count;
+	byte_count = 3 /* pad */  + param_count;
+
+	pSMB->DataCount = cpu_to_le32(data_count);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->MaxParameterCount = cpu_to_le32(4);
+	pSMB->MaxDataCount = cpu_to_le32(16384);
+	pSMB->ParameterCount = cpu_to_le32(param_count);
+	pSMB->ParameterOffset = cpu_to_le32(param_offset);
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->DataOffset = cpu_to_le32(data_offset);
+	pSMB->SetupCount = 0;
+	pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_SET_SECURITY_DESC);
+	pSMB->ByteCount = cpu_to_le16(byte_count+data_count);
+
+	pSMB->Fid = fid; /* file handle always le */
+	pSMB->Reserved2 = 0;
+	pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL);
+
+	if (pntsd && acllen) {
+		/* copy the descriptor into the request buffer that gets sent */
+		memcpy((char *) &pSMB->hdr.Protocol + data_offset,
+			(char *) pntsd, acllen);
+		pSMB->hdr.smb_buf_length += (byte_count + data_count);
+	} else
+		pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc));
+	if (rc)
+		cFYI(1, ("Set CIFS ACL returned %d", rc));
+	cifs_buf_release(pSMB);
+
+	if (rc == -EAGAIN)
+		goto setCifsAclRetry;
+
+	return (rc);
+}
+
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 
 /* Legacy Query Path Information call for lookup to old servers such
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e915eb1d2e66..fdc0fe109d7b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1607,7 +1607,13 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 	else if (attrs->ia_valid & ATTR_MODE) {
 		rc = 0;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+			rc = mode_to_acl(direntry->d_inode, full_path, mode);
+		else if ((mode & S_IWUGO) == 0) /* not writeable */ {
+#else
 		if ((mode & S_IWUGO) == 0) /* not writeable */ {
+#endif
 			if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
 				set_dosattr = TRUE;
 				time_buf.Attributes =
@@ -1626,10 +1632,10 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 			if (time_buf.Attributes == 0)
 				time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
 		}
-		/* BB to be implemented -
-		   via Windows security descriptors or streams */
-		/* CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, uid, gid,
-				      cifs_sb->local_nls); */
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+			mode_to_acl(direntry->d_inode, full_path, mode);
+#endif
 	}
 
 	if (attrs->ia_valid & ATTR_ATIME) {
-- 
cgit v1.2.3


From 6b6adc22a01941165d5af9a3e69e28e948b28f47 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 2 Jan 2008 13:07:25 +0200
Subject: slub: register slabinfo to procfs

We need to register slabinfo to procfs when CONFIG_SLUB is enabled to
make the file actually visible to user-space.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e0d064e9764e..a11968bc0516 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -451,6 +451,20 @@ static const struct file_operations proc_slabstats_operations = {
 #endif
 #endif
 
+#ifdef CONFIG_SLUB
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+	.open		= slabinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i;
@@ -733,6 +747,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_DEBUG_SLAB_LEAK
 	create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);
 #endif
+#endif
+#ifdef CONFIG_SLUB
+	create_seq_entry("slabinfo", S_IWUSR|S_IRUGO, &proc_slabinfo_operations);
 #endif
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("pagetypeinfo", S_IRUGO, &pagetypeinfo_file_ops);
-- 
cgit v1.2.3


From 158a962422e4a54dc256b6a9b9562f3d30d34d9c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Wed, 2 Jan 2008 13:04:48 -0800
Subject: Unify /proc/slabinfo configuration

Both SLUB and SLAB really did almost exactly the same thing for
/proc/slabinfo setup, using duplicate code and per-allocator #ifdef's.

This just creates a common CONFIG_SLABINFO that is enabled by both SLUB
and SLAB, and shares all the setup code.  Maybe SLOB will want this some
day too.

Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_misc.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a11968bc0516..3462bfde89f6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -410,7 +410,7 @@ static const struct file_operations proc_modules_operations = {
 };
 #endif
 
-#ifdef CONFIG_SLAB
+#ifdef CONFIG_SLABINFO
 static int slabinfo_open(struct inode *inode, struct file *file)
 {
 	return seq_open(file, &slabinfo_op);
@@ -451,20 +451,6 @@ static const struct file_operations proc_slabstats_operations = {
 #endif
 #endif
 
-#ifdef CONFIG_SLUB
-static int slabinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slabinfo_op);
-}
-
-static const struct file_operations proc_slabinfo_operations = {
-	.open		= slabinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i;
@@ -742,14 +728,11 @@ void __init proc_misc_init(void)
 #endif
 	create_seq_entry("stat", 0, &proc_stat_operations);
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
-#ifdef CONFIG_SLAB
+#ifdef CONFIG_SLABINFO
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
 #ifdef CONFIG_DEBUG_SLAB_LEAK
 	create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);
 #endif
-#endif
-#ifdef CONFIG_SLUB
-	create_seq_entry("slabinfo", S_IWUSR|S_IRUGO, &proc_slabinfo_operations);
 #endif
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("pagetypeinfo", S_IRUGO, &pagetypeinfo_file_ops);
-- 
cgit v1.2.3


From 831830b5a2b5d413407adf380ef62fe17d6fcbf2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 2 Jan 2008 14:09:57 +0000
Subject: restrict reading from /proc/<pid>/maps to those who share ->mm or can
 ptrace pid

Contents of /proc/*/maps is sensitive and may become sensitive after
open() (e.g.  if target originally shares our ->mm and later does exec
on suid-root binary).

Check at read() (actually, ->start() of iterator) time that mm_struct
we'd grabbed and locked is
 - still the ->mm of target
 - equal to reader's ->mm or the target is ptracable by reader.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c       | 20 ++++++++++++++++++++
 fs/proc/internal.h   |  2 ++
 fs/proc/task_mmu.c   |  3 +--
 fs/proc/task_nommu.c |  4 +---
 4 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 02a63ac04178..7411bfb0b7cc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -202,6 +202,26 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	 (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
 	 security_ptrace(current,task) == 0))
 
+/* Return task's mm with a reference and mmap_sem held for read, or NULL */
+struct mm_struct *mm_for_maps(struct task_struct *task)
+{
+	struct mm_struct *mm = get_task_mm(task);
+	int allowed;
+	if (!mm)
+		return NULL;
+	down_read(&mm->mmap_sem);
+	task_lock(task);
+	/* mm must still be the target's, and be ours or ptraceable by us */
+	allowed = task->mm == mm &&
+		  (task->mm == current->mm || __ptrace_may_attach(task) >= 0);
+	task_unlock(task);
+	if (allowed)
+		return mm;
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return NULL;
+}
+
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 {
 	int res = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1820eb2ef762..05b3e9006262 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -27,6 +27,8 @@ struct vmalloc_info {
 	unsigned long	largest_chunk;
 };
 
+extern struct mm_struct *mm_for_maps(struct task_struct *);
+
 #ifdef CONFIG_MMU
 #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
 extern void get_vmalloc_info(struct vmalloc_info *vmi);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c24d81a5a040..8043a3eab52c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,12 +397,11 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (!priv->task)
 		return NULL;
 
-	mm = get_task_mm(priv->task);
+	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
 
 	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
-	down_read(&mm->mmap_sem);
 
 	/* Start with last addr hint */
 	if (last_addr && (vma = find_vma(mm, last_addr))) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index d8b8c7183c24..1932c2ca3457 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -165,15 +165,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (!priv->task)
 		return NULL;
 
-	mm = get_task_mm(priv->task);
+	mm = mm_for_maps(priv->task);
 	if (!mm) {
 		put_task_struct(priv->task);
 		priv->task = NULL;
 		return NULL;
 	}
 
-	down_read(&mm->mmap_sem);
-
 	/* start from the Nth VMA */
 	for (vml = mm->context.vmlist; vml; vml = vml->next)
 		if (n-- == 0)
-- 
cgit v1.2.3


From e9cc6c234bfe414ef36f484e3ad8be621854c440 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 2 Jan 2008 13:28:57 -0500
Subject: NFS: Fix a possible Oops in fs/nfs/super.c

Sigh... commit 4584f520e1f773082ef44ff4f8969a5d992b16ec (NFS: Fix NFS
mountpoint crossing...) had a slight flaw: server can be NULL if sget()
returned an existing superblock.

Fix the fix by dereferencing s->s_fs_info.

Thanks to Coverity/Adrian Bunk and Frank Filz for spotting the bug.
(See http://bugzilla.kernel.org/show_bug.cgi?id=9647)

Also add in the same namespace Oops fix for NFSv4 in both the mountpoint
crossing case, and the referral case.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ea929207f274..0b0c72a072ff 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1475,7 +1475,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
 	}
-	if (mntroot->d_inode->i_op != server->nfs_client->rpc_ops->dir_inode_ops) {
+	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
 		dput(mntroot);
 		error = -ESTALE;
 		goto error_splat_super;
@@ -1826,6 +1826,11 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
 	}
+	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
+		dput(mntroot);
+		error = -ESTALE;
+		goto error_splat_super;
+	}
 
 	s->s_flags |= MS_ACTIVE;
 	mnt->mnt_sb = s;
@@ -1900,6 +1905,11 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
 	}
+	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) {
+		dput(mntroot);
+		error = -ESTALE;
+		goto error_splat_super;
+	}
 
 	s->s_flags |= MS_ACTIVE;
 	mnt->mnt_sb = s;
-- 
cgit v1.2.3


From b274b48f3ef6e43e3831e8793c697a9573a607af Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 2 Jan 2008 13:52:03 -0500
Subject: NFSv4: Fix circular locking dependency in nfs4_kill_renewd

Erez Zadok reports:

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.24-rc6-unionfs2 #80
-------------------------------------------------------
umount.nfs4/4017 is trying to acquire lock:
 (&(&clp->cl_renewd)->work){--..}, at: [<c0223e53>]
__cancel_work_timer+0x83/0x17f

but task is already holding lock:
 (&clp->cl_sem){----}, at: [<f8879897>] nfs4_kill_renewd+0x17/0x29 [nfs]

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (&clp->cl_sem){----}:
       [<c0230699>] __lock_acquire+0x9cc/0xb95
       [<c0230c39>] lock_acquire+0x5f/0x78
       [<c0397cb8>] down_read+0x3a/0x4c
       [<f88798e6>] nfs4_renew_state+0x1c/0x1b8 [nfs]
       [<c0223821>] run_workqueue+0xd9/0x1ac
       [<c0224220>] worker_thread+0x7a/0x86
       [<c0226b49>] kthread+0x3b/0x62
       [<c02033a3>] kernel_thread_helper+0x7/0x10
       [<ffffffff>] 0xffffffff

-> #0 (&(&clp->cl_renewd)->work){--..}:
       [<c0230589>] __lock_acquire+0x8bc/0xb95
       [<c0230c39>] lock_acquire+0x5f/0x78
       [<c0223e87>] __cancel_work_timer+0xb7/0x17f
       [<c0223f5a>] cancel_delayed_work_sync+0xb/0xd
       [<f887989e>] nfs4_kill_renewd+0x1e/0x29 [nfs]
       [<f885a8f6>] nfs_free_client+0x37/0x9e [nfs]
       [<f885ab20>] nfs_put_client+0x5d/0x62 [nfs]
       [<f885ab9a>] nfs_free_server+0x75/0xae [nfs]
       [<f8862672>] nfs4_kill_super+0x27/0x2b [nfs]
       [<c0258aab>] deactivate_super+0x3f/0x51
       [<c0269668>] mntput_no_expire+0x42/0x67
       [<c025d0e4>] path_release_on_umount+0x15/0x18
       [<c0269d30>] sys_umount+0x1a3/0x1cb
       [<c0269d71>] sys_oldumount+0x19/0x1b
       [<c02026ca>] sysenter_past_esp+0x5f/0xa5
       [<ffffffff>] 0xffffffff

Looking at the code, it would seem that taking the clp->cl_sem in
nfs4_kill_renewd is completely redundant, since we're already guaranteed to
have exclusive access to the nfs_client (we're shutting down).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4renewd.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3ea352d82eba..5e2e4af1a0e6 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -133,9 +133,7 @@ nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 void
 nfs4_kill_renewd(struct nfs_client *clp)
 {
-	down_read(&clp->cl_sem);
 	cancel_delayed_work_sync(&clp->cl_renewd);
-	up_read(&clp->cl_sem);
 }
 
 /*
-- 
cgit v1.2.3


From bb22629ee87eed5054f8b508dbe7c58abad0a324 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 2 Jan 2008 15:19:18 -0500
Subject: NFSv4: nfs4_open_confirm must not set the open_owner as confirmed on
 error

RFC3530 states that the open_owner is confirmed if and only if the client
sends an OPEN_CONFIRM request with the appropriate sequence id and stateid
within the lease period.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f03d9d5f5ba4..571b5ec92132 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -741,10 +741,10 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 	if (data->rpc_status == 0) {
 		memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
 				sizeof(data->o_res.stateid.data));
+		nfs_confirm_seqid(&data->owner->so_seqid, 0);
 		renew_lease(data->o_res.server, data->timestamp);
 		data->rpc_done = 1;
 	}
-	nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status);
 	nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid);
 }
 
@@ -759,7 +759,6 @@ static void nfs4_open_confirm_release(void *calldata)
 	/* In case of error, no cleanup! */
 	if (!data->rpc_done)
 		goto out_free;
-	nfs_confirm_seqid(&data->owner->so_seqid, 0);
 	state = nfs4_opendata_to_nfs4_state(data);
 	if (!IS_ERR(state))
 		nfs4_close_state(&data->path, state, data->o_arg.open_flags);
@@ -886,7 +885,6 @@ static void nfs4_open_release(void *calldata)
 	/* In case we need an open_confirm, no cleanup! */
 	if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)
 		goto out_free;
-	nfs_confirm_seqid(&data->owner->so_seqid, 0);
 	state = nfs4_opendata_to_nfs4_state(data);
 	if (!IS_ERR(state))
 		nfs4_close_state(&data->path, state, data->o_arg.open_flags);
-- 
cgit v1.2.3


From e6e21970baff4845de74584e2efc8c964a55d574 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 2 Jan 2008 16:27:16 -0500
Subject: NFSv4: Fix open_to_lock_owner sequenceid allocation...

NFSv4 file locking is currently completely broken since it doesn't respect
the OPEN sequencing when it is given an unconfirmed lock_owner and needs to
do an open_to_lock_owner. Worse: it breaks the sunrpc rules by doing a
GFP_KERNEL allocation inside an rpciod callback.

Fix is to preallocate the open seqid structure in nfs4_alloc_lockdata if we
see that the lock_owner is unconfirmed.
Then, in nfs4_lock_prepare() we wait for either the open_seqid, if
the lock_owner is still unconfirmed, or else fall back to waiting on the
standard lock_seqid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 571b5ec92132..9e2e1c7291db 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3331,6 +3331,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
+	if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) {
+		p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+		if (p->arg.open_seqid == NULL)
+			goto out_free;
+
+	}
 	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
 	if (p->arg.lock_seqid == NULL)
 		goto out_free;
@@ -3343,6 +3349,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	memcpy(&p->fl, fl, sizeof(p->fl));
 	return p;
 out_free:
+	if (p->arg.open_seqid != NULL)
+		nfs_free_seqid(p->arg.open_seqid);
 	kfree(p);
 	return NULL;
 }
@@ -3359,23 +3367,23 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 		.rpc_cred = sp->so_cred,
 	};
 
-	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
-		return;
 	dprintk("%s: begin!\n", __FUNCTION__);
 	/* Do we need to do an open_to_lock_owner? */
 	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
-		data->arg.open_seqid = nfs_alloc_seqid(&sp->so_seqid);
-		if (data->arg.open_seqid == NULL) {
-			data->rpc_status = -ENOMEM;
-			task->tk_action = NULL;
-			goto out;
-		}
+		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
+			return;
 		data->arg.open_stateid = &state->stateid;
 		data->arg.new_lock_owner = 1;
+		/* Retest in case we raced... */
+		if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED))
+			goto do_rpc;
 	}
+	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
+		return;
+	data->arg.new_lock_owner = 0;
+do_rpc:	
 	data->timestamp = jiffies;
 	rpc_call_setup(task, &msg, 0);
-out:
 	dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
 }
 
@@ -3411,8 +3419,6 @@ static void nfs4_lock_release(void *calldata)
 	struct nfs4_lockdata *data = calldata;
 
 	dprintk("%s: begin!\n", __FUNCTION__);
-	if (data->arg.open_seqid != NULL)
-		nfs_free_seqid(data->arg.open_seqid);
 	if (data->cancelled != 0) {
 		struct rpc_task *task;
 		task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
@@ -3422,6 +3428,8 @@ static void nfs4_lock_release(void *calldata)
 		dprintk("%s: cancelling lock!\n", __FUNCTION__);
 	} else
 		nfs_free_seqid(data->arg.lock_seqid);
+	if (data->arg.open_seqid != NULL)
+		nfs_free_seqid(data->arg.open_seqid);
 	nfs4_put_lock_state(data->lsp);
 	put_nfs_open_context(data->ctx);
 	kfree(data);
-- 
cgit v1.2.3


From 88e7d705c4bdb729f02173583628ccbf49dba945 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 3 Jan 2008 17:37:09 +0000
Subject: [CIFS] hold ses sem on tcp session reconnect during mount

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 658f58b99e6f..db3746c891b5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1966,13 +1966,13 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		pSesInfo = existingCifsSes;
 		cFYI(1, ("Existing smb sess found (status=%d)",
 			pSesInfo->status));
+		down(&pSesInfo->sesSem);
 		if (pSesInfo->status == CifsNeedReconnect) {
 			cFYI(1, ("Session needs reconnect"));
-			down(&pSesInfo->sesSem);
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
-			up(&pSesInfo->sesSem);
 		}
+		up(&pSesInfo->sesSem);
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
@@ -3522,7 +3522,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 		sesInfoFree(ses);
 
 	FreeXid(xid);
-	return rc;	/* BB check if we should always return zero here */
+	return rc;
 }
 
 int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
-- 
cgit v1.2.3


From 29a424f28390752a4ca2349633aaacc6be494db5 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Thu, 3 Jan 2008 13:09:33 -0600
Subject: JFS: clear PAGECACHE_TAG_DIRTY for no-write pages

When JFS decides to drop a dirty metapage, it simply clears the META_dirty
bit and leaves alone the PG_dirty and PAGECACHE_TAG_DIRTY bits.

When such a no-write page goes to metapage_writepage(), the `relic'
PAGECACHE_TAG_DIRTY tag should be cleared, to prevent pdflush from
repeatedly trying to sync it.  This is done through
set_page_writeback(), so it should be called in all cases.  If
no I/O is initiated, end_page_writeback() should be called immediately.

This is how __block_write_full_page() does things.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
CC: Fengguang Wu <wfg@mail.ustc.edu.cn>
---
 fs/jfs/jfs_metapage.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index f5cd8d38af7a..b27fa117f229 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -360,6 +360,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 	struct metapage *mp;
 	int redirty = 0;
 	sector_t lblock;
+	int nr_underway = 0;
 	sector_t pblock;
 	sector_t next_block = 0;
 	sector_t page_start;
@@ -371,6 +372,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
 
 	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
 		mp = page_to_mp(page, offset);
@@ -413,11 +415,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 			if (!bio->bi_size)
 				goto dump_bio;
 			submit_bio(WRITE, bio);
+			nr_underway++;
 			bio = NULL;
-		} else {
-			set_page_writeback(page);
+		} else
 			inc_io(page);
-		}
 		xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
 		pblock = metapage_get_blocks(inode, lblock, &xlen);
 		if (!pblock) {
@@ -449,12 +450,16 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 			goto dump_bio;
 
 		submit_bio(WRITE, bio);
+		nr_underway++;
 	}
 	if (redirty)
 		redirty_page_for_writepage(wbc, page);
 
 	unlock_page(page);
 
+	if (nr_underway == 0)
+		end_page_writeback(page);
+
 	return 0;
 add_failed:
 	/* We should never reach here, since we're only adding one vec */
-- 
cgit v1.2.3


From 67e6682f18b3bf812a994ae027ff87174a297ae8 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Wed, 10 Oct 2007 11:11:24 -0500
Subject: JFS: Make sure special inode data is written after journal is flushed

This patch makes sure that data that we tried to flush before the journal
was completely written actually gets pushed to disk.

To avoid duplicating code, moved common code to write_special_inodes().

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_logmgr.c | 32 +++++++++++++++-----------------
 fs/jfs/jfs_umount.c |  4 ++--
 2 files changed, 17 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 15a3974cdeeb..2370716d57ad 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -208,6 +208,17 @@ static struct lmStat {
 } lmStat;
 #endif
 
+static void write_special_inodes(struct jfs_log *log,
+				 int (*writer)(struct address_space *))
+{
+	struct jfs_sb_info *sbi;
+
+	list_for_each_entry(sbi, &log->sb_list, log_list) {
+		writer(sbi->ipbmap->i_mapping);
+		writer(sbi->ipimap->i_mapping);
+		writer(sbi->direct_inode->i_mapping);
+	}
+}
 
 /*
  * NAME:	lmLog()
@@ -935,22 +946,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
 	struct lrd lrd;
 	int lsn;
 	struct logsyncblk *lp;
-	struct jfs_sb_info *sbi;
 	unsigned long flags;
 
 	/* push dirty metapages out to disk */
 	if (hard_sync)
-		list_for_each_entry(sbi, &log->sb_list, log_list) {
-			filemap_fdatawrite(sbi->ipbmap->i_mapping);
-			filemap_fdatawrite(sbi->ipimap->i_mapping);
-			filemap_fdatawrite(sbi->direct_inode->i_mapping);
-		}
+		write_special_inodes(log, filemap_fdatawrite);
 	else
-		list_for_each_entry(sbi, &log->sb_list, log_list) {
-			filemap_flush(sbi->ipbmap->i_mapping);
-			filemap_flush(sbi->ipimap->i_mapping);
-			filemap_flush(sbi->direct_inode->i_mapping);
-		}
+		write_special_inodes(log, filemap_flush);
 
 	/*
 	 *	forward syncpt
@@ -1536,7 +1538,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
 {
 	int i;
 	struct tblock *target = NULL;
-	struct jfs_sb_info *sbi;
 
 	/* jfs_write_inode may call us during read-only mount */
 	if (!log)
@@ -1598,11 +1599,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
 	if (wait < 2)
 		return;
 
-	list_for_each_entry(sbi, &log->sb_list, log_list) {
-		filemap_fdatawrite(sbi->ipbmap->i_mapping);
-		filemap_fdatawrite(sbi->ipimap->i_mapping);
-		filemap_fdatawrite(sbi->direct_inode->i_mapping);
-	}
+	write_special_inodes(log, filemap_fdatawrite);
 
 	/*
 	 * If there was recent activity, we may need to wait
@@ -1611,6 +1608,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
 	if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
 		for (i = 0; i < 200; i++) {	/* Too much? */
 			msleep(250);
+			write_special_inodes(log, filemap_fdatawrite);
 			if (list_empty(&log->cqueue) &&
 			    list_empty(&log->synclist))
 				break;
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 7971f37534a3..adcf92d3b603 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
 		/*
 		 * Wait for outstanding transactions to be written to log:
 		 */
-		jfs_flush_journal(log, 2);
+		jfs_flush_journal(log, 1);
 
 	/*
 	 * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
 	 *
 	 * remove file system from log active file system list.
 	 */
-	jfs_flush_journal(log, 2);
+	jfs_flush_journal(log, 1);
 
 	/*
 	 * Make sure all metadata makes it to disk
-- 
cgit v1.2.3


From 54af6233d1cb84cdfaa6ea44ea0db0bcf518baac Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 26 Nov 2007 14:58:10 -0600
Subject: JFS is missing a memory barrier

JFS is missing a memory barrier needed to close the critical section before
clearing the lock bit. Use lock bitops for this.

unlock_page() has a second barrier after clearing the lock, which is
required because it checks whether the waitqueue is active without locks.
Such a barrier is not required here because the waitqueue spinlock is
always taken (something to think about if performance is an issue).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_metapage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index b27fa117f229..1dfaae5adf1b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -39,11 +39,11 @@ static struct {
 #endif
 
 #define metapage_locked(mp) test_bit(META_locked, &(mp)->flag)
-#define trylock_metapage(mp) test_and_set_bit(META_locked, &(mp)->flag)
+#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag)
 
 static inline void unlock_metapage(struct metapage *mp)
 {
-	clear_bit(META_locked, &mp->flag);
+	clear_bit_unlock(META_locked, &mp->flag);
 	wake_up(&mp->wait);
 }
 
-- 
cgit v1.2.3


From 1eb3a711d6a1c8a4697a2e89d09048353b8aefd3 Mon Sep 17 00:00:00 2001
From: Jack Stone <jack@hawkeye.stone.uk.eu.org>
Date: Tue, 31 Jul 2007 09:36:53 -0500
Subject: Remove unnecessary kmalloc casts in the jfs filesystem

Signed-off-by: Jack Stone <jack@hawkeye.stone.uk.eu.org>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_dtree.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index df25ecc418af..97c66f913393 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -592,9 +592,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
 	struct component_name ciKey;
 	struct super_block *sb = ip->i_sb;
 
-	ciKey.name =
-	    (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
-				GFP_NOFS);
+	ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
 	if (ciKey.name == 0) {
 		rc = -ENOMEM;
 		goto dtSearch_Exit2;
@@ -957,9 +955,7 @@ static int dtSplitUp(tid_t tid,
 	smp = split->mp;
 	sp = DT_PAGE(ip, smp);
 
-	key.name =
-	    (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
-				GFP_NOFS);
+	key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
 	if (key.name == 0) {
 		DT_PUTPAGE(smp);
 		rc = -ENOMEM;
-- 
cgit v1.2.3


From a7fe0ba7eee4f7c53077ff2bed2b581db17d00df Mon Sep 17 00:00:00 2001
From: Shaun Zinck <shaun.zinck@gmail.com>
Date: Fri, 31 Aug 2007 12:57:28 -0500
Subject: JFS: use DIV_ROUND_UP where appropriate

This replaces some macros and code, which do the same thing as DIV_ROUND_UP
defined in kernel.h, to use the DIV_ROUND_UP macro.

Signed-off-by: Shaun Zinck <shaun.zinck@gmail.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_dtree.h | 4 ++--
 fs/jfs/resize.c    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 8561c6ecece0..cdac2d5bafeb 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -74,7 +74,7 @@ struct idtentry {
 #define DTIHDRDATALEN	11
 
 /* compute number of slots for entry */
-#define	NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
+#define	NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15))
 
 
 /*
@@ -133,7 +133,7 @@ struct dir_table_slot {
 	( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
 
 /* compute number of slots for entry */
-#define	NDTLEAF_LEGACY(klen)	( ((2 + (klen)) + (15 - 1)) / 15 )
+#define	NDTLEAF_LEGACY(klen)	(DIV_ROUND_UP((2 + (klen)), 15))
 #define	NDTLEAF	NDTINTERNAL
 
 
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 71984ee95346..7f24a0bb08ca 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -172,7 +172,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	 */
 	t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
 	    << L2BPERDMAP;
-	t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50;
+	t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50;
 	newFSCKSize = t32 << sbi->l2nbperpage;
 	newFSCKAddress = newLogAddress - newFSCKSize;
 
-- 
cgit v1.2.3


From 09aaa749f637b19c308464c2b65a001e67c2a16c Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 13 Nov 2007 22:16:08 -0600
Subject: JFS: Remove defconfig ptr comparison to 0

Remove sparse warning: Using plain integer as NULL pointer

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_dtree.c  | 19 ++++++++++---------
 fs/jfs/jfs_imap.c   |  4 ++--
 fs/jfs/jfs_logmgr.c |  2 +-
 fs/jfs/jfs_mount.c  |  2 +-
 fs/jfs/namei.c      |  2 +-
 5 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 97c66f913393..4dcc05819998 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -284,11 +284,11 @@ static struct dir_table_slot *find_index(struct inode *ip, u32 index,
 			release_metapage(*mp);
 			*mp = NULL;
 		}
-		if (*mp == 0) {
+		if (!(*mp)) {
 			*lblock = blkno;
 			*mp = read_index_page(ip, blkno);
 		}
-		if (*mp == 0) {
+		if (!(*mp)) {
 			jfs_err("free_index: error reading directory table");
 			return NULL;
 		}
@@ -413,7 +413,8 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 		}
 		ip->i_size = PSIZE;
 
-		if ((mp = get_index_page(ip, 0)) == 0) {
+		mp = get_index_page(ip, 0);
+		if (!mp) {
 			jfs_err("add_index: get_metapage failed!");
 			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
 			memcpy(&jfs_ip->i_dirtable, temp_table,
@@ -461,7 +462,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 	} else
 		mp = read_index_page(ip, blkno);
 
-	if (mp == 0) {
+	if (!mp) {
 		jfs_err("add_index: get/read_metapage failed!");
 		goto clean_up;
 	}
@@ -499,7 +500,7 @@ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
 
 	dirtab_slot = find_index(ip, index, &mp, &lblock);
 
-	if (dirtab_slot == 0)
+	if (!dirtab_slot)
 		return;
 
 	dirtab_slot->flag = DIR_INDEX_FREE;
@@ -526,7 +527,7 @@ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
 
 	dirtab_slot = find_index(ip, index, mp, lblock);
 
-	if (dirtab_slot == 0)
+	if (!dirtab_slot)
 		return;
 
 	DTSaddress(dirtab_slot, bn);
@@ -552,7 +553,7 @@ static int read_index(struct inode *ip, u32 index,
 	struct dir_table_slot *slot;
 
 	slot = find_index(ip, index, &mp, &lblock);
-	if (slot == 0) {
+	if (!slot) {
 		return -EIO;
 	}
 
@@ -593,7 +594,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
 	struct super_block *sb = ip->i_sb;
 
 	ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
-	if (ciKey.name == 0) {
+	if (!ciKey.name) {
 		rc = -ENOMEM;
 		goto dtSearch_Exit2;
 	}
@@ -956,7 +957,7 @@ static int dtSplitUp(tid_t tid,
 	sp = DT_PAGE(ip, smp);
 
 	key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
-	if (key.name == 0) {
+	if (!key.name) {
 		DT_PUTPAGE(smp);
 		rc = -ENOMEM;
 		goto dtSplitUp_Exit;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3870ba8b9086..9bf29f771737 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -381,7 +381,7 @@ int diRead(struct inode *ip)
 
 	/* read the page of disk inode */
 	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
-	if (mp == 0) {
+	if (!mp) {
 		jfs_err("diRead: read_metapage failed");
 		return -EIO;
 	}
@@ -654,7 +654,7 @@ int diWrite(tid_t tid, struct inode *ip)
 	/* read the page of disk inode */
       retry:
 	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
-	if (mp == 0)
+	if (!mp)
 		return -EIO;
 
 	/* get the pointer to the disk inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 2370716d57ad..325a9679b95a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2345,7 +2345,7 @@ int jfsIOWait(void *arg)
 
 	do {
 		spin_lock_irq(&log_redrive_lock);
-		while ((bp = log_redrive_list) != 0) {
+		while ((bp = log_redrive_list)) {
 			log_redrive_list = bp->l_redrive_next;
 			bp->l_redrive_next = NULL;
 			spin_unlock_irq(&log_redrive_lock);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 644429acb8c0..7b698f2ec45a 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -147,7 +147,7 @@ int jfs_mount(struct super_block *sb)
 	 */
 	if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
 		ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
-		if (ipaimap2 == 0) {
+		if (!ipaimap2) {
 			jfs_err("jfs_mount: Faild to read AGGREGATE_I");
 			rc = -EIO;
 			goto errout35;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4e0a8493cef6..d6e5ebad739a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1103,7 +1103,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * Make sure dest inode number (if any) is what we think it is
 	 */
 	rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
-	if (rc == 0) {
+	if (!rc) {
 		if ((new_ip == 0) || (ino != new_ip->i_ino)) {
 			rc = -ESTALE;
 			goto out3;
-- 
cgit v1.2.3


From da8a41d19233c2bdcc59447aedc808fcdaabf5b7 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Tue, 13 Nov 2007 22:25:41 -0600
Subject: JFS: Fix one more plain integer as NULL pointer warning

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index d6e5ebad739a..f8718de3505e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1104,7 +1104,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
 	if (!rc) {
-		if ((new_ip == 0) || (ino != new_ip->i_ino)) {
+		if ((!new_ip) || (ino != new_ip->i_ino)) {
 			rc = -ESTALE;
 			goto out3;
 		}
-- 
cgit v1.2.3


From 3fee37c1e2579ed3d6090f690e5fd8cf7fa3bb44 Mon Sep 17 00:00:00 2001
From: Akos Maroy <darkeye@tyrell.hu>
Date: Sun, 6 Jan 2008 11:15:55 +0100
Subject: fix: using joysticks in 32 bit applications on 64 bit systems

unfortunately 32 bit apps don't see the joysticks on a 64 bit system.
this prevents one playing X-Plane (http://www.x-plane.com/) or other
32-bit games with joysticks.

this is a known issue, and already raised several times:

 http://readlist.com/lists/vger.kernel.org/linux-kernel/28/144411.html

 http://www.brettcsmith.org/wiki/wiki.cgi?action=browse&diff=1&id=OzyComputer/Joystick

unfortunately this is still not fixed in the mainline kernel.

it would be nice to have this fixed, so that people can play these games
without having to patch their kernel.

the following patch solves the problem on 2.6.22.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e8b7c3a98a54..da8cb3b3592c 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,6 +10,8 @@
  * ioctls.
  */
 
+#include <linux/joystick.h>
+
 #include <linux/types.h>
 #include <linux/compat.h>
 #include <linux/kernel.h>
@@ -2642,6 +2644,12 @@ COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES)
 COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
 COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE)
 
+/* joystick */
+COMPATIBLE_IOCTL(JSIOCGVERSION)
+COMPATIBLE_IOCTL(JSIOCGAXES)
+COMPATIBLE_IOCTL(JSIOCGBUTTONS)
+COMPATIBLE_IOCTL(JSIOCGNAME(0))
+
 /* now things that need handlers */
 HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob)
 HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob)
-- 
cgit v1.2.3


From 45626bb26a6ecd163e5eeddd14a6137052ec4495 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 7 Jan 2008 14:22:44 -0800
Subject: core dump: real_parent ppid

The pr_ppid field reported in core dumps should match what
getppid() would have returned to that process, regardless of
whether a debugger is attached.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ba8de7ca260b..f0b3171842f2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1384,7 +1384,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->parent);
+	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1430,7 +1430,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_psargs[len] = 0;
 
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->parent);
+	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
-- 
cgit v1.2.3


From f6d09982197c4163c70f6af0cf15bb78674105c0 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 8 Jan 2008 23:18:22 +0000
Subject: [CIFS] fix checkpatch warnings in fs/cifs/inode.c

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README  | 28 +++++++++++++++++-----------
 fs/cifs/TODO    | 14 ++++++--------
 fs/cifs/inode.c | 14 ++++++++------
 3 files changed, 31 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index bf11329ac784..c623e2f9c5db 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -56,7 +56,8 @@ the CIFS VFS web site) copy it to the same directory in which mount.smbfs and
 similar files reside (usually /sbin).  Although the helper software is not  
 required, mount.cifs is recommended.  Eventually the Samba 3.0 utility program 
 "net" may also be helpful since it may someday provide easier mount syntax for
-users who are used to Windows e.g.  net use <mount point> <UNC name or cifs URL>
+users who are used to Windows e.g.
+	net use <mount point> <UNC name or cifs URL>
 Note that running the Winbind pam/nss module (logon service) on all of your
 Linux clients is useful in mapping Uids and Gids consistently across the
 domain to the proper network user.  The mount.cifs mount helper can be
@@ -248,7 +249,7 @@ A partial list of the supported mount options follows:
 		the CIFS session.
   password	The user password.  If the mount helper is
 		installed, the user will be prompted for password
-		if it is not supplied.
+		if not supplied.
   ip		The ip address of the target server
   unc		The target server Universal Network Name (export) to 
 		mount.	
@@ -283,7 +284,7 @@ A partial list of the supported mount options follows:
 		can be enabled by specifying file_mode and dir_mode on 
 		the client.  Note that the mount.cifs helper must be
 		at version 1.10 or higher to support specifying the uid
-		(or gid) in non-numberic form.
+		(or gid) in non-numeric form.
   gid		Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
 		this overrides the default mode for file inodes.
@@ -417,9 +418,10 @@ A partial list of the supported mount options follows:
   acl   	Allow setfacl and getfacl to manage posix ACLs if server
 		supports them.  (default)
   noacl 	Do not allow setfacl and getfacl calls on this mount
-  user_xattr    Allow getting and setting user xattrs as OS/2 EAs (extended
-		attributes) to the server (default) e.g. via setfattr 
-		and getfattr utilities. 
+  user_xattr    Allow getting and setting user xattrs (those attributes whose
+		name begins with "user." or "os2.") as OS/2 EAs (extended
+		attributes) to the server.  This allows support of the
+		setfattr and getfattr utilities. (default)
   nouser_xattr  Do not allow getfattr/setfattr to get/set/list xattrs 
   mapchars      Translate six of the seven reserved characters (not backslash)
 			*?<>|:
@@ -434,6 +436,7 @@ A partial list of the supported mount options follows:
  nomapchars     Do not translate any of these seven characters (default).
  nocase         Request case insensitive path name matching (case
 		sensitive is the default if the server suports it).
+		(mount option "ignorecase" is identical to "nocase")
  posixpaths     If CIFS Unix extensions are supported, attempt to
 		negotiate posix path name support which allows certain
 		characters forbidden in typical CIFS filenames, without
@@ -485,6 +488,9 @@ A partial list of the supported mount options follows:
 			ntlmv2i Use NTLMv2 password hashing with packet signing
 			lanman  (if configured in kernel config) use older
 				lanman hash
+hard		Retry file operations if server is not responding
+soft		Limit retries to unresponsive servers (usually only
+		one retry) before returning an error.  (default)
 
 The mount.cifs mount helper also accepts a few mount options before -o
 including:
@@ -535,8 +541,8 @@ SecurityFlags		Flags which control security negotiation and
 			must use NTLM					0x02002
 			may use NTLMv2					0x00004
 			must use NTLMv2					0x04004
-			may use Kerberos security (not implemented yet) 0x00008
-			must use Kerberos (not implemented yet)         0x08008
+			may use Kerberos security			0x00008
+			must use Kerberos				0x08008
 			may use lanman (weak) password hash  		0x00010
 			must use lanman password hash			0x10010
 			may use plaintext passwords    			0x00020
@@ -626,6 +632,6 @@ returned success.
 	
 Also note that "cat /proc/fs/cifs/DebugData" will display information about 
 the active sessions and the shares that are mounted.
-Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is enabled
-but requires a user space helper (from the Samba project). NTLM and NTLMv2 and
-LANMAN support do not require this helpr.
+Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is
+on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and
+LANMAN support do not require this helper.
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index a8852c200728..92c9feac440f 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
-Version 1.49 April 26, 2007
+Version 1.52 January 3, 2008
 
 A Partial List of Missing Features
 ==================================
@@ -16,16 +16,14 @@ SecurityDescriptors
 c) Better pam/winbind integration (e.g. to handle uid mapping
 better)
 
-d) Verify that Kerberos signing works
-
-e) Cleanup now unneeded SessSetup code in
+d) Cleanup now unneeded SessSetup code in
 fs/cifs/connect.c and add back in NTLMSSP code if any servers
 need it
 
-f) MD5-HMAC signing SMB PDUs when SPNEGO style SessionSetup 
-used (Kerberos or NTLMSSP). Signing alreadyimplemented for NTLM
-and raw NTLMSSP already. This is important when enabling
-extended security and mounting to Windows 2003 Servers
+e) ms-dfs and ms-dfs host name resolution cleanup
+
+f) fix NTLMv2 signing when two mounts with different users to same
+server.
 
 g) Directory entry caching relies on a 1 second timer, rather than 
 using FindNotify or equivalent.  - (started)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fdc0fe109d7b..d9567ba2960b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -54,9 +54,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 					    MAX_TREE_SIZE + 1) +
 				    strnlen(search_path, MAX_PATHCONF) + 1,
 				    GFP_KERNEL);
-			if (tmp_path == NULL) {
+			if (tmp_path == NULL)
 				return -ENOMEM;
-			}
+
 			/* have to skip first of the double backslash of
 			   UNC name */
 			strncpy(tmp_path, pTcon->treeName, MAX_TREE_SIZE);
@@ -511,7 +511,8 @@ int cifs_get_inode_info(struct inode **pinode,
 		}
 
 		spin_lock(&inode->i_lock);
-		if (is_size_safe_to_change(cifsInfo, le64_to_cpu(pfindData->EndOfFile))) {
+		if (is_size_safe_to_change(cifsInfo,
+					   le64_to_cpu(pfindData->EndOfFile))) {
 			/* can not safely shrink the file size here if the
 			   client is writing to it due to potential races */
 			i_size_write(inode, le64_to_cpu(pfindData->EndOfFile));
@@ -931,7 +932,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		(CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
 		u32 oplock = 0;
-		FILE_UNIX_BASIC_INFO * pInfo =
+		FILE_UNIX_BASIC_INFO *pInfo =
 			kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
 		if (pInfo == NULL) {
 			rc = -ENOMEM;
@@ -1610,10 +1611,11 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
 			rc = mode_to_acl(direntry->d_inode, full_path, mode);
-		else if ((mode & S_IWUGO) == 0) /* not writeable */ {
+		else if ((mode & S_IWUGO) == 0) {
 #else
-		if ((mode & S_IWUGO) == 0) /* not writeable */ {
+		if ((mode & S_IWUGO) == 0) {
 #endif
+			/* not writeable */
 			if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
 				set_dosattr = TRUE;
 				time_buf.Attributes =
-- 
cgit v1.2.3


From 9f966be8996f2829406324c68e4c67c2d64d864b Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Tue, 8 Jan 2008 15:32:41 -0800
Subject: fat: optimize fat_count_free_clusters()

On a large partition, scanning the free clusters is very slow if the user
doesn't use the "usefree" option.

To optimize it, this patch uses sb_breadahead() to read ahead the FAT
sectors. On one user's 15GB partition, this patch improved it very
much (1min => 600ms).

The following is the result of 2GB partition on my machine.

without patch:
	root@devron (/)# time df -h > /dev/null

	real    0m1.202s
	user    0m0.000s
	sys     0m0.440s

with patch:
	root@devron (/)# time df -h > /dev/null

	real    0m0.378s
	user    0m0.012s
	sys     0m0.168s

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fatent.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'fs')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 2c1b73fb82ae..5fb366992b73 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -590,21 +590,49 @@ error:
 
 EXPORT_SYMBOL_GPL(fat_free_clusters);
 
+/* 128kb is the whole sectors for FAT12 and FAT16 */
+#define FAT_READA_SIZE		(128 * 1024)
+
+static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
+			  unsigned long reada_blocks)
+{
+	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	sector_t blocknr;
+	int i, offset;
+
+	ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
+
+	for (i = 0; i < reada_blocks; i++)
+		sb_breadahead(sb, blocknr + i);
+}
+
 int fat_count_free_clusters(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct fatent_operations *ops = sbi->fatent_ops;
 	struct fat_entry fatent;
+	unsigned long reada_blocks, reada_mask, cur_block;
 	int err = 0, free;
 
 	lock_fat(sbi);
 	if (sbi->free_clusters != -1)
 		goto out;
 
+	reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
+	reada_mask = reada_blocks - 1;
+	cur_block = 0;
+
 	free = 0;
 	fatent_init(&fatent);
 	fatent_set_entry(&fatent, FAT_START_ENT);
 	while (fatent.entry < sbi->max_cluster) {
+		/* readahead of fat blocks */
+		if ((cur_block & reada_mask) == 0) {
+			unsigned long rest = sbi->fat_length - cur_block;
+			fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
+		}
+		cur_block++;
+
 		err = fat_ent_read_block(sb, &fatent);
 		if (err)
 			goto out;
-- 
cgit v1.2.3


From caeeeecfdaeada2998eb3c29c3ebd59afb79ef06 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 8 Jan 2008 15:33:02 -0800
Subject: eCryptfs: fix dentry handling on create error, unlink, and inode
 destroy

This patch corrects some erroneous dentry handling in eCryptfs.

If there is a problem creating the lower file, then there is nothing that
the persistent lower file can do to really help us.  This patch makes a
vfs_create() failure in the lower filesystem always lead to an
unconditional do_create failure in eCryptfs.

Under certain sequences of operations, the eCryptfs dentry can remain in
the dcache after an unlink.  This patch calls d_drop() on the eCryptfs
dentry to correct this.

eCryptfs has no business calling d_delete() directly on a lower
filesystem's dentry.  This patch removes the call to d_delete() on the
lower persistent file's dentry in ecryptfs_destroy_inode().

(Thanks to David Kleikamp, Eric Sandeen, and Jeff Moyer for helping
identify and resolve this issue)

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/inode.c | 20 ++++----------------
 fs/ecryptfs/super.c |  1 -
 2 files changed, 4 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0b1ab016fa2e..5a719180983c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -120,22 +120,9 @@ ecryptfs_do_create(struct inode *directory_inode,
 	rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
 					     ecryptfs_dentry, mode, nd);
 	if (rc) {
-		struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
-		struct ecryptfs_inode_info *inode_info =
-			ecryptfs_inode_to_private(ecryptfs_inode);
-
-		printk(KERN_WARNING "%s: Error creating underlying file; "
-		       "rc = [%d]; checking for existing\n", __FUNCTION__, rc);
-		if (inode_info) {
-			mutex_lock(&inode_info->lower_file_mutex);
-			if (!inode_info->lower_file) {
-				mutex_unlock(&inode_info->lower_file_mutex);
-				printk(KERN_ERR "%s: Failure to set underlying "
-				       "file; rc = [%d]\n", __FUNCTION__, rc);
-				goto out_lock;
-			}
-			mutex_unlock(&inode_info->lower_file_mutex);
-		}
+		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
+		       "rc = [%d]\n", __FUNCTION__, rc);
+		goto out_lock;
 	}
 	rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
 				directory_inode->i_sb, 0);
@@ -451,6 +438,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
 	dentry->d_inode->i_nlink =
 		ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
 	dentry->d_inode->i_ctime = dir->i_ctime;
+	d_drop(dentry);
 out_unlock:
 	unlock_parent(lower_dentry);
 	return rc;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index f8cdab2bee3d..4859c4eecd65 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -86,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 			fput(inode_info->lower_file);
 			inode_info->lower_file = NULL;
 			d_drop(lower_dentry);
-			d_delete(lower_dentry);
 		}
 	}
 	mutex_unlock(&inode_info->lower_file_mutex);
-- 
cgit v1.2.3


From cf0594625083111ae522496dc1c256f7476939c2 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 8 Jan 2008 15:33:20 -0800
Subject: hfs: handle more on-disk corruptions without oopsing

hfs seems prone to bad things when it encounters on-disk corruption.  Many
values are read from disk, and used as lengths to memcpy, as an example.
This patch fixes up several of these problematic cases.

o sanity check the on-disk maximum key lengths on mount
  (these are set to a defined value at mkfs time and shouldn't differ)
o check on-disk node keylens against the maximum key length for each tree
o fix hfs_btree_open so that going out via free_tree: doesn't wind
  up in hfs_releasepage, which wants to follow the very pointer
  we were trying to set up:
	HFS_SB(sb)->cat_tree = hfs_btree_open()
		...
		failure gets to hfs_releasepage and tries
		to follow HFS_SB(sb)->cat_tree

Tested with the fsfuzzer; it survives more than it used to.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfs/bfind.c | 12 ++++++++++++
 fs/hfs/brec.c  | 15 +++++++++++++--
 fs/hfs/btree.c | 13 ++++++++++++-
 fs/hfs/hfs.h   |  5 +++++
 4 files changed, 42 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index f13f1494d4fe..f8452a0eab56 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		rec = (e + b) / 2;
 		len = hfs_brec_lenoff(bnode, rec, &off);
 		keylen = hfs_brec_keylen(bnode, rec);
+		if (keylen == HFS_BAD_KEYLEN) {
+			res = -EINVAL;
+			goto done;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
 		if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
+		if (keylen == HFS_BAD_KEYLEN) {
+			res = -EINVAL;
+			goto done;
+		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
 done:
@@ -198,6 +206,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
 
 	len = hfs_brec_lenoff(bnode, fd->record, &off);
 	keylen = hfs_brec_keylen(bnode, fd->record);
+	if (keylen == HFS_BAD_KEYLEN) {
+		res = -EINVAL;
+		goto out;
+	}
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 5c87cf4801fc..8626ee375ea8 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -44,10 +44,21 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
-		if (node->tree->attributes & HFS_TREE_BIGKEYS)
+		if (node->tree->attributes & HFS_TREE_BIGKEYS) {
 			retval = hfs_bnode_read_u16(node, recoff) + 2;
-		else
+			if (retval > node->tree->max_key_len + 2) {
+				printk(KERN_ERR "hfs: keylen %d too large\n",
+					retval);
+				retval = HFS_BAD_KEYLEN;
+			}
+		} else {
 			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+			if (retval > node->tree->max_key_len + 1) {
+				printk(KERN_ERR "hfs: keylen %d too large\n",
+					retval);
+				retval = HFS_BAD_KEYLEN;
+			}
+		}
 	}
 	return retval;
 }
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 8a3a650abc87..31284c77bba8 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -81,6 +81,17 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 		goto fail_page;
 	if (!tree->node_count)
 		goto fail_page;
+	if ((id == HFS_EXT_CNID) && (tree->max_key_len != HFS_MAX_EXT_KEYLEN)) {
+		printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+			tree->max_key_len);
+		goto fail_page;
+	}
+	if ((id == HFS_CAT_CNID) && (tree->max_key_len != HFS_MAX_CAT_KEYLEN)) {
+		printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+			tree->max_key_len);
+		goto fail_page;
+	}
+
 	tree->node_size_shift = ffs(size) - 1;
 	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
@@ -89,9 +100,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	return tree;
 
  fail_page:
-	tree->inode->i_mapping->a_ops = &hfs_aops;
 	page_cache_release(page);
  free_tree:
+	tree->inode->i_mapping->a_ops = &hfs_aops;
 	iput(tree->inode);
 	kfree(tree);
 	return NULL;
diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h
index 1445e3a56ed4..c6aae61adfe6 100644
--- a/fs/hfs/hfs.h
+++ b/fs/hfs/hfs.h
@@ -28,6 +28,8 @@
 #define HFS_MAX_NAMELEN		128
 #define HFS_MAX_VALENCE		32767U
 
+#define HFS_BAD_KEYLEN		0xFF
+
 /* Meanings of the drAtrb field of the MDB,
  * Reference: _Inside Macintosh: Files_ p. 2-61
  */
@@ -167,6 +169,9 @@ typedef union hfs_btree_key {
 	struct hfs_ext_key ext;
 } hfs_btree_key;
 
+#define HFS_MAX_CAT_KEYLEN	(sizeof(struct hfs_cat_key) - sizeof(u8))
+#define HFS_MAX_EXT_KEYLEN	(sizeof(struct hfs_ext_key) - sizeof(u8))
+
 typedef union hfs_btree_key btree_key;
 
 struct hfs_extent {
-- 
cgit v1.2.3


From 6103335de8afa5d780dcd512abe85c696af7b040 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 9 Jan 2008 16:21:36 +0000
Subject: [CIFS] DNS name resolution helper upcall for cifs

	Adds additional option CIFS_DFS_UPCALL to fs/Kconfig for enabling
        DFS support.  Resolved IP address is saved as a string in the
	key payload.

	Igor has a series of related patches that will follow which finish up
	CIFS DFS support

Acked-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/Kconfig       | 39 ++++++++++++++++++++++++++-------------
 fs/cifs/Makefile |  2 ++
 fs/cifs/cifsfs.c | 15 ++++++++++++++-
 3 files changed, 42 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 487236c65837..18cd22149466 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1905,13 +1905,15 @@ config CIFS
 	  file servers such as Windows 2000 (including Windows 2003, NT 4  
 	  and Windows XP) as well by Samba (which provides excellent CIFS
 	  server support for Linux and many other operating systems). Limited
-	  support for OS/2 and Windows ME and similar servers is provided as well.
-
-	  The intent of the cifs module is to provide an advanced
-	  network file system client for mounting to CIFS compliant servers,
-	  including support for dfs (hierarchical name space), secure per-user
-	  session establishment, safe distributed caching (oplock), optional
-	  packet signing, Unicode and other internationalization improvements. 
+	  support for OS/2 and Windows ME and similar servers is provided as
+	  well.
+
+	  The cifs module provides an advanced network file system
+	  client for mounting to CIFS compliant servers.  It includes
+	  support for DFS (hierarchical name space), secure per-user
+	  session establishment via Kerberos or NTLM or NTLMv2,
+	  safe distributed caching (oplock), optional packet
+	  signing, Unicode and other internationalization improvements.
 	  If you need to mount to Samba or Windows from this machine, say Y.
 
 config CIFS_STATS
@@ -1943,7 +1945,8 @@ config CIFS_WEAK_PW_HASH
 	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
 	  security mechanisms. These hash the password more securely
 	  than the mechanisms used in the older LANMAN version of the
-          SMB protocol needed to establish sessions with old SMB servers.
+	  SMB protocol but LANMAN based authentication is needed to
+	  establish sessions with some old SMB servers.
 
 	  Enabling this option allows the cifs module to mount to older
 	  LANMAN based servers such as OS/2 and Windows 95, but such
@@ -1951,8 +1954,8 @@ config CIFS_WEAK_PW_HASH
 	  security mechanisms if you are on a public network.  Unless you
 	  have a need to access old SMB servers (and are on a private 
 	  network) you probably want to say N.  Even if this support
-	  is enabled in the kernel build, they will not be used
-	  automatically. At runtime LANMAN mounts are disabled but
+	  is enabled in the kernel build, LANMAN authentication will not be
+	  used automatically. At runtime LANMAN mounts are disabled but
 	  can be set to required (or optional) either in
 	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
 	  option on the mount command. This support is disabled by 
@@ -2018,12 +2021,22 @@ config CIFS_UPCALL
 	  depends on CIFS_EXPERIMENTAL
 	  depends on KEYS
 	  help
-	    Enables an upcall mechanism for CIFS which will be used to contact
-	    userspace helper utilities to provide SPNEGO packaged Kerberos
-	    tickets which are needed to mount to certain secure servers
+	    Enables an upcall mechanism for CIFS which accesses
+	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+	    Kerberos tickets which are needed to mount to certain secure servers
 	    (for which more secure Kerberos authentication is required). If
 	    unsure, say N.
 
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support (EXPERIMENTAL)"
+	  depends on CIFS_EXPERIMENTAL
+	  depends on KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which contacts userspace
+	    helper utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
+
 config NCP_FS
 	tristate "NCP file system support (to mount NetWare volumes)"
 	depends on IPX!=n || INET
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 45e42fb97c19..09898b8dc69b 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -9,3 +9,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  readdir.o ioctl.o sess.o export.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
+
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 093beaa3900d..000b4a5d3219 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -44,6 +44,7 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
+#include "dns_resolve.h"
 #include "cifs_spnego.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
@@ -1014,12 +1015,17 @@ init_cifs(void)
 	rc = register_key_type(&cifs_spnego_key_type);
 	if (rc)
 		goto out_unregister_filesystem;
+#endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	rc = register_key_type(&key_type_dns_resolver);
+	if (rc)
+		goto out_unregister_key_type;
 #endif
 	oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
 	if (IS_ERR(oplockThread)) {
 		rc = PTR_ERR(oplockThread);
 		cERROR(1, ("error %d create oplock thread", rc));
-		goto out_unregister_key_type;
+		goto out_unregister_dfs_key_type;
 	}
 
 	dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
@@ -1033,7 +1039,11 @@ init_cifs(void)
 
  out_stop_oplock_thread:
 	kthread_stop(oplockThread);
+ out_unregister_dfs_key_type:
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	unregister_key_type(&key_type_dns_resolver);
  out_unregister_key_type:
+#endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
  out_unregister_filesystem:
@@ -1059,6 +1069,9 @@ exit_cifs(void)
 #ifdef CONFIG_PROC_FS
 	cifs_proc_clean();
 #endif
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	unregister_key_type(&key_type_dns_resolver);
+#endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
 #endif
-- 
cgit v1.2.3


From 197c183f3526dc08aa52ca97ec66c268442d4b84 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 10 Jan 2008 17:10:23 +0000
Subject: [CIFS] Forgot to add two new files from previous commit

Thanks to Igor for noticing this.
CC: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dns_resolve.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/dns_resolve.h |  32 +++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 fs/cifs/dns_resolve.c
 create mode 100644 fs/cifs/dns_resolve.h

(limited to 'fs')

diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
new file mode 100644
index 000000000000..777a086abd6f
--- /dev/null
+++ b/fs/cifs/dns_resolve.c
@@ -0,0 +1,123 @@
+/*
+ *  fs/cifs/dns_resolve.c
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *
+ *   Contains the CIFS DFS upcall routines used for hostname to
+ *   IP address translation.
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <keys/user-type.h>
+#include "dns_resolve.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+
+static int dns_resolver_instantiate(struct key *key, const void *data,
+		size_t datalen)
+{
+	int rc = 0;
+	char *ip;
+
+	ip = kmalloc(datalen+1, GFP_KERNEL);
+	if (!ip)
+		return -ENOMEM;
+
+	memcpy(ip, data, datalen);
+	ip[datalen] = '\0';
+
+	rcu_assign_pointer(key->payload.data, ip);
+
+	return rc;
+}
+
+struct key_type key_type_dns_resolver = {
+	.name        = "dns_resolver",
+	.def_datalen = sizeof(struct in_addr),
+	.describe    = user_describe,
+	.instantiate = dns_resolver_instantiate,
+	.match       = user_match,
+};
+
+
+/* Resolves server name to ip address.
+ * input:
+ * 	unc - server UNC
+ * output:
+ * 	*ip_addr - pointer to server ip, caller responsible for freeing it.
+ * return 0 on success
+ */
+int
+dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) {
+	int rc = -EAGAIN;
+	struct key *rkey;
+	char *name;
+	int len;
+
+	if ((!ip_addr) || (!unc))
+		return -EINVAL;
+
+	/* search for server name delimiter */
+	len = strlen(unc);
+	if (len < 3) {
+		cFYI(1, ("%s: unc is too short: %s", __FUNCTION__, unc));
+		return -EINVAL;
+	}
+	len -= 2;
+	name = memchr(unc+2, '\\', len);
+	if (!name) {
+		cFYI(1, ("%s: probably server name is whole unc: %s",
+					__FUNCTION__, unc));
+	} else {
+		len = (name - unc) - 2/* leading // */;
+	}
+
+	name = kmalloc(len+1, GFP_KERNEL);
+	if (!name) {
+		rc = -ENOMEM;
+		return rc;
+	}
+	memcpy(name, unc+2, len);
+	name[len] = 0;
+
+	rkey = request_key(&key_type_dns_resolver, name, "");
+	if (!IS_ERR(rkey)) {
+		len = strlen(rkey->payload.data);
+		*ip_addr = kmalloc(len+1, GFP_KERNEL);
+		if (*ip_addr) {
+			memcpy(*ip_addr, rkey->payload.data, len);
+			(*ip_addr)[len] = '\0';
+			cFYI(1, ("%s: resolved: %s to %s", __FUNCTION__,
+					rkey->description,
+					*ip_addr
+				));
+			rc = 0;
+		} else {
+			rc = -ENOMEM;
+		}
+		key_put(rkey);
+	} else {
+		cERROR(1, ("%s: unable to resolve: %s", __FUNCTION__, name));
+	}
+
+	kfree(name);
+	return rc;
+}
+
+
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
new file mode 100644
index 000000000000..073fdc3db419
--- /dev/null
+++ b/fs/cifs/dns_resolve.h
@@ -0,0 +1,32 @@
+/*
+ *   fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
+ *                            Handles host name to IP address resolution
+ * 
+ *   Copyright (c) International Business Machines  Corp., 2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _DNS_RESOLVE_H
+#define _DNS_RESOLVE_H
+
+#ifdef __KERNEL__
+#include <linux/key-type.h>
+extern struct key_type key_type_dns_resolver;
+extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
+#endif /* KERNEL */
+
+#endif /* _DNS_RESOLVE_H */
-- 
cgit v1.2.3


From d0dc3701cb46f73cf8ca393f62e325065b0bbd03 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 10 Jan 2008 16:07:54 -0500
Subject: NFSv4: Give the lock stateid its own sequence queue

Sharing the open sequence queue causes a deadlock when we try to take
both a lock sequence id and an open sequence id.

This fixes the regression reported by Dimitri Puzin and Jeff Garzik: See

	http://bugzilla.kernel.org/show_bug.cgi?id=9712

for details.

Reported-and-tested-by: Dimitri Puzin <bugs@psycast.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/nfs4_fs.h   | 1 +
 fs/nfs/nfs4state.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b35069a2aa9e..bd1b9d663fb9 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -115,6 +115,7 @@ struct nfs4_lock_state {
 #define NFS_LOCK_INITIALIZED 1
 	int			ls_flags;
 	struct nfs_seqid_counter	ls_seqid;
+	struct rpc_sequence	ls_sequence;
 	struct nfs_unique_id	ls_id;
 	nfs4_stateid		ls_stateid;
 	atomic_t		ls_count;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 23a9a36556bf..5a39c6f78acf 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -509,7 +509,10 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
 	if (lsp == NULL)
 		return NULL;
-	lsp->ls_seqid.sequence = &state->owner->so_sequence;
+	rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
+	spin_lock_init(&lsp->ls_sequence.lock);
+	INIT_LIST_HEAD(&lsp->ls_sequence.list);
+	lsp->ls_seqid.sequence = &lsp->ls_sequence;
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_owner = fl_owner;
 	spin_lock(&clp->cl_lock);
-- 
cgit v1.2.3


From 967c9ec4ec6178bee42f4231c49a3d7f77627978 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Thu, 10 Jan 2008 16:04:25 -0600
Subject: JFS: simplify types to get rid of sparse warning

jfs_metapage.c was using uints and unsigned ints inconsistently when
regular ints suffice.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/jfs_metapage.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 1dfaae5adf1b..d1e64f2f2fcd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -88,7 +88,7 @@ struct meta_anchor {
 };
 #define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
-static inline struct metapage *page_to_mp(struct page *page, uint offset)
+static inline struct metapage *page_to_mp(struct page *page, int offset)
 {
 	if (!PagePrivate(page))
 		return NULL;
@@ -153,7 +153,7 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 }
 
 #else
-static inline struct metapage *page_to_mp(struct page *page, uint offset)
+static inline struct metapage *page_to_mp(struct page *page, int offset)
 {
 	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
@@ -249,7 +249,7 @@ static inline void drop_metapage(struct page *page, struct metapage *mp)
  */
 
 static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
-				    unsigned int *len)
+				    int *len)
 {
 	int rc = 0;
 	int xflag;
@@ -352,11 +352,11 @@ static void metapage_write_end_io(struct bio *bio, int err)
 static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct bio *bio = NULL;
-	unsigned int block_offset;	/* block offset of mp within page */
+	int block_offset;	/* block offset of mp within page */
 	struct inode *inode = page->mapping->host;
-	unsigned int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
-	unsigned int len;
-	unsigned int xlen;
+	int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
+	int len;
+	int xlen;
 	struct metapage *mp;
 	int redirty = 0;
 	sector_t lblock;
@@ -366,7 +366,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 	sector_t page_start;
 	unsigned long bio_bytes = 0;
 	unsigned long bio_offset = 0;
-	unsigned int offset;
+	int offset;
 
 	page_start = (sector_t)page->index <<
 		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -428,7 +428,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 			continue;
 		}
 		set_bit(META_io, &mp->flag);
-		len = min(xlen, (uint) JFS_SBI(inode->i_sb)->nbperpage);
+		len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
 
 		bio = bio_alloc(GFP_NOFS, 1);
 		bio->bi_bdev = inode->i_sb->s_bdev;
@@ -480,13 +480,13 @@ static int metapage_readpage(struct file *fp, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct bio *bio = NULL;
-	unsigned int block_offset;
-	unsigned int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	int block_offset;
+	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
 	sector_t page_start;	/* address of page in fs blocks */
 	sector_t pblock;
-	unsigned int xlen;
+	int xlen;
 	unsigned int len;
-	unsigned int offset;
+	int offset;
 
 	BUG_ON(!PageLocked(page));
 	page_start = (sector_t)page->index <<
@@ -535,7 +535,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 {
 	struct metapage *mp;
 	int ret = 1;
-	unsigned int offset;
+	int offset;
 
 	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
 		mp = page_to_mp(page, offset);
-- 
cgit v1.2.3


From e6ab15827eec0bc4444421f7ccf0223de321c708 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <niallain@gmail.com>
Date: Fri, 11 Jan 2008 01:49:48 +0000
Subject: [CIFS] DFS support patchset: Added mountdata

Also cifs_fs_type was made non-static for usage in the DFS code.

Signed-off-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_fs_sb.h |  5 ++++-
 fs/cifs/cifsfs.c     | 37 ++++++++++++++++++++++++++++++++++++-
 fs/cifs/cifsfs.h     |  1 +
 3 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 34af556cdd8d..8ad2330ba061 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -43,6 +43,9 @@ struct cifs_sb_info {
 	mode_t	mnt_dir_mode;
 	int     mnt_cifs_flags;
 	int	prepathlen;
-	char   *prepath;
+	char   *prepath; /* relative path under the share to mount to */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	char   *mountdata; /* mount options received at mount time */
+#endif
 };
 #endif				/* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 000b4a5d3219..93e107883a61 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -97,6 +97,9 @@ cifs_read_super(struct super_block *sb, void *data,
 {
 	struct inode *inode;
 	struct cifs_sb_info *cifs_sb;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	int len;
+#endif
 	int rc = 0;
 
 	/* BB should we make this contingent on mount parm? */
@@ -106,6 +109,25 @@ cifs_read_super(struct super_block *sb, void *data,
 	if (cifs_sb == NULL)
 		return -ENOMEM;
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	/* copy mount params to sb for use in submounts */
+	/* BB: should we move this after the mount so we
+	 * do not have to do the copy on failed mounts?
+	 * BB: May be it is better to do simple copy before
+	 * complex operation (mount), and in case of fail
+	 * just exit instead of doing mount and attempting
+	 * undo it if this copy fails?*/
+	len = strlen(data);
+	cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
+	if (cifs_sb->mountdata == NULL) {
+		kfree(sb->s_fs_info);
+		sb->s_fs_info = NULL;
+		return -ENOMEM;
+	}
+	strncpy(cifs_sb->mountdata, data, len + 1);
+	cifs_sb->mountdata[len] = '\0';
+#endif
+
 	rc = cifs_mount(sb, cifs_sb, data, devname);
 
 	if (rc) {
@@ -155,6 +177,12 @@ out_no_root:
 
 out_mount_failed:
 	if (cifs_sb) {
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		if (cifs_sb->mountdata) {
+			kfree(cifs_sb->mountdata);
+			cifs_sb->mountdata = NULL;
+		}
+#endif
 		if (cifs_sb->local_nls)
 			unload_nls(cifs_sb->local_nls);
 		kfree(cifs_sb);
@@ -178,6 +206,13 @@ cifs_put_super(struct super_block *sb)
 	if (rc) {
 		cERROR(1, ("cifs_umount failed with return code %d", rc));
 	}
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	if (cifs_sb->mountdata) {
+		kfree(cifs_sb->mountdata);
+		cifs_sb->mountdata = NULL;
+	}
+#endif
+
 	unload_nls(cifs_sb->local_nls);
 	kfree(cifs_sb);
 	return;
@@ -553,7 +588,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 	return remote_llseek(file, offset, origin);
 }
 
-static struct file_system_type cifs_fs_type = {
+struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "cifs",
 	.get_sb = cifs_get_sb,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2a21dc66f0de..2e68126d07eb 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,6 +32,7 @@
 #define TRUE 1
 #endif
 
+extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
-- 
cgit v1.2.3


From aea6ad0ce5e215ce99fe9e3edd9268f696862d8f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 10 Jan 2008 16:43:26 +1100
Subject: [XFS] fix unaligned access in readdir

This patch should fix the issue seen on Alpha with unaligned accesses in
the new readdir code. By aligning each dirent to sizeof(u64) we'll avoid
unaligned accesses. To make doubly sure we're not hitting problems also
rearrange struct hack_dirent to avoid holes.

SGI-PV: 975411
SGI-Modid: xfs-linux-melb:xfs-kern:30302a

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 4847eb83fc18..21a1c2b1c5fc 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -261,9 +261,9 @@ xfs_file_readdir(
 #else
 
 struct hack_dirent {
-	int		namlen;
-	loff_t		offset;
 	u64		ino;
+	loff_t		offset;
+	int		namlen;
 	unsigned int	d_type;
 	char		name[];
 };
@@ -285,8 +285,10 @@ xfs_hack_filldir(
 {
 	struct hack_callback *buf = __buf;
 	struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used);
+	unsigned int reclen;
 
-	if (buf->used + sizeof(struct hack_dirent) + namlen > buf->len)
+	reclen = ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64));
+	if (buf->used + reclen > buf->len)
 		return -EINVAL;
 
 	de->namlen = namlen;
@@ -294,7 +296,7 @@ xfs_hack_filldir(
 	de->ino = ino;
 	de->d_type = d_type;
 	memcpy(de->name, name, namlen);
-	buf->used += sizeof(struct hack_dirent) + namlen;
+	buf->used += reclen;
 	return 0;
 }
 
@@ -334,7 +336,8 @@ xfs_file_readdir(
 		offset = filp->f_pos;
 
 	while (!eof) {
-		int reclen;
+		unsigned int reclen;
+
 		start_offset = offset;
 
 		buf.used = 0;
@@ -355,7 +358,8 @@ xfs_file_readdir(
 				goto done;
 			}
 
-			reclen = sizeof(struct hack_dirent) + de->namlen;
+			reclen = ALIGN(sizeof(struct hack_dirent) + de->namlen,
+				       sizeof(u64));
 			size -= reclen;
 			de = (struct hack_dirent *)((char *)de + reclen);
 			curr_offset = de->offset /* & 0x7fffffff */;
-- 
cgit v1.2.3


From 974a9f0b47da74e28f68b9c8645c3786aa5ace1a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Sat, 12 Jan 2008 14:06:34 -0800
Subject: Use access mode instead of open flags to determine needed permissions

Way back when (in commit 834f2a4a1554dc5b2598038b3fe8703defcbe467, aka
"VFS: Allow the filesystem to return a full file pointer on open intent"
to be exact), Trond changed the open logic to keep track of the original
flags to a file open, in order to pass down the intent of a dentry
lookup to the low-level filesystem.

However, when doing that reorganization, it changed the meaning of
namei_flags, and thus inadvertently changed the test of access mode for
directories (and RO filesystem) to use the wrong flag.  So fix those
tests back to use access mode ("acc_mode") rather than the open flag
("flag").

Issue noticed by Bill Roman at Datalight.

Reported-and-tested-by: Bill Roman <bill.roman@datalight.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 3b993db26cee..73e2e665817a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,7 +1605,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 	if (S_ISLNK(inode->i_mode))
 		return -ELOOP;
 	
-	if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
+	if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
 		return -EISDIR;
 
 	/*
@@ -1620,7 +1620,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
-	} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
+	} else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
 		return -EROFS;
 
 	error = vfs_permission(nd, acc_mode);
-- 
cgit v1.2.3


From 84427eaef1fb91704c7112bdb598c810003b99f3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 10 Jan 2008 12:52:04 -0800
Subject: remove task_ppid_nr_ns

task_ppid_nr_ns is called in three places.  One of these should never
have called it.  In the other two, using it broke the existing
semantics.  This was presumably accidental.  If the function had not
been there, it would have been much more obvious to the eye that those
patches were changing the behavior.  We don't need this function.

In task_state, the pid of the ptracer is not the ppid of the ptracer.

In do_task_stat, ppid is the tgid of the real_parent, not its pid.
I also moved the call outside of lock_task_sighand, since it doesn't
need it.

In sys_getppid, ppid is the tgid of the real_parent, not its pid.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65c62e1bfd6f..810eb8fd6500 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -169,7 +169,7 @@ static inline char *task_state(struct task_struct *p, char *buffer)
 	ppid = pid_alive(p) ?
 		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
 	tpid = pid_alive(p) && p->ptrace ?
-		task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+		task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -426,6 +426,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	cgtime = gtime = cputime_zero;
 
 	rcu_read_lock();
+	ppid = task_tgid_nr_ns(task->real_parent, ns);
 	if (lock_task_sighand(task, &flags)) {
 		struct signal_struct *sig = task->signal;
 
@@ -465,7 +466,6 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 
 		sid = task_session_nr_ns(task, ns);
 		pgid = task_pgrp_nr_ns(task, ns);
-		ppid = task_ppid_nr_ns(task, ns);
 
 		unlock_task_sighand(task, &flags);
 	}
-- 
cgit v1.2.3


From ba67a39efde8312e386c6f603054f8945433d91f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 11 Jan 2008 17:06:52 -0500
Subject: knfsd: Allow NFSv2/3 WRITE calls to succeed when krb5i etc is used.

When RPCSEC/GSS and krb5i is used, requests are padded, typically to a multiple
of 8 bytes.  This can make the request look slightly longer than it
really is.

As of

	f34b95689d2ce001c "The NFSv2/NFSv3 server does not handle zero
		length WRITE request correctly",

the xdr decode routines for NFSv2 and NFSv3 reject requests that aren't
the right length, so krb5i (for example) WRITE requests can get lost.

This patch relaxes the appropriate test and enhances the related comment.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Peter Staubach <staubach@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfsd/nfs3xdr.c | 5 ++++-
 fs/nfsd/nfsxdr.c  | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2d116d2298f8..f917fd25858a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -388,8 +388,11 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 	 * Round the length of the data which was specified up to
 	 * the next multiple of XDR units and then compare that
 	 * against the length which was actually received.
+	 * Note that when RPCSEC/GSS (for example) is used, the
+	 * data buffer can be padded so dlen might be larger
+	 * than required.  It must never be smaller.
 	 */
-	if (dlen != XDR_QUADLEN(len)*4)
+	if (dlen < XDR_QUADLEN(len)*4)
 		return 0;
 
 	if (args->count > max_blocksize) {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 986f9b32083c..b86e3658a0af 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -313,8 +313,11 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 	 * Round the length of the data which was specified up to
 	 * the next multiple of XDR units and then compare that
 	 * against the length which was actually received.
+	 * Note that when RPCSEC/GSS (for example) is used, the
+	 * data buffer can be padded so dlen might be larger
+	 * than required.  It must never be smaller.
 	 */
-	if (dlen != XDR_QUADLEN(len)*4)
+	if (dlen < XDR_QUADLEN(len)*4)
 		return 0;
 
 	rqstp->rq_vec[0].iov_base = (void*)p;
-- 
cgit v1.2.3


From a98fdcef941e107eeabae622d85a1f476f25a160 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 15 Jan 2008 00:02:37 +0300
Subject: fix the "remove task_ppid_nr_ns" commit

Commit 84427eaef1fb91704c7112bdb598c810003b99f3 (remove task_ppid_nr_ns)
moved the task_tgid_nr_ns(task->real_parent) outside of lock_task_sighand().
This is wrong, ->real_parent could be freed/reused.

Both ->parent/real_parent point to nothing after __exit_signal() because
we remove the child from ->children list, and thus the child can't be
reparented when its parent exits.

rcu_read_lock() protects ->parent/real_parent, but _only_ if we know it was
valid before we take rcu lock.

Revert this part of the patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 810eb8fd6500..eb97f2897e2b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -426,7 +426,6 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 	cgtime = gtime = cputime_zero;
 
 	rcu_read_lock();
-	ppid = task_tgid_nr_ns(task->real_parent, ns);
 	if (lock_task_sighand(task, &flags)) {
 		struct signal_struct *sig = task->signal;
 
@@ -465,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 		}
 
 		sid = task_session_nr_ns(task, ns);
+		ppid = task_tgid_nr_ns(task->real_parent, ns);
 		pgid = task_pgrp_nr_ns(task, ns);
 
 		unlock_task_sighand(task, &flags);
-- 
cgit v1.2.3


From c23f72cae9523d29ff94eec8f30ccbdaf234b20e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@woody.linux-foundation.org>
Date: Mon, 14 Jan 2008 21:21:29 -0800
Subject: Revert "writeback: introduce writeback_control.more_io to indicate
 more io"

This reverts commit 2e6883bdf49abd0e7f0d9b6297fc3be7ebb2250b, as
requested by Fengguang Wu.  It's not quite fully baked yet, and while
there are patches around to fix the problems it caused, they should get
more testing.  Says Fengguang: "I'll resend them both for -mm later on,
in a more complete patchset".

See

	http://bugzilla.kernel.org/show_bug.cgi?id=9738

for some of this discussion.

Requested-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0fca82021d76..300324bd563c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -482,8 +482,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
-	if (!list_empty(&sb->s_more_io))
-		wbc->more_io = 1;
 	return;		/* Leave any unwritten inodes on s_io */
 }
 
-- 
cgit v1.2.3


From e49452c67703d3647467d65275fb893589384fed Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 16 Jan 2008 12:06:14 +0900
Subject: sysfs: make sysfs_lookup() return ERR_PTR(-ENOENT) on failed lookup

sysfs tries to keep dcache a strict subset of sysfs_dirent tree by
shooting down dentries when a node is removed, that is, no negative
dentry for sysfs.  However, the lookup function returned NULL and thus
created negative dentries when the target node didn't exist.

Make sysfs_lookup() return ERR_PTR(-ENOENT) on lookup failure.  This
fixes the NULL dereference bug in sysfs_get_dentry() discovered by
bluetooth rfcomm device moving around.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sysfs/dir.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 337162935d21..b197016bbfbe 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -678,8 +678,10 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
 
 	/* no such entry */
-	if (!sd)
+	if (!sd) {
+		ret = ERR_PTR(-ENOENT);
 		goto out_unlock;
+	}
 
 	/* attach dentry and inode */
 	inode = sysfs_get_inode(sd);
-- 
cgit v1.2.3


From 456ef1553cb2b06729d64c1d1f0f2bda34e9b201 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 16 Jan 2008 12:10:53 +0900
Subject: sysfs: fix bugs in sysfs_rename/move_dir()

sysfs_rename/move_dir() have the following bugs.

 - On dentry lookup failure, kfree() is called on ERR_PTR() value.
 - sysfs_move_dir() has an extra dput() on success path.

Fix them.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sysfs/dir.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b197016bbfbe..f281cc6584b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -783,6 +783,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
+		old_dentry = NULL;
 		goto out;
 	}
 
@@ -850,6 +851,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
+		old_dentry = NULL;
 		goto out;
 	}
 	old_parent = old_dentry->d_parent;
@@ -857,6 +859,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 	new_parent = sysfs_get_dentry(new_parent_sd);
 	if (IS_ERR(new_parent)) {
 		error = PTR_ERR(new_parent);
+		new_parent = NULL;
 		goto out;
 	}
 
@@ -880,7 +883,6 @@ again:
 	error = 0;
 	d_add(new_dentry, NULL);
 	d_move(old_dentry, new_dentry);
-	dput(new_dentry);
 
 	/* Remove from old parent's list and insert into new parent's list. */
 	sysfs_unlink_sibling(sd);
-- 
cgit v1.2.3


From 46a39c1cd5d2f804b27e9a4be3fb1b510dda9570 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 17 Jan 2008 15:21:09 -0800
Subject: hfs: fix coverity-found null deref

Fix potential null deref introduced by commit
cf0594625083111ae522496dc1c256f7476939c2
http://bugzilla.kernel.org/show_bug.cgi?id=9748

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Reported-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfs/btree.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 31284c77bba8..110dd3515dc8 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -61,7 +61,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	mapping = tree->inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-		goto free_tree;
+		goto free_inode;
 
 	/* Load the header */
 	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -99,11 +99,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	page_cache_release(page);
 	return tree;
 
- fail_page:
+fail_page:
 	page_cache_release(page);
- free_tree:
+free_inode:
 	tree->inode->i_mapping->a_ops = &hfs_aops;
 	iput(tree->inode);
+free_tree:
 	kfree(tree);
 	return NULL;
 }
-- 
cgit v1.2.3


From f63dcda197bd71c6565c2121bf70e3d371539f90 Mon Sep 17 00:00:00 2001
From: Jonas Bonn <jonas.bonn@gmail.com>
Date: Thu, 17 Jan 2008 15:21:13 -0800
Subject: jbd: do not try lock_acquire after handle made invalid

This likely fixes the oops in __lock_acquire reported as:

http://www.kerneloops.org/raw.php?rawid=2753&msgid=
http://www.kerneloops.org/raw.php?rawid=2749&msgid=

In these reported oopses, start_this_handle is returning -EROFS.

Signed-off-by: Jonas Bonn <jonas.bonn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/transaction.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7028cc..038ed7436199 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -288,10 +288,12 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		jbd_free_handle(handle);
 		current->journal_info = NULL;
 		handle = ERR_PTR(err);
+		goto out;
 	}
 
 	lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
 
+out:
 	return handle;
 }
 
-- 
cgit v1.2.3


From ed2b91701d97047fa9970645e43d5e551e261adb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 20 Jan 2008 00:30:29 +0000
Subject: [CIFS] Do not log path names in lookup errors

Andi Kleen noticed that we were logging access denied errors (which is
noisy in the dmesg log, and not needed to be logged) and that we were
logging path names on that and other errors (e.g. EIO) which we should
not be doing.

CC: Andi Kleen <ak@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 37dc97af1487..699ec1198409 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -517,12 +517,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		d_add(direntry, NULL);
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
-	} else {
-		cERROR(1, ("Error 0x%x on cifs_get_inode_info in lookup of %s",
-			   rc, full_path));
-		/* BB special case check for Access Denied - watch security
-		exposure of returning dir info implicitly via different rc
-		if file exists or not but no access BB */
+	} else if (rc != -EACCES) {
+		cERROR(1, ("Unexpected lookup error %d", rc));
+		/* We special case check for Access Denied - since that
+		is a common return code */
 	}
 
 	kfree(full_path);
-- 
cgit v1.2.3


From 889c94a14e38e749c8060f597ee7825ea0764229 Mon Sep 17 00:00:00 2001
From: Johann Felix Soden <johfel@users.sourceforge.net>
Date: Sun, 20 Jan 2008 14:41:18 +0100
Subject: Fix file references in documentation and Kconfig

Fix typo in arch/powerpc/boot/flatdevtree_env.h.
There is no Documentation/networking/ixgbe.txt.

README.cycladesZ is now in Documentation/.
wavelan.p.h is now in drivers/net/wireless/.
HFS.txt is now Documentation/filesystems/hfs.txt.
OSS-files are now in sound/oss/.

Signed-off-by: Johann Felix Soden <johfel@users.sourceforge.net>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 487236c65837..781b47d2f9f2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1112,8 +1112,8 @@ config HFS_FS
 	help
 	  If you say Y here, you will be able to mount Macintosh-formatted
 	  floppy disks and hard drive partitions with full read-write access.
-	  Please read <file:fs/hfs/HFS.txt> to learn about the available mount
-	  options.
+	  Please read <file:Documentation/filesystems/hfs.txt> to learn about
+	  the available mount options.
 
 	  To compile this file system support as a module, choose M here: the
 	  module will be called hfs.
-- 
cgit v1.2.3


From 872e2be7c4056496c2871bd9b0f2fae6c374fe47 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Tue, 22 Jan 2008 18:29:20 -0800
Subject: [SPARC]: Constify function pointer tables.

Signed-off-by: Jan Engelhardt <jengelh@computergmbh.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/openpromfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d88173840082..6b7ff1618945 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -131,7 +131,7 @@ static void property_stop(struct seq_file *f, void *v)
 	/* Nothing to do */
 }
 
-static struct seq_operations property_op = {
+static const struct seq_operations property_op = {
 	.start		= property_start,
 	.next		= property_next,
 	.stop		= property_stop,
-- 
cgit v1.2.3


From 11f24fbdf511cf588c3a18e3208ee02d85db0020 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 2 Jan 2008 18:44:05 -0600
Subject: [SCSI] sysfs: fix the sysfs_add_file_to_group interfaces

I can't see a reason why these shouldn't work on every group.  However,
they only seem to work on named groups.  This patch allows the group
functions to work on anonymous groups (those with NULL names).

Acked-by: Tejun Heo <htejun@gmail.com>
Acked-by: Kay Sievers <kay.sievers@vrfy.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/sysfs/file.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4045bdcc4b33..b834f1709f9f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -568,7 +568,11 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 	int error;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, group);
+	if (group)
+		dir_sd = sysfs_get_dirent(kobj->sd, group);
+	else
+		dir_sd = sysfs_get(kobj->sd);
+
 	if (!dir_sd)
 		return -ENOENT;
 
@@ -656,7 +660,10 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 {
 	struct sysfs_dirent *dir_sd;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, group);
+	if (group)
+		dir_sd = sysfs_get_dirent(kobj->sd, group);
+	else
+		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
 		sysfs_hash_and_remove(dir_sd, attr->name);
 		sysfs_put(dir_sd);
-- 
cgit v1.2.3


From d4acd722b7bb5f48b9fc3848e8c2a845b100d84f Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 31 Oct 2007 09:38:04 -0500
Subject: [SCSI] sysfs: add filter function to groups

This patch allows the various users of attribute_groups to selectively
allow the appearance of group attributes.  The primary consumer of
this will be the transport classes in which we currently have
elaborate attribute selection algorithms to do this same thing.

Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/sysfs/group.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d1972374655a..0871c3dadce1 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -16,25 +16,31 @@
 #include "sysfs.h"
 
 
-static void remove_files(struct sysfs_dirent *dir_sd,
+static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
+	int i;
 
-	for (attr = grp->attrs; *attr; attr++)
-		sysfs_hash_and_remove(dir_sd, (*attr)->name);
+	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
+		if (!grp->is_visible ||
+		    grp->is_visible(kobj, *attr, i))
+			sysfs_hash_and_remove(dir_sd, (*attr)->name);
 }
 
-static int create_files(struct sysfs_dirent *dir_sd,
+static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
-	int error = 0;
+	int error = 0, i;
 
-	for (attr = grp->attrs; *attr && !error; attr++)
-		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+	for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++)
+		if (!grp->is_visible ||
+		    grp->is_visible(kobj, *attr, i))
+			error |=
+				sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
 	if (error)
-		remove_files(dir_sd, grp);
+		remove_files(dir_sd, kobj, grp);
 	return error;
 }
 
@@ -54,7 +60,7 @@ int sysfs_create_group(struct kobject * kobj,
 	} else
 		sd = kobj->sd;
 	sysfs_get(sd);
-	error = create_files(sd, grp);
+	error = create_files(sd, kobj, grp);
 	if (error) {
 		if (grp->name)
 			sysfs_remove_subdir(sd);
@@ -75,7 +81,7 @@ void sysfs_remove_group(struct kobject * kobj,
 	} else
 		sd = sysfs_get(dir_sd);
 
-	remove_files(sd, grp);
+	remove_files(sd, kobj, grp);
 	if (grp->name)
 		sysfs_remove_subdir(sd);
 
-- 
cgit v1.2.3


From 5c5e32ceeb6b64496a1842d5d99e4ac8d20166c4 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 24 Jan 2008 16:13:21 -0600
Subject: mount options: fix jfs

Add iocharset= and errors= options to /proc/mounts for jfs
filesystems.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 314bb4ff1ba8..70a14001c98f 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -598,6 +598,12 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",umask=%03o", sbi->umask);
 	if (sbi->flag & JFS_NOINTEGRITY)
 		seq_puts(seq, ",nointegrity");
+	if (sbi->nls_tab)
+		seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
+	if (sbi->flag & JFS_ERR_CONTINUE)
+		seq_printf(seq, ",errors=continue");
+	if (sbi->flag & JFS_ERR_PANIC)
+		seq_printf(seq, ",errors=panic");
 
 #ifdef CONFIG_QUOTA
 	if (sbi->flag & JFS_USRQUOTA)
-- 
cgit v1.2.3


From c43e259cc756ece387faae849af0058b56d78466 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Sat, 12 Jan 2008 22:05:48 +1100
Subject: security: call security_file_permission from rw_verify_area

All instances of rw_verify_area() are followed by a call to
security_file_permission(), so just call the latter from the former.

Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/compat.c     |  4 ----
 fs/read_write.c | 63 ++++++++++++++++++++++-----------------------------------
 fs/splice.c     |  8 --------
 3 files changed, 24 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 15078ce4c04a..5216c3fd7517 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1104,10 +1104,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	if (ret < 0)
 		goto out;
 
-	ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
-	if (ret)
-		goto out;
-
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94cc722e..c4d3d17923f1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -197,25 +197,27 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 {
 	struct inode *inode;
 	loff_t pos;
+	int retval = -EINVAL;
 
 	inode = file->f_path.dentry->d_inode;
 	if (unlikely((ssize_t) count < 0))
-		goto Einval;
+		return retval;
 	pos = *ppos;
 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-		goto Einval;
+		return retval;
 
 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
-		int retval = locks_mandatory_area(
+		retval = locks_mandatory_area(
 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 			inode, file, pos, count);
 		if (retval < 0)
 			return retval;
 	}
+	retval = security_file_permission(file,
+				read_write == READ ? MAY_READ : MAY_WRITE);
+	if (retval)
+		return retval;
 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
-
-Einval:
-	return -EINVAL;
 }
 
 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
@@ -267,18 +269,15 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 	ret = rw_verify_area(READ, file, pos, count);
 	if (ret >= 0) {
 		count = ret;
-		ret = security_file_permission (file, MAY_READ);
-		if (!ret) {
-			if (file->f_op->read)
-				ret = file->f_op->read(file, buf, count, pos);
-			else
-				ret = do_sync_read(file, buf, count, pos);
-			if (ret > 0) {
-				fsnotify_access(file->f_path.dentry);
-				add_rchar(current, ret);
-			}
-			inc_syscr(current);
+		if (file->f_op->read)
+			ret = file->f_op->read(file, buf, count, pos);
+		else
+			ret = do_sync_read(file, buf, count, pos);
+		if (ret > 0) {
+			fsnotify_access(file->f_path.dentry);
+			add_rchar(current, ret);
 		}
+		inc_syscr(current);
 	}
 
 	return ret;
@@ -325,18 +324,15 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 	ret = rw_verify_area(WRITE, file, pos, count);
 	if (ret >= 0) {
 		count = ret;
-		ret = security_file_permission (file, MAY_WRITE);
-		if (!ret) {
-			if (file->f_op->write)
-				ret = file->f_op->write(file, buf, count, pos);
-			else
-				ret = do_sync_write(file, buf, count, pos);
-			if (ret > 0) {
-				fsnotify_modify(file->f_path.dentry);
-				add_wchar(current, ret);
-			}
-			inc_syscw(current);
+		if (file->f_op->write)
+			ret = file->f_op->write(file, buf, count, pos);
+		else
+			ret = do_sync_write(file, buf, count, pos);
+		if (ret > 0) {
+			fsnotify_modify(file->f_path.dentry);
+			add_wchar(current, ret);
 		}
+		inc_syscw(current);
 	}
 
 	return ret;
@@ -603,9 +599,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	ret = rw_verify_area(type, file, pos, tot_len);
 	if (ret < 0)
 		goto out;
-	ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
-	if (ret)
-		goto out;
 
 	fnv = NULL;
 	if (type == READ) {
@@ -737,10 +730,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		goto fput_in;
 	count = retval;
 
-	retval = security_file_permission (in_file, MAY_READ);
-	if (retval)
-		goto fput_in;
-
 	/*
 	 * Get output file, and verify that it is ok..
 	 */
@@ -759,10 +748,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		goto fput_out;
 	count = retval;
 
-	retval = security_file_permission (out_file, MAY_WRITE);
-	if (retval)
-		goto fput_out;
-
 	if (!max)
 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 
diff --git a/fs/splice.c b/fs/splice.c
index 6bdcb6107bc3..56b802bfbfa4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -908,10 +908,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	ret = security_file_permission(out, MAY_WRITE);
-	if (unlikely(ret < 0))
-		return ret;
-
 	return out->f_op->splice_write(pipe, out, ppos, len, flags);
 }
 
@@ -934,10 +930,6 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	ret = security_file_permission(in, MAY_READ);
-	if (unlikely(ret < 0))
-		return ret;
-
 	return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
 
-- 
cgit v1.2.3


From 6d5ae0deb1641bf615eafd8fef64218e10cb2fd0 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <niallain@gmail.com>
Date: Fri, 25 Jan 2008 03:28:31 +0000
Subject: [CIFS] DFS support: provide shrinkable mounts

Signed-off-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Makefile       |   2 +-
 fs/cifs/cifs_dfs_ref.c | 375 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifsfs.c       |   4 +
 fs/cifs/cifsfs.h       |   4 +
 fs/cifs/cifsproto.h    |   3 +
 5 files changed, 387 insertions(+), 1 deletion(-)
 create mode 100644 fs/cifs/cifs_dfs_ref.c

(limited to 'fs')

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 09898b8dc69b..6ba43fb346fb 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -10,4 +10,4 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
new file mode 100644
index 000000000000..15e31f8435ba
--- /dev/null
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -0,0 +1,375 @@
+/*
+ *   Contains the CIFS DFS referral mounting routines used for handling
+ *   traversal via DFS junction point
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/vfs.h>
+#include <linux/fs.h>
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifsfs.h"
+#include "dns_resolve.h"
+#include "cifs_debug.h"
+
+LIST_HEAD(cifs_dfs_automount_list);
+
+/* DFS functions.  dfs_shrink_umount_helper() runs the expiry pass twice over
+ * the DFS automount list, then shrinks the submounts of @vfsmnt against it.
+ * NOTE(review): the double expiry pass looks deliberate - confirm. */
+
+void dfs_shrink_umount_helper(struct vfsmount *vfsmnt)
+{
+	mark_mounts_for_expiry(&cifs_dfs_automount_list);
+	mark_mounts_for_expiry(&cifs_dfs_automount_list);
+	shrink_submounts(vfsmnt, &cifs_dfs_automount_list);
+}
+
+/**
+ * cifs_get_share_name	-	extracts share name from UNC
+ * @node_name:	pointer to UNC string
+ *
+ * Extracts the share name from a full UNC, i.e. strips the trailing path that
+ * is not part of the share name and fixes up a missing '\' at the beginning
+ * of the DFS node referral if necessary.
+ * Returns pointer to share name on success; NULL if the allocation fails or
+ * no '\' is found after the server name or after the share name.
+ * Caller is responsible for freeing returned string.
+ */
+static char *cifs_get_share_name(const char *node_name)
+{
+	int len;
+	char *UNC;
+	char *pSep;
+
+	len = strlen(node_name);
+	UNC = kmalloc(len+2 /* terminating NUL plus a '\' if one is missing */,
+			 GFP_KERNEL);
+	if (!UNC)
+		return NULL;
+
+	/* copy node_name, prepending '\' when its second char is not '\' */
+	if (node_name[1] != '\\') {
+		UNC[0] = '\\';
+		strncpy(UNC+1, node_name, len);
+		len++;
+		UNC[len] = 0;
+	} else {
+		strncpy(UNC, node_name, len);
+		UNC[len] = 0;
+	}
+
+	/* find server name end */
+	pSep = memchr(UNC+2, '\\', len-2);
+	if (!pSep) {
+		cERROR(1, ("%s: no server name end in node name: %s",
+			__FUNCTION__, node_name));
+		kfree(UNC);
+		return NULL;
+	}
+
+	/* find sharename end */
+	pSep++;
+	pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
+	if (!pSep) {
+		cERROR(1, ("%s:2 cant find share name in node name: %s",
+			__FUNCTION__, node_name));
+		kfree(UNC);
+		return NULL;
+	}
+	/* cut the string at the '\' that ends the share name;
+	 * UNC now holds only the part up to the share name */
+	*pSep = 0;
+
+	return UNC;
+}
+
+
+/**
+ * compose_mount_options	-	creates mount options for referral
+ * @sb_mountdata:	parent/root DFS mount options (template)
+ * @ref_unc:		referral server UNC
+ * @devname:		pointer for saving device name
+ *
+ * creates mount options for submount based on template options sb_mountdata
+ * and replacing unc,ip,prefixpath options with ones we've got from ref_unc.
+ *
+ * Returns: pointer to new mount options or ERR_PTR (then *@devname is NULL).
+ * On success the caller must kfree() both the returned string and *@devname.
+ */
+char *compose_mount_options(const char *sb_mountdata, const char *ref_unc,
+				char **devname)
+{
+	int rc;
+	char *mountdata;
+	int md_len;
+	char *tkn_e;
+	char *srvIP = NULL;
+	char sep = ',';
+	int off, noff;
+
+	*devname = NULL;
+	if (sb_mountdata == NULL)
+		return ERR_PTR(-EINVAL);
+
+	/* NULL here means a malformed UNC or a failed allocation */
+	*devname = cifs_get_share_name(ref_unc);
+	if (*devname == NULL)
+		return ERR_PTR(-EINVAL);
+	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
+	if (rc != 0) {
+		cERROR(1, ("%s: Failed to resolve server part of %s to IP",
+			  __FUNCTION__, *devname));
+		mountdata = ERR_PTR(rc);
+		goto compose_mount_options_out;
+	}
+	/* template + ",ip=" + IP + ",unc=" + share + ",prefixpath=" + path;
+	 * share (with a possibly added '\') and path never exceed ref_unc + 1 */
+	md_len = strlen(sb_mountdata) + 4 + strlen(srvIP) + 5 + 12 +
+		 strlen(ref_unc) + 1;
+	mountdata = kzalloc(md_len+1, GFP_KERNEL);
+	if (mountdata == NULL) {
+		mountdata = ERR_PTR(-ENOMEM);
+		goto compose_mount_options_out;
+	}
+
+	/* copy all options except unc,ip,prefixpath */
+	off = 0;
+	if (strncmp(sb_mountdata, "sep=", 4) == 0) {
+			sep = sb_mountdata[4];
+			strncpy(mountdata, sb_mountdata, 5);
+			off += 5;
+	}
+	while ((tkn_e = strchr(sb_mountdata+off, sep))) {
+		noff = (tkn_e - (sb_mountdata+off)) + 1;
+		if (strnicmp(sb_mountdata+off, "unc=", 4) != 0 &&
+		    strnicmp(sb_mountdata+off, "ip=", 3) != 0 &&
+		    strnicmp(sb_mountdata+off, "prefixpath=", 11) != 0)
+			strncat(mountdata, sb_mountdata+off, noff);
+		off += noff;
+	}
+	/* the last option has no trailing separator and is always kept */
+	strcat(mountdata, sb_mountdata+off);
+
+	/* copy new IP and ref share name */
+	strcat(mountdata, ",ip=");
+	strcat(mountdata, srvIP);
+	strcat(mountdata, ",unc=");
+	strcat(mountdata, *devname);
+
+	/* find & copy prefixpath */
+	tkn_e = strchr(ref_unc+2, '\\');
+	if (tkn_e) {
+		tkn_e = strchr(tkn_e+1, '\\');
+		if (tkn_e) {
+			strcat(mountdata, ",prefixpath=");
+			strcat(mountdata, tkn_e);
+		}
+	}
+
+compose_mount_options_out:
+	if (IS_ERR(mountdata)) {
+		/* callers do not free *devname on error, so do it here */
+		kfree(*devname);
+		*devname = NULL;
+	}
+	kfree(srvIP);
+	return mountdata;
+}
+
+/* Mount the share named by @ref_unc, reusing the parent's mount options. */
+struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
+		struct dentry *dentry, char *ref_unc)
+{
+	struct cifs_sb_info *cifs_sb;
+	struct vfsmount *mnt;
+	char *mountdata;
+	char *devname = NULL;	/* stays NULL if no share name was allocated */
+
+	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+	mountdata = compose_mount_options(cifs_sb->mountdata,
+						ref_unc, &devname);
+	if (IS_ERR(mountdata)) {
+		kfree(devname);	/* may already be allocated: do not leak it */
+		return (struct vfsmount *)mountdata;
+	}
+
+	mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+	kfree(mountdata);
+	kfree(devname);
+	return mnt;
+}
+/* Path to query for @dentry (tree name prepended on DFS shares); NULL on failure, caller kfree()s. */
+static char *build_full_dfs_path_from_dentry(struct dentry *dentry)
+{
+	char *full_path = NULL;
+	char *search_path;
+	char *tmp_path;
+	size_t l_max_len;
+	struct cifs_sb_info *cifs_sb;
+
+	if (dentry->d_inode == NULL)
+		return NULL;
+
+	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+
+	if (cifs_sb->tcon == NULL)
+		return NULL;
+
+	search_path = build_path_from_dentry(dentry);
+	if (search_path == NULL)
+		return NULL;
+
+	if (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS) {
+		/* DFS needs the full path name: tree name followed by the path */
+		l_max_len = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE+1) +
+					strnlen(search_path, MAX_PATHCONF) + 1;
+		tmp_path = kmalloc(l_max_len, GFP_KERNEL);
+		if (tmp_path == NULL) {
+			kfree(search_path);
+			return NULL;
+		}
+		strncpy(tmp_path, cifs_sb->tcon->treeName, l_max_len);
+		strcat(tmp_path, search_path);
+		tmp_path[l_max_len-1] = 0;
+		full_path = tmp_path;
+		kfree(search_path);
+	} else {
+		full_path = search_path;
+	}
+	return full_path;
+}
+/* Add @newmnt at @nd; returns do_add_mount()'s result with -EBUSY turned into 0. */
+static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
+				struct list_head *mntlist)
+{
+	/* adapted from the afs code */
+	int err;
+
+	mntget(newmnt);
+	err = do_add_mount(newmnt, nd, nd->mnt->mnt_flags, mntlist);
+	switch (err) {
+	case 0:
+		dput(nd->dentry);
+		mntput(nd->mnt);
+		nd->mnt = newmnt;
+		nd->dentry = dget(newmnt->mnt_root);
+		break;
+	case -EBUSY:
+		/* someone else made a mount here whilst we were busy */
+		while (d_mountpoint(nd->dentry) &&
+		       follow_down(&nd->mnt, &nd->dentry))
+			;
+		err = 0;	/* fall through: drop the reference taken above */
+	default:
+		mntput(newmnt);
+		break;
+	}
+	return err;
+}
+/* Log the fields of one DFS referral through cFYI(). */
+void dump_referral(const struct dfs_info3_param *ref)
+{
+	cFYI(1, ("DFS: ref path: %s", ref->path_name));
+	cFYI(1, ("DFS: node path: %s", ref->node_name));
+	cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type));
+	cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
+				ref->PathConsumed));
+}
+
+/* ->follow_link: fetch the DFS referrals for @dentry and mount the first usable one. */
+static void*
+cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dfs_info3_param *referrals = NULL;
+	unsigned int num_referrals = 0;
+	struct cifs_sb_info *cifs_sb;
+	struct cifsSesInfo *ses;
+	char *full_path = NULL;
+	int xid, i;
+	int rc = 0;
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+
+	cFYI(1, ("in %s", __FUNCTION__));
+	BUG_ON(IS_ROOT(dentry));
+
+	xid = GetXid();
+
+	dput(nd->dentry);
+	nd->dentry = dget(dentry);
+
+	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+	ses = cifs_sb->tcon->ses;
+
+	if (!ses) {
+		rc = -EINVAL;
+		goto out_err;
+	}
+
+	full_path = build_full_dfs_path_from_dentry(dentry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto out_err;
+	}
+
+	rc = get_dfs_path(xid, ses , full_path, cifs_sb->local_nls,
+		&num_referrals, &referrals,
+		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	for (i = 0; i < num_referrals; i++) {
+		dump_referral(referrals+i);
+		/* connect to a storage node */
+		if (referrals[i].flags & DFSREF_STORAGE_SERVER) {
+			int len;
+			len = strlen(referrals[i].node_name);
+			if (len < 2) {
+				cERROR(1, ("%s: Net Address path too short: %s",
+					__FUNCTION__, referrals[i].node_name));
+				rc = -EINVAL;
+				goto out_err;
+			}
+			mnt = cifs_dfs_do_refmount(nd->mnt, nd->dentry,
+						referrals[i].node_name);
+			cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
+					 __FUNCTION__,
+					referrals[i].node_name, mnt));
+
+			/* stop at the first referral that mounted successfully */
+			if (!IS_ERR(mnt))
+				break;
+		}
+	}
+
+	/* the loop may end without a valid submount; mnt is then an ERR_PTR */
+	rc = PTR_ERR(mnt);
+	if (IS_ERR(mnt))
+		goto out_err;
+
+	nd->mnt->mnt_flags |= MNT_SHRINKABLE;
+	rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
+
+out:
+	FreeXid(xid);
+	free_dfs_info_array(referrals, num_referrals);
+	kfree(full_path);
+	cFYI(1, ("leaving %s" , __FUNCTION__));
+	return ERR_PTR(rc);
+out_err:
+	path_release(nd);
+	goto out;
+}
+/* inode operations whose ->follow_link is cifs_dfs_follow_mountpoint() */
+struct inode_operations cifs_dfs_referral_inode_operations = {
+	.follow_link = cifs_dfs_follow_mountpoint,
+};
+
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 93e107883a61..e9f4ec701092 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -471,6 +471,10 @@ static void cifs_umount_begin(struct vfsmount *vfsmnt, int flags)
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	dfs_shrink_umount_helper(vfsmnt);
+#endif /* CONFIG_CIFS_DFS_UPCALL */
+
 	if (!(flags & MNT_FORCE))
 		return;
 	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2e68126d07eb..195b14de5567 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,10 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
 
 extern const struct inode_operations cifs_file_inode_ops;
 extern const struct inode_operations cifs_symlink_inode_ops;
+extern struct list_head cifs_dfs_automount_list;
+extern struct inode_operations cifs_dfs_referral_inode_operations;
+
+
 
 /* Functions related to files and directories */
 extern const struct file_operations cifs_file_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7093cb4b0212..aaaf748f6a26 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,6 +102,9 @@ extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+extern void dfs_shrink_umount_helper(struct vfsmount *vfsmnt);
+#endif
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
 
-- 
cgit v1.2.3


From 9fd5b1c906a9b4b0efb24cb2b4d20c678ff26122 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Tue, 8 Jan 2008 18:11:24 +0100
Subject: sysfs: Fix a copy-n-paste typo in comment

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index f281cc6584b0..4948d9bc405d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -440,7 +440,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 /**
  *	sysfs_remove_one - remove sysfs_dirent from parent
  *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be added
+ *	@sd: sysfs_dirent to be removed
  *
  *	Mark @sd removed and drop nlink of parent inode if @sd is a
  *	directory.  @sd is unlinked from the children list.
-- 
cgit v1.2.3


From 62ca8792560e5bd7dc09f54ed3523a7864f416c7 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 25 Sep 2007 02:03:03 +0200
Subject: coda: convert struct class_device to struct device

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: Tony Jones <tonyj@suse.de>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/coda/psdev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index dcc6aead70f5..e3eb3556622b 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,8 @@ static int init_coda_psdev(void)
 		goto out_chrdev;
 	}		
 	for (i = 0; i < MAX_CODADEVS; i++)
-		class_device_create(coda_psdev_class, NULL,
-				MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
+		device_create(coda_psdev_class, NULL,
+			      MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
 	coda_sysctl_init();
 	goto out;
 
@@ -405,7 +405,7 @@ static int __init init_coda(void)
 	return 0;
 out:
 	for (i = 0; i < MAX_CODADEVS; i++)
-		class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+		device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
 	class_destroy(coda_psdev_class);
 	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
 	coda_sysctl_clean();
@@ -424,7 +424,7 @@ static void __exit exit_coda(void)
                 printk("coda: failed to unregister filesystem\n");
         }
 	for (i = 0; i < MAX_CODADEVS; i++)
-		class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
+		device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
 	class_destroy(coda_psdev_class);
 	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
 	coda_sysctl_clean();
-- 
cgit v1.2.3


From 30a468b1c1b9911ae515ff8972ee10c50cca3021 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 15 Oct 2007 15:01:24 -0700
Subject: ecryptfs: clean up attribute mess

It isn't that hard to add simple kset attributes, so don't go through
all the gyrations of creating your own object type and show and store
functions.  Just use the functions that are already present.  This makes
things much simpler.

Note, the version_str string violates the "one value per file" rule for
sysfs.  I suggest changing this now (individual files per type supported
is one suggested way.)


Cc: Michael A. Halcrow <mahalcro@us.ibm.com>
Cc: Michael C. Thompson <mcthomps@us.ibm.com>
Cc: Tyler Hicks <tyhicks@ou.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ecryptfs/main.c | 85 ++++++++++++------------------------------------------
 1 file changed, 18 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e5580bcb923a..f9f32472c505 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -734,58 +734,14 @@ static int ecryptfs_init_kmem_caches(void)
 	return 0;
 }
 
-struct ecryptfs_obj {
-	char *name;
-	struct list_head slot_list;
-	struct kobject kobj;
-};
-
-struct ecryptfs_attribute {
-	struct attribute attr;
-	ssize_t(*show) (struct ecryptfs_obj *, char *);
-	ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
-};
-
-static ssize_t
-ecryptfs_attr_store(struct kobject *kobj,
-		    struct attribute *attr, const char *buf, size_t len)
-{
-	struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
-						kobj);
-	struct ecryptfs_attribute *attribute =
-		container_of(attr, struct ecryptfs_attribute, attr);
-
-	return (attribute->store ? attribute->store(obj, buf, len) : 0);
-}
-
-static ssize_t
-ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
-	struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
-						kobj);
-	struct ecryptfs_attribute *attribute =
-		container_of(attr, struct ecryptfs_attribute, attr);
-
-	return (attribute->show ? attribute->show(obj, buf) : 0);
-}
-
-static struct sysfs_ops ecryptfs_sysfs_ops = {
-	.show = ecryptfs_attr_show,
-	.store = ecryptfs_attr_store
-};
+static decl_subsys(ecryptfs, NULL, NULL);
 
-static struct kobj_type ecryptfs_ktype = {
-	.sysfs_ops = &ecryptfs_sysfs_ops
-};
-
-static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
-
-static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
+static ssize_t version_show(struct kset *kset, char *buff)
 {
 	return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
 }
 
-static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
+static struct subsys_attribute version_attr = __ATTR_RO(version);
 
 static struct ecryptfs_version_str_map_elem {
 	u32 flag;
@@ -799,7 +755,7 @@ static struct ecryptfs_version_str_map_elem {
 	{ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
 };
 
-static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
+static ssize_t version_str_show(struct kset *kset, char *buff)
 {
 	int i;
 	int remaining = PAGE_SIZE;
@@ -826,7 +782,17 @@ out:
 	return total_written;
 }
 
-static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
+static struct subsys_attribute version_attr_str = __ATTR_RO(version_str);
+
+static struct attribute *attributes[] = {
+	&version_attr.attr,
+	&version_attr_str.attr,
+	NULL,
+};
+
+static struct attribute_group attr_group = {
+	.attrs = attributes,
+};
 
 static int do_sysfs_registration(void)
 {
@@ -838,23 +804,11 @@ static int do_sysfs_registration(void)
 		       "Unable to register ecryptfs sysfs subsystem\n");
 		goto out;
 	}
-	rc = sysfs_create_file(&ecryptfs_subsys.kobj,
-			       &sysfs_attr_version.attr);
+	rc = sysfs_create_group(&ecryptfs_subsys.kobj, &attr_group);
 	if (rc) {
 		printk(KERN_ERR
-		       "Unable to create ecryptfs version attribute\n");
+		       "Unable to create ecryptfs version attributes\n");
 		subsystem_unregister(&ecryptfs_subsys);
-		goto out;
-	}
-	rc = sysfs_create_file(&ecryptfs_subsys.kobj,
-			       &sysfs_attr_version_str.attr);
-	if (rc) {
-		printk(KERN_ERR
-		       "Unable to create ecryptfs version_str attribute\n");
-		sysfs_remove_file(&ecryptfs_subsys.kobj,
-				  &sysfs_attr_version.attr);
-		subsystem_unregister(&ecryptfs_subsys);
-		goto out;
 	}
 out:
 	return rc;
@@ -862,10 +816,7 @@ out:
 
 static void do_sysfs_unregistration(void)
 {
-	sysfs_remove_file(&ecryptfs_subsys.kobj,
-			  &sysfs_attr_version.attr);
-	sysfs_remove_file(&ecryptfs_subsys.kobj,
-			  &sysfs_attr_version_str.attr);
+	sysfs_remove_group(&ecryptfs_subsys.kobj, &attr_group);
 	subsystem_unregister(&ecryptfs_subsys);
 }
 
-- 
cgit v1.2.3


From 2f90a851800e88436873c8d27238cf219b9ef48e Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 1 Nov 2007 20:20:52 +0100
Subject: sysfs: create optimal relative symlink targets

Instead of walking from the source down to the root of sysfs, and back
to the target, we stop at the first directory the source and the target
share.

This link:
  /devices/pci0000:00/0000:00:1d.7/usb1/1-0:1.0/ep_81

pointed to:
  ../../../../../devices/pci0000:00/0000:00:1d.0/usb2/2-0:1.0/usb_endpoint/usbdev2.1_ep81

now it just points to:
  usb_endpoint/usbdev1.1_ep81

Thanks to Denis Cheng for bringing this up, and sending the initial patch.

CC: Denis Cheng <crquan@gmail.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/symlink.c | 88 ++++++++++++++++++++++++++----------------------------
 1 file changed, 42 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3eac20c63c41..5f66c4466151 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,39 +19,6 @@
 
 #include "sysfs.h"
 
-static int object_depth(struct sysfs_dirent *sd)
-{
-	int depth = 0;
-
-	for (; sd->s_parent; sd = sd->s_parent)
-		depth++;
-
-	return depth;
-}
-
-static int object_path_length(struct sysfs_dirent * sd)
-{
-	int length = 1;
-
-	for (; sd->s_parent; sd = sd->s_parent)
-		length += strlen(sd->s_name) + 1;
-
-	return length;
-}
-
-static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length)
-{
-	--length;
-	for (; sd->s_parent; sd = sd->s_parent) {
-		int cur = strlen(sd->s_name);
-
-		/* back up enough to print this bus id with '/' */
-		length -= cur;
-		strncpy(buffer + length, sd->s_name, cur);
-		*(buffer + --length) = '/';
-	}
-}
-
 /**
  *	sysfs_create_link - create symlink between two objects.
  *	@kobj:	object whose directory we're creating the link in.
@@ -112,7 +79,6 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 	return error;
 }
 
-
 /**
  *	sysfs_remove_link - remove symlink in object's directory.
  *	@kobj:	object we're acting for.
@@ -124,24 +90,54 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
 	sysfs_hash_and_remove(kobj->sd, name);
 }
 
-static int sysfs_get_target_path(struct sysfs_dirent * parent_sd,
-				 struct sysfs_dirent * target_sd, char *path)
+static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
+				 struct sysfs_dirent *target_sd, char *path)
 {
-	char * s;
-	int depth, size;
+	struct sysfs_dirent *base, *sd;
+	char *s = path;
+	int len = 0;
+
+	/* go up to the root, stop at the base */
+	base = parent_sd;
+	while (base->s_parent) {
+		sd = target_sd->s_parent;
+		while (sd->s_parent && base != sd)
+			sd = sd->s_parent;
+
+		if (base == sd)
+			break;
+
+		strcpy(s, "../");
+		s += 3;
+		base = base->s_parent;
+	}
+
+	/* determine end of target string for reverse fillup */
+	sd = target_sd;
+	while (sd->s_parent && sd != base) {
+		len += strlen(sd->s_name) + 1;
+		sd = sd->s_parent;
+	}
 
-	depth = object_depth(parent_sd);
-	size = object_path_length(target_sd) + depth * 3 - 1;
-	if (size > PATH_MAX)
+	/* check limits */
+	if (len < 2)
+		return -EINVAL;
+	len--;
+	if ((s - path) + len > PATH_MAX)
 		return -ENAMETOOLONG;
 
-	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+	/* reverse fillup of target string from target to base */
+	sd = target_sd;
+	while (sd->s_parent && sd != base) {
+		int slen = strlen(sd->s_name);
 
-	for (s = path; depth--; s += 3)
-		strcpy(s,"../");
+		len -= slen;
+		strncpy(s + len, sd->s_name, slen);
+		if (len)
+			s[--len] = '/';
 
-	fill_object_path(target_sd, path, size);
-	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+		sd = sd->s_parent;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From d7b37889650bb316f5c4ad4b0569ba897120d70d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 21 Nov 2007 14:55:19 -0800
Subject: sysfs: remove SPIN_LOCK_UNLOCKED

SPIN_LOCK_UNLOCKED is deprecated, use DEFINE_SPINLOCK instead

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Tejun Heo <teheo@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4045bdcc4b33..09a0611b3364 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -66,7 +66,7 @@ static struct sysfs_ops subsys_sysfs_ops = {
  * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open
  * is protected by sysfs_open_dirent_lock.
  */
-static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
 
 struct sysfs_open_dirent {
 	atomic_t		refcnt;
-- 
cgit v1.2.3


From 3514faca19a6fdc209734431c509631ea92b094e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 16 Oct 2007 10:11:44 -0600
Subject: kobject: remove struct kobj_type from struct kset

We don't need a "default" ktype for a kset.  We should set this
explicitly every time for each kset.  This change is needed so that we
can make ksets dynamic, and cleans up one of the odd, undocumented
assumptions that the kset/kobject/ktype model has.

This patch is based on a lot of help from Kay Sievers.

Nasty bug in the block code was found by Dave Young
<hidave.darkstar@gmail.com>

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/configfs/mount.c         | 4 ++--
 fs/debugfs/inode.c          | 4 ++--
 fs/dlm/lockspace.c          | 6 ++----
 fs/ecryptfs/main.c          | 4 ++--
 fs/fuse/inode.c             | 8 ++++----
 fs/gfs2/locking/dlm/sysfs.c | 6 ++----
 fs/gfs2/sys.c               | 6 ++----
 fs/namespace.c              | 2 +-
 fs/ocfs2/cluster/masklog.c  | 2 +-
 fs/ocfs2/cluster/sys.c      | 2 +-
 fs/sysfs/file.c             | 4 +---
 11 files changed, 20 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3bf0278ea843..374ddbd6648d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -128,7 +128,7 @@ void configfs_release_fs(void)
 }
 
 
-static decl_subsys(config, NULL, NULL);
+static decl_subsys(config, NULL);
 
 static int __init configfs_init(void)
 {
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	kobj_set_kset_s(&config_subsys, kernel_subsys);
+	config_subsys.kobj.kset = &kernel_subsys;
 	err = subsystem_register(&config_subsys);
 	if (err) {
 		kmem_cache_destroy(configfs_dir_cachep);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6a713b33992f..f7f13516fc1a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -426,13 +426,13 @@ exit:
 }
 EXPORT_SYMBOL_GPL(debugfs_rename);
 
-static decl_subsys(debug, NULL, NULL);
+static decl_subsys(debug, NULL);
 
 static int __init debugfs_init(void)
 {
 	int retval;
 
-	kobj_set_kset_s(&debug_subsys, kernel_subsys);
+	debug_subsys.kobj.kset = &kernel_subsys;
 	retval = subsystem_register(&debug_subsys);
 	if (retval)
 		return retval;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 6353a8384520..18e4a17b9bee 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -166,9 +166,7 @@ static struct kobj_type dlm_ktype = {
 	.release       = lockspace_kobj_release,
 };
 
-static struct kset dlm_kset = {
-	.ktype  = &dlm_ktype,
-};
+static struct kset dlm_kset;
 
 static int kobject_setup(struct dlm_ls *ls)
 {
@@ -228,7 +226,7 @@ int dlm_lockspace_init(void)
 	spin_lock_init(&lslist_lock);
 
 	kobject_set_name(&dlm_kset.kobj, "dlm");
-	kobj_set_kset_s(&dlm_kset, kernel_subsys);
+	dlm_kset.kobj.kset = &kernel_subsys;
 	error = kset_register(&dlm_kset);
 	if (error)
 		printk("dlm_lockspace_init: cannot register kset %d\n", error);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index f9f32472c505..fe2f44fa17cc 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -734,7 +734,7 @@ static int ecryptfs_init_kmem_caches(void)
 	return 0;
 }
 
-static decl_subsys(ecryptfs, NULL, NULL);
+static decl_subsys(ecryptfs, NULL);
 
 static ssize_t version_show(struct kset *kset, char *buff)
 {
@@ -798,6 +798,7 @@ static int do_sysfs_registration(void)
 {
 	int rc;
 
+	ecryptfs_subsys.kobj.kset = &fs_subsys;
 	rc = subsystem_register(&ecryptfs_subsys);
 	if (rc) {
 		printk(KERN_ERR
@@ -845,7 +846,6 @@ static int __init ecryptfs_init(void)
 		printk(KERN_ERR "Failed to register filesystem\n");
 		goto out_free_kmem_caches;
 	}
-	kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
 	rc = do_sysfs_registration();
 	if (rc) {
 		printk(KERN_ERR "sysfs registration failed\n");
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 84f9f7dfdf5b..f5e4182c482e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -744,8 +744,8 @@ static inline void unregister_fuseblk(void)
 }
 #endif
 
-static decl_subsys(fuse, NULL, NULL);
-static decl_subsys(connections, NULL, NULL);
+static decl_subsys(fuse, NULL);
+static decl_subsys(connections, NULL);
 
 static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
 {
@@ -795,12 +795,12 @@ static int fuse_sysfs_init(void)
 {
 	int err;
 
-	kobj_set_kset_s(&fuse_subsys, fs_subsys);
+	fuse_subsys.kobj.kset = &fs_subsys;
 	err = subsystem_register(&fuse_subsys);
 	if (err)
 		goto out_err;
 
-	kobj_set_kset_s(&connections_subsys, fuse_subsys);
+	connections_subsys.kobj.kset = &fuse_subsys;
 	err = subsystem_register(&connections_subsys);
 	if (err)
 		goto out_fuse_unregister;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index ae9e6a25fe2b..93e66b22757f 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -189,9 +189,7 @@ static struct kobj_type gdlm_ktype = {
 	.sysfs_ops     = &gdlm_attr_ops,
 };
 
-static struct kset gdlm_kset = {
-	.ktype  = &gdlm_ktype,
-};
+static struct kset gdlm_kset;
 
 int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
 {
@@ -224,7 +222,7 @@ int gdlm_sysfs_init(void)
 	int error;
 
 	kobject_set_name(&gdlm_kset.kobj, "lock_dlm");
-	kobj_set_kset_s(&gdlm_kset, kernel_subsys);
+	gdlm_kset.kobj.kset = &kernel_subsys;
 	error = kset_register(&gdlm_kset);
 	if (error)
 		printk("lock_dlm: cannot register kset %d\n", error);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 06e0b7768d97..d7fa54443f0c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -221,9 +221,7 @@ static struct kobj_type gfs2_ktype = {
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
-static struct kset gfs2_kset = {
-	.ktype  = &gfs2_ktype,
-};
+static struct kset gfs2_kset;
 
 /*
  * display struct lm_lockstruct fields
@@ -551,7 +549,7 @@ int gfs2_sys_init(void)
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
 	kobject_set_name(&gfs2_kset.kobj, "gfs2");
-	kobj_set_kset_s(&gfs2_kset, fs_subsys);
+	gfs2_kset.kobj.kset = &fs_subsys;
 	return kset_register(&gfs2_kset);
 }
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 06083885b21e..a4a3f70e7e26 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,7 +41,7 @@ static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 
 /* /sys/fs */
-decl_subsys(fs, NULL, NULL);
+decl_subsys(fs, NULL);
 EXPORT_SYMBOL_GPL(fs_subsys);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df945..dead319932b3 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
 	mlog_attr_ptrs[i] = NULL;
 
 	kobject_set_name(&mlog_kset.kobj, "logmask");
-	kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
+	mlog_kset.kobj.kset = o2cb_subsys;
 	return kset_register(&mlog_kset);
 }
 
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd09..880d0138bb0a 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -72,7 +72,7 @@ static struct kobj_type o2cb_subsys_type = {
 };
 
 /* gives us o2cb_subsys */
-static decl_subsys(o2cb, NULL, NULL);
+static decl_subsys(o2cb, NULL);
 
 static ssize_t
 o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 09a0611b3364..387a63662793 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -365,9 +365,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	/* if the kobject has no ktype, then we assume that it is a subsystem
 	 * itself, and use ops for it.
 	 */
-	if (kobj->kset && kobj->kset->ktype)
-		ops = kobj->kset->ktype->sysfs_ops;
-	else if (kobj->ktype)
+	if (kobj->ktype)
 		ops = kobj->ktype->sysfs_ops;
 	else
 		ops = &subsys_sysfs_ops;
-- 
cgit v1.2.3


From 4ff6abff832fbc6cb1d769f6106c841bc2b09f63 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 5 Nov 2007 22:24:43 -0800
Subject: kobject: get rid of kobject_add_dir

kobject_create_and_add is the same as kobject_add_dir, so drop
kobject_add_dir.


Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 722e12e5acc7..69685bb51c62 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -335,7 +335,7 @@ static inline void partition_sysfs_add_subdir(struct hd_struct *p)
 	struct kobject *k;
 
 	k = kobject_get(&p->kobj);
-	p->holder_dir = kobject_add_dir(k, "holders");
+	p->holder_dir = kobject_create_and_add("holders", k);
 	kobject_put(k);
 }
 
@@ -344,8 +344,8 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 	struct kobject *k;
 
 	k = kobject_get(&disk->kobj);
-	disk->holder_dir = kobject_add_dir(k, "holders");
-	disk->slave_dir = kobject_add_dir(k, "slaves");
+	disk->holder_dir = kobject_create_and_add("holders", k);
+	disk->slave_dir = kobject_create_and_add("slaves", k);
 	kobject_put(k);
 }
 
-- 
cgit v1.2.3


From 5c89e17e9c2bc03ed16320967832b33b174e6234 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kobject: convert fuse to use kobject_create

We don't need a kset here, a simple kobject will do just fine, so
dynamically create the kobject and use it.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/fuse/inode.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f5e4182c482e..92118066f1d6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -744,9 +744,6 @@ static inline void unregister_fuseblk(void)
 }
 #endif
 
-static decl_subsys(fuse, NULL);
-static decl_subsys(connections, NULL);
-
 static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
 {
 	struct inode * inode = foo;
@@ -791,32 +788,37 @@ static void fuse_fs_cleanup(void)
 	kmem_cache_destroy(fuse_inode_cachep);
 }
 
+static struct kobject *fuse_kobj;
+static struct kobject *connections_kobj;
+
 static int fuse_sysfs_init(void)
 {
 	int err;
 
-	fuse_subsys.kobj.kset = &fs_subsys;
-	err = subsystem_register(&fuse_subsys);
-	if (err)
+	fuse_kobj = kobject_create_and_add("fuse", &fs_subsys.kobj);
+	if (!fuse_kobj) {
+		err = -ENOMEM;
 		goto out_err;
+	}
 
-	connections_subsys.kobj.kset = &fuse_subsys;
-	err = subsystem_register(&connections_subsys);
-	if (err)
+	connections_kobj = kobject_create_and_add("connections", fuse_kobj);
+	if (!connections_kobj) {
+		err = -ENOMEM;
 		goto out_fuse_unregister;
+	}
 
 	return 0;
 
  out_fuse_unregister:
-	subsystem_unregister(&fuse_subsys);
+	kobject_unregister(fuse_kobj);
  out_err:
 	return err;
 }
 
 static void fuse_sysfs_cleanup(void)
 {
-	subsystem_unregister(&connections_subsys);
-	subsystem_unregister(&fuse_subsys);
+	kobject_unregister(connections_kobj);
+	kobject_unregister(fuse_kobj);
 }
 
 static int __init fuse_init(void)
-- 
cgit v1.2.3


From 191e186bd0589e28496745275157323a6f7902ca Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kobject: convert debugfs to use kobject_create

We don't need a kset here, a simple kobject will do just fine, so
dynamically create the kobject and use it.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/inode.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f7f13516fc1a..667214200b03 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -426,20 +426,19 @@ exit:
 }
 EXPORT_SYMBOL_GPL(debugfs_rename);
 
-static decl_subsys(debug, NULL);
+static struct kobject *debug_kobj;
 
 static int __init debugfs_init(void)
 {
 	int retval;
 
-	debug_subsys.kobj.kset = &kernel_subsys;
-	retval = subsystem_register(&debug_subsys);
-	if (retval)
-		return retval;
+	debug_kobj = kobject_create_and_add("debug", &kernel_subsys.kobj);
+	if (!debug_kobj)
+		return -EINVAL;
 
 	retval = register_filesystem(&debug_fs_type);
 	if (retval)
-		subsystem_unregister(&debug_subsys);
+		kobject_unregister(debug_kobj);
 	return retval;
 }
 
@@ -447,7 +446,7 @@ static void __exit debugfs_exit(void)
 {
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	unregister_filesystem(&debug_fs_type);
-	subsystem_unregister(&debug_subsys);
+	kobject_unregister(debug_kobj);
 }
 
 core_initcall(debugfs_init);
-- 
cgit v1.2.3


From 3794491d0c4b6355c55b0379f003900e57666a97 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kobject: convert configfs to use kobject_create

We don't need a kset here, a simple kobject will do just fine, so
dynamically create the kobject and use it.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/configfs/mount.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 374ddbd6648d..13300466464b 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -128,7 +128,7 @@ void configfs_release_fs(void)
 }
 
 
-static decl_subsys(config, NULL);
+static struct kobject *config_kobj;
 
 static int __init configfs_init(void)
 {
@@ -140,9 +140,8 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	config_subsys.kobj.kset = &kernel_subsys;
-	err = subsystem_register(&config_subsys);
-	if (err) {
+	config_kobj = kobject_create_and_add("config", &kernel_subsys.kobj);
+	if (!config_kobj) {
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
 		goto out;
@@ -151,7 +150,7 @@ static int __init configfs_init(void)
 	err = register_filesystem(&configfs_fs_type);
 	if (err) {
 		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
-		subsystem_unregister(&config_subsys);
+		kobject_unregister(config_kobj);
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
 		goto out;
@@ -160,7 +159,7 @@ static int __init configfs_init(void)
 	err = configfs_inode_init();
 	if (err) {
 		unregister_filesystem(&configfs_fs_type);
-		subsystem_unregister(&config_subsys);
+		kobject_unregister(config_kobj);
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
 	}
@@ -171,7 +170,7 @@ out:
 static void __exit configfs_exit(void)
 {
 	unregister_filesystem(&configfs_fs_type);
-	subsystem_unregister(&config_subsys);
+	kobject_unregister(config_kobj);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
 	configfs_inode_exit();
-- 
cgit v1.2.3


From 917e865df7eb020f20ffc2b4204f282a587df94f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kset: convert ecryptfs to use kset_create

Dynamically create the kset instead of declaring it statically.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mike Halcrow <mhalcrow@us.ibm.com>
Cc: Phillip Hellewell <phillip@hellewell.homeip.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ecryptfs/main.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fe2f44fa17cc..4750d82c3db9 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -734,7 +734,7 @@ static int ecryptfs_init_kmem_caches(void)
 	return 0;
 }
 
-static decl_subsys(ecryptfs, NULL);
+static struct kset *ecryptfs_kset;
 
 static ssize_t version_show(struct kset *kset, char *buff)
 {
@@ -798,18 +798,17 @@ static int do_sysfs_registration(void)
 {
 	int rc;
 
-	ecryptfs_subsys.kobj.kset = &fs_subsys;
-	rc = subsystem_register(&ecryptfs_subsys);
-	if (rc) {
-		printk(KERN_ERR
-		       "Unable to register ecryptfs sysfs subsystem\n");
+	ecryptfs_kset = kset_create_and_add("ecryptfs", NULL, &fs_subsys.kobj);
+	if (!ecryptfs_kset) {
+		printk(KERN_ERR "Unable to create ecryptfs kset\n");
+		rc = -ENOMEM;
 		goto out;
 	}
-	rc = sysfs_create_group(&ecryptfs_subsys.kobj, &attr_group);
+	rc = sysfs_create_group(&ecryptfs_kset->kobj, &attr_group);
 	if (rc) {
 		printk(KERN_ERR
 		       "Unable to create ecryptfs version attributes\n");
-		subsystem_unregister(&ecryptfs_subsys);
+		kset_unregister(ecryptfs_kset);
 	}
 out:
 	return rc;
@@ -817,8 +816,8 @@ out:
 
 static void do_sysfs_unregistration(void)
 {
-	sysfs_remove_group(&ecryptfs_subsys.kobj, &attr_group);
-	subsystem_unregister(&ecryptfs_subsys);
+	sysfs_remove_group(&ecryptfs_kset->kobj, &attr_group);
+	kset_unregister(ecryptfs_kset);
 }
 
 static int __init ecryptfs_init(void)
-- 
cgit v1.2.3


From 00d2666623368ffd39afc875ff8a2eead2a0436c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 14:17:23 -0600
Subject: kobject: convert main fs kobject to use kobject_create

This also renames fs_subsys to fs_kobj to catch all current users with a
build error instead of a build warning which can easily be missed.


Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ecryptfs/main.c |  2 +-
 fs/fuse/inode.c    |  2 +-
 fs/gfs2/sys.c      |  2 +-
 fs/namespace.c     | 11 +++++------
 4 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4750d82c3db9..bdeac3877a84 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -798,7 +798,7 @@ static int do_sysfs_registration(void)
 {
 	int rc;
 
-	ecryptfs_kset = kset_create_and_add("ecryptfs", NULL, &fs_subsys.kobj);
+	ecryptfs_kset = kset_create_and_add("ecryptfs", NULL, fs_kobj);
 	if (!ecryptfs_kset) {
 		printk(KERN_ERR "Unable to create ecryptfs kset\n");
 		rc = -ENOMEM;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 92118066f1d6..e6e23a2ad4b3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -795,7 +795,7 @@ static int fuse_sysfs_init(void)
 {
 	int err;
 
-	fuse_kobj = kobject_create_and_add("fuse", &fs_subsys.kobj);
+	fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
 	if (!fuse_kobj) {
 		err = -ENOMEM;
 		goto out_err;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d7fa54443f0c..a0bdc4a3acf9 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -549,7 +549,7 @@ int gfs2_sys_init(void)
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
 	kobject_set_name(&gfs2_kset.kobj, "gfs2");
-	gfs2_kset.kobj.kset = &fs_subsys;
+	gfs2_kset.kobj.parent = fs_kobj;
 	return kset_register(&gfs2_kset);
 }
 
diff --git a/fs/namespace.c b/fs/namespace.c
index a4a3f70e7e26..61bf376e29e8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,8 +41,8 @@ static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 
 /* /sys/fs */
-decl_subsys(fs, NULL);
-EXPORT_SYMBOL_GPL(fs_subsys);
+struct kobject *fs_kobj;
+EXPORT_SYMBOL_GPL(fs_kobj);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -1861,10 +1861,9 @@ void __init mnt_init(void)
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
 			__FUNCTION__, err);
-	err = subsystem_register(&fs_subsys);
-	if (err)
-		printk(KERN_WARNING "%s: subsystem_register error: %d\n",
-			__FUNCTION__, err);
+	fs_kobj = kobject_create_and_add("fs", NULL);
+	if (!fs_kobj)
+		printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__);
 	init_rootfs();
 	init_mount_tree();
 }
-- 
cgit v1.2.3


From 9bec101a0c38d559a8c95b44d850cd09a7b4edef Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kset: convert gfs2 to use kset_create

Dynamically create the kset instead of declaring it statically.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/gfs2/sys.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a0bdc4a3acf9..44cfaae92e76 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -221,7 +221,7 @@ static struct kobj_type gfs2_ktype = {
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
-static struct kset gfs2_kset;
+static struct kset *gfs2_kset;
 
 /*
  * display struct lm_lockstruct fields
@@ -493,7 +493,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
 	int error;
 
-	sdp->sd_kobj.kset = &gfs2_kset;
+	sdp->sd_kobj.kset = gfs2_kset;
 	sdp->sd_kobj.ktype = &gfs2_ktype;
 
 	error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
@@ -548,14 +548,15 @@ int gfs2_sys_init(void)
 {
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
-	kobject_set_name(&gfs2_kset.kobj, "gfs2");
-	gfs2_kset.kobj.parent = fs_kobj;
-	return kset_register(&gfs2_kset);
+	gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
+	if (!gfs2_kset)
+		return -ENOMEM;
+	return 0;
 }
 
 void gfs2_sys_uninit(void)
 {
 	kfree(gfs2_sys_margs);
-	kset_unregister(&gfs2_kset);
+	kset_unregister(gfs2_kset);
 }
 
-- 
cgit v1.2.3


From 136a27507fd09006973f11b17ca971d4c176a06a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kset: convert gfs2 dlm to use kset_create

Dynamically create the kset instead of declaring it statically.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/gfs2/locking/dlm/sysfs.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 93e66b22757f..0a8614088ec6 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -189,7 +189,7 @@ static struct kobj_type gdlm_ktype = {
 	.sysfs_ops     = &gdlm_attr_ops,
 };
 
-static struct kset gdlm_kset;
+static struct kset *gdlm_kset;
 
 int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
 {
@@ -201,7 +201,7 @@ int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
 		return error;
 	}
 
-	ls->kobj.kset = &gdlm_kset;
+	ls->kobj.kset = gdlm_kset;
 	ls->kobj.ktype = &gdlm_ktype;
 	ls->kobj.parent = fskobj;
 
@@ -219,19 +219,17 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
 
 int gdlm_sysfs_init(void)
 {
-	int error;
-
-	kobject_set_name(&gdlm_kset.kobj, "lock_dlm");
-	gdlm_kset.kobj.kset = &kernel_subsys;
-	error = kset_register(&gdlm_kset);
-	if (error)
-		printk("lock_dlm: cannot register kset %d\n", error);
-
-	return error;
+	gdlm_kset = kset_create_and_add("lock_dlm", NULL,
+					&kernel_subsys.kobj);
+	if (!gdlm_kset) {
+		printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
+		return -ENOMEM;
+	}
+	return 0;
 }
 
 void gdlm_sysfs_exit(void)
 {
-	kset_unregister(&gdlm_kset);
+	kset_unregister(gdlm_kset);
 }
 
-- 
cgit v1.2.3


From d405936b322220dc5cca9d2b58ef1911ae8efec9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kset: convert dlm to use kset_create

Dynamically create the kset instead of declaring it statically.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/dlm/lockspace.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 18e4a17b9bee..83a9c4dd5114 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -166,7 +166,7 @@ static struct kobj_type dlm_ktype = {
 	.release       = lockspace_kobj_release,
 };
 
-static struct kset dlm_kset;
+static struct kset *dlm_kset;
 
 static int kobject_setup(struct dlm_ls *ls)
 {
@@ -180,7 +180,7 @@ static int kobject_setup(struct dlm_ls *ls)
 	if (error)
 		return error;
 
-	ls->ls_kobj.kset = &dlm_kset;
+	ls->ls_kobj.kset = dlm_kset;
 	ls->ls_kobj.ktype = &dlm_ktype;
 	return 0;
 }
@@ -218,24 +218,22 @@ static int do_uevent(struct dlm_ls *ls, int in)
 
 int dlm_lockspace_init(void)
 {
-	int error;
-
 	ls_count = 0;
 	mutex_init(&ls_lock);
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
-	kobject_set_name(&dlm_kset.kobj, "dlm");
-	dlm_kset.kobj.kset = &kernel_subsys;
-	error = kset_register(&dlm_kset);
-	if (error)
-		printk("dlm_lockspace_init: cannot register kset %d\n", error);
-	return error;
+	dlm_kset = kset_create_and_add("dlm", NULL, &kernel_subsys.kobj);
+	if (!dlm_kset) {
+		printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
+		return -ENOMEM;
+	}
+	return 0;
 }
 
 void dlm_lockspace_exit(void)
 {
-	kset_unregister(&dlm_kset);
+	kset_unregister(dlm_kset);
 }
 
 static int dlm_scand(void *data)
-- 
cgit v1.2.3


From bd35b93d8049ab47b5bfaf6b10ba39badf21d1c3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kset: convert kernel_subsys to use kset_create

Dynamically create the kset instead of declaring it statically.  We also
rename kernel_subsys to kernel_kset to catch all users of this symbol
with a build error instead of an easy-to-ignore build warning.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/configfs/mount.c         | 2 +-
 fs/debugfs/inode.c          | 2 +-
 fs/dlm/lockspace.c          | 2 +-
 fs/gfs2/locking/dlm/sysfs.c | 3 +--
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 13300466464b..c4ee7f05de8b 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	config_kobj = kobject_create_and_add("config", &kernel_subsys.kobj);
+	config_kobj = kobject_create_and_add("config", &kernel_kset->kobj);
 	if (!config_kobj) {
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 667214200b03..5ce92c3d3b59 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -432,7 +432,7 @@ static int __init debugfs_init(void)
 {
 	int retval;
 
-	debug_kobj = kobject_create_and_add("debug", &kernel_subsys.kobj);
+	debug_kobj = kobject_create_and_add("debug", &kernel_kset->kobj);
 	if (!debug_kobj)
 		return -EINVAL;
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 83a9c4dd5114..0828beb2d35d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -223,7 +223,7 @@ int dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
-	dlm_kset = kset_create_and_add("dlm", NULL, &kernel_subsys.kobj);
+	dlm_kset = kset_create_and_add("dlm", NULL, &kernel_kset->kobj);
 	if (!dlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
 		return -ENOMEM;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 0a8614088ec6..1a92b6f7bc10 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -219,8 +219,7 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
 
 int gdlm_sysfs_init(void)
 {
-	gdlm_kset = kset_create_and_add("lock_dlm", NULL,
-					&kernel_subsys.kobj);
+	gdlm_kset = kset_create_and_add("lock_dlm", NULL, &kernel_kset->kobj);
 	if (!gdlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
 		return -ENOMEM;
-- 
cgit v1.2.3


From 386f275f5d097758f867bc99ddeaeb7a03b6b190 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 2 Nov 2007 13:47:53 +0100
Subject: Driver Core: switch all dynamic ksets to kobj_sysfs_ops

Switch all dynamically created ksets that export simple attributes
from subsys_attribute to kobj_attribute. Struct subsys_attribute will
be removed.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mike Halcrow <mhalcrow@us.ibm.com>
Cc: Phillip Hellewell <phillip@hellewell.homeip.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ecryptfs/main.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index bdeac3877a84..6ded37b467ff 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -736,12 +736,13 @@ static int ecryptfs_init_kmem_caches(void)
 
 static struct kset *ecryptfs_kset;
 
-static ssize_t version_show(struct kset *kset, char *buff)
+static ssize_t version_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buff)
 {
 	return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
 }
 
-static struct subsys_attribute version_attr = __ATTR_RO(version);
+static struct kobj_attribute version_attr = __ATTR_RO(version);
 
 static struct ecryptfs_version_str_map_elem {
 	u32 flag;
@@ -755,7 +756,8 @@ static struct ecryptfs_version_str_map_elem {
 	{ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
 };
 
-static ssize_t version_str_show(struct kset *kset, char *buff)
+static ssize_t version_str_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buff)
 {
 	int i;
 	int remaining = PAGE_SIZE;
@@ -782,7 +784,7 @@ out:
 	return total_written;
 }
 
-static struct subsys_attribute version_attr_str = __ATTR_RO(version_str);
+static struct kobj_attribute version_attr_str = __ATTR_RO(version_str);
 
 static struct attribute *attributes[] = {
 	&version_attr.attr,
-- 
cgit v1.2.3


From af6370ea9268443351d6e931c702dc8162a1c8a1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 2 Nov 2007 13:20:40 -0700
Subject: ecryptfs: remove version_str file from sysfs

This file violates the one-value-per-file sysfs rule.

If you all want it added back, please do something like a per-feature
file to show what is present and what isn't.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mike Halcrow <mhalcrow@us.ibm.com>
Cc: Phillip Hellewell <phillip@hellewell.homeip.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ecryptfs/main.c | 43 -------------------------------------------
 1 file changed, 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 6ded37b467ff..d984eac9a7f5 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -744,51 +744,8 @@ static ssize_t version_show(struct kobject *kobj,
 
 static struct kobj_attribute version_attr = __ATTR_RO(version);
 
-static struct ecryptfs_version_str_map_elem {
-	u32 flag;
-	char *str;
-} ecryptfs_version_str_map[] = {
-	{ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
-	{ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
-	{ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
-	{ECRYPTFS_VERSIONING_POLICY, "policy"},
-	{ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"},
-	{ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
-};
-
-static ssize_t version_str_show(struct kobject *kobj,
-				struct kobj_attribute *attr, char *buff)
-{
-	int i;
-	int remaining = PAGE_SIZE;
-	int total_written = 0;
-
-	buff[0] = '\0';
-	for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
-		int entry_size;
-
-		if (!(ECRYPTFS_VERSIONING_MASK
-		      & ecryptfs_version_str_map[i].flag))
-			continue;
-		entry_size = strlen(ecryptfs_version_str_map[i].str);
-		if ((entry_size + 2) > remaining)
-			goto out;
-		memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
-		buff[entry_size++] = '\n';
-		buff[entry_size] = '\0';
-		buff += entry_size;
-		total_written += entry_size;
-		remaining -= entry_size;
-	}
-out:
-	return total_written;
-}
-
-static struct kobj_attribute version_attr_str = __ATTR_RO(version_str);
-
 static struct attribute *attributes[] = {
 	&version_attr.attr,
-	&version_attr_str.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3


From 000f2a4d8cfc1e1cbc0aa98136015e7ae7719b46 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 2 Nov 2007 13:47:53 +0100
Subject: Driver Core: kill subsys_attribute and default sysfs ops

Remove the no-longer-needed subsys_attributes; they have all been
converted to the more sensible kobj_attributes.

There is no longer a magic fallback in sysfs attribute operations; all
kobjects which create simple attributes explicitly need a ktype
assigned, which tells the core what was intended here.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 63 +++++++++------------------------------------------------
 1 file changed, 10 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 387a63662793..8acf82bba44c 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,43 +20,6 @@
 
 #include "sysfs.h"
 
-#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
-
-/*
- * Subsystem file operations.
- * These operations allow subsystems to have files that can be 
- * read/written. 
- */
-static ssize_t 
-subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
-{
-	struct kset *kset = to_kset(kobj);
-	struct subsys_attribute * sattr = to_sattr(attr);
-	ssize_t ret = -EIO;
-
-	if (sattr->show)
-		ret = sattr->show(kset, page);
-	return ret;
-}
-
-static ssize_t 
-subsys_attr_store(struct kobject * kobj, struct attribute * attr, 
-		  const char * page, size_t count)
-{
-	struct kset *kset = to_kset(kobj);
-	struct subsys_attribute * sattr = to_sattr(attr);
-	ssize_t ret = -EIO;
-
-	if (sattr->store)
-		ret = sattr->store(kset, page, count);
-	return ret;
-}
-
-static struct sysfs_ops subsys_sysfs_ops = {
-	.show	= subsys_attr_show,
-	.store	= subsys_attr_store,
-};
-
 /*
  * There's one sysfs_buffer for each open file and one
  * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -354,29 +317,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_buffer * buffer;
-	struct sysfs_ops * ops = NULL;
-	int error;
+	struct sysfs_buffer *buffer;
+	struct sysfs_ops *ops;
+	int error = -EACCES;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
 
-	/* if the kobject has no ktype, then we assume that it is a subsystem
-	 * itself, and use ops for it.
-	 */
-	if (kobj->ktype)
+	/* every kobject with an attribute needs a ktype assigned */
+	if (kobj->ktype && kobj->ktype->sysfs_ops)
 		ops = kobj->ktype->sysfs_ops;
-	else
-		ops = &subsys_sysfs_ops;
-
-	error = -EACCES;
-
-	/* No sysfs operations, either from having no subsystem,
-	 * or the subsystem have no operations.
-	 */
-	if (!ops)
+	else {
+		printk(KERN_ERR "missing sysfs attribute operations for "
+		       "kobject: %s\n", kobject_name(kobj));
+		WARN_ON(1);
 		goto err_out;
+	}
 
 	/* File needs write support.
 	 * The inode's perms must say it's ok, 
-- 
cgit v1.2.3


From c60b71787982cefcf9fa09aa281fa8c4c685d557 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 2 Nov 2007 16:19:59 -0700
Subject: kset: convert ocfs2 to use kset_create

Dynamically create the kset instead of declaring it statically.

Also use the new kobj_attribute which cleans up this file a _lot_.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ocfs2/cluster/masklog.c |  4 +--
 fs/ocfs2/cluster/sys.c     | 83 ++++++++++++----------------------------------
 2 files changed, 23 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index dead319932b3..23c732f27529 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
 	.kobj   = {.ktype = &mlog_ktype},
 };
 
-int mlog_sys_init(struct kset *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_kset)
 {
 	int i = 0;
 
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
 	mlog_attr_ptrs[i] = NULL;
 
 	kobject_set_name(&mlog_kset.kobj, "logmask");
-	mlog_kset.kobj.kset = o2cb_subsys;
+	mlog_kset.kobj.kset = o2cb_kset;
 	return kset_register(&mlog_kset);
 }
 
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 880d0138bb0a..a4b07730b2e1 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+#include <linux/fs.h>
 
 #include "ocfs2_nodemanager.h"
 #include "masklog.h"
 #include "sys.h"
 
-struct o2cb_attribute {
-	struct attribute	attr;
-	ssize_t (*show)(char *buf);
-	ssize_t (*store)(const char *buf, size_t count);
-};
-
-#define O2CB_ATTR(_name, _mode, _show, _store)	\
-struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
-
-#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
 
-static ssize_t o2cb_interface_revision_show(char *buf)
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+			    char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
 }
-
-static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+static struct kobj_attribute attr_version =
+	__ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
 
 static struct attribute *o2cb_attrs[] = {
-	&o2cb_attr_interface_revision.attr,
+	&attr_version.attr,
 	NULL,
 };
 
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
-	   const char * buffer, size_t count);
-static struct sysfs_ops o2cb_sysfs_ops = {
-	.show	= o2cb_show,
-	.store	= o2cb_store,
+static struct attribute_group o2cb_attr_group = {
+	.attrs = o2cb_attrs,
 };
 
-static struct kobj_type o2cb_subsys_type = {
-	.default_attrs	= o2cb_attrs,
-	.sysfs_ops	= &o2cb_sysfs_ops,
-};
-
-/* gives us o2cb_subsys */
-static decl_subsys(o2cb, NULL);
-
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
-{
-	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-	struct kset *sbs = to_kset(kobj);
-
-	BUG_ON(sbs != &o2cb_subsys);
-
-	if (o2cb_attr->show)
-		return o2cb_attr->show(buffer);
-	return -EIO;
-}
-
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
-	     const char * buffer, size_t count)
-{
-	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
-	struct kset *sbs = to_kset(kobj);
-
-	BUG_ON(sbs != &o2cb_subsys);
-
-	if (o2cb_attr->store)
-		return o2cb_attr->store(buffer, count);
-	return -EIO;
-}
+static struct kset *o2cb_kset;
 
 void o2cb_sys_shutdown(void)
 {
 	mlog_sys_shutdown();
-	subsystem_unregister(&o2cb_subsys);
+	kset_unregister(o2cb_kset);
 }
 
 int o2cb_sys_init(void)
 {
 	int ret;
 
-	o2cb_subsys.kobj.ktype = &o2cb_subsys_type;
-	ret = subsystem_register(&o2cb_subsys);
+	o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
+	if (!o2cb_kset)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
 	if (ret)
-		return ret;
+		goto error;
 
-	ret = mlog_sys_init(&o2cb_subsys);
+	ret = mlog_sys_init(o2cb_kset);
 	if (ret)
-		subsystem_unregister(&o2cb_subsys);
+		goto error;
+	return 0;
+error:
+	kset_unregister(o2cb_kset);
 	return ret;
 }
-- 
cgit v1.2.3


From 830d3cfb16728e2496edc2985ad8f68025135e37 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 6 Nov 2007 10:36:58 -0800
Subject: kset: convert block_subsys to use kset_create

Dynamically create the kset instead of declaring it statically.  We also
rename block_subsys to block_kset to catch all users of this symbol
with a build error instead of an easy-to-ignore build warning.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 69685bb51c62..9184215f3ef3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -316,7 +316,7 @@ static struct attribute * default_attrs[] = {
 	NULL,
 };
 
-extern struct kset block_subsys;
+extern struct kset *block_kset;
 
 static void part_release(struct kobject *kobj)
 {
@@ -393,7 +393,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	kobject_add(&p->kobj);
 	if (!disk->part_uevent_suppress)
 		kobject_uevent(&p->kobj, KOBJ_ADD);
-	sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
+	sysfs_create_link(&p->kobj, &block_kset->kobj, "subsystem");
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		static struct attribute addpartattr = {
 			.name = "whole_disk",
@@ -448,7 +448,7 @@ static int disk_sysfs_symlinks(struct gendisk *disk)
 			goto err_out_dev_link;
 	}
 
-	err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
+	err = sysfs_create_link(&disk->kobj, &block_kset->kobj,
 				"subsystem");
 	if (err)
 		goto err_out_disk_name_lnk;
-- 
cgit v1.2.3


From 0ff21e46630abce11fdaaffabd72bbd4eed5ac2c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 6 Nov 2007 10:36:58 -0800
Subject: kobject: convert kernel_kset to be a kobject

kernel_kset does not need to be a kset; a much simpler kobject suffices
now that we have kobj_attributes.

We also rename kernel_kset to kernel_kobj to catch all users of this
symbol with a build error instead of an easy-to-ignore build warning.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/configfs/mount.c         | 2 +-
 fs/debugfs/inode.c          | 2 +-
 fs/dlm/lockspace.c          | 2 +-
 fs/gfs2/locking/dlm/sysfs.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index c4ee7f05de8b..54bf0db0d4b0 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -140,7 +140,7 @@ static int __init configfs_init(void)
 	if (!configfs_dir_cachep)
 		goto out;
 
-	config_kobj = kobject_create_and_add("config", &kernel_kset->kobj);
+	config_kobj = kobject_create_and_add("config", kernel_kobj);
 	if (!config_kobj) {
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 5ce92c3d3b59..97f6381c36c2 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -432,7 +432,7 @@ static int __init debugfs_init(void)
 {
 	int retval;
 
-	debug_kobj = kobject_create_and_add("debug", &kernel_kset->kobj);
+	debug_kobj = kobject_create_and_add("debug", kernel_kobj);
 	if (!debug_kobj)
 		return -EINVAL;
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 0828beb2d35d..e64b0dc664f3 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -223,7 +223,7 @@ int dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
-	dlm_kset = kset_create_and_add("dlm", NULL, &kernel_kset->kobj);
+	dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
 	if (!dlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
 		return -ENOMEM;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 1a92b6f7bc10..e5a4fbf7265f 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -219,7 +219,7 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
 
 int gdlm_sysfs_init(void)
 {
-	gdlm_kset = kset_create_and_add("lock_dlm", NULL, &kernel_kset->kobj);
+	gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
 	if (!gdlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
 		return -ENOMEM;
-- 
cgit v1.2.3


From 6e90aa972dda8ef86155eefcdbdc8d34165b9f39 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 6 Nov 2007 15:08:08 -0800
Subject: kobject: convert ecryptfs to use kobject_create

Using a kset for this trivial directory is overkill.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mike Halcrow <mhalcrow@us.ibm.com>
Cc: Phillip Hellewell <phillip@hellewell.homeip.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ecryptfs/main.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d984eac9a7f5..4f1332107bbd 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -734,7 +734,7 @@ static int ecryptfs_init_kmem_caches(void)
 	return 0;
 }
 
-static struct kset *ecryptfs_kset;
+static struct kobject *ecryptfs_kobj;
 
 static ssize_t version_show(struct kobject *kobj,
 			    struct kobj_attribute *attr, char *buff)
@@ -757,17 +757,17 @@ static int do_sysfs_registration(void)
 {
 	int rc;
 
-	ecryptfs_kset = kset_create_and_add("ecryptfs", NULL, fs_kobj);
-	if (!ecryptfs_kset) {
+	ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj);
+	if (!ecryptfs_kobj) {
 		printk(KERN_ERR "Unable to create ecryptfs kset\n");
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = sysfs_create_group(&ecryptfs_kset->kobj, &attr_group);
+	rc = sysfs_create_group(ecryptfs_kobj, &attr_group);
 	if (rc) {
 		printk(KERN_ERR
 		       "Unable to create ecryptfs version attributes\n");
-		kset_unregister(ecryptfs_kset);
+		kobject_unregister(ecryptfs_kobj);
 	}
 out:
 	return rc;
@@ -775,8 +775,8 @@ out:
 
 static void do_sysfs_unregistration(void)
 {
-	sysfs_remove_group(&ecryptfs_kset->kobj, &attr_group);
-	kset_unregister(ecryptfs_kset);
+	sysfs_remove_group(ecryptfs_kobj, &attr_group);
+	kobject_unregister(ecryptfs_kobj);
 }
 
 static int __init ecryptfs_init(void)
-- 
cgit v1.2.3


From 901195ed7f4b2f30dc5a36271887939c5d7bfb9f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 17 Dec 2007 15:54:39 -0400
Subject: Kobject: change GFS2 to use kobject_init_and_add

Stop using kobject_register, as this way we can control the sending of
the uevent properly, after everything is properly initialized.

Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/dlm/lockspace.c          | 26 ++++----------------------
 fs/gfs2/locking/dlm/sysfs.c | 13 +++----------
 fs/gfs2/sys.c               | 10 +++-------
 3 files changed, 10 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index e64b0dc664f3..b750f13d0328 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -168,23 +168,6 @@ static struct kobj_type dlm_ktype = {
 
 static struct kset *dlm_kset;
 
-static int kobject_setup(struct dlm_ls *ls)
-{
-	char lsname[DLM_LOCKSPACE_LEN];
-	int error;
-
-	memset(lsname, 0, DLM_LOCKSPACE_LEN);
-	snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
-
-	error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
-	if (error)
-		return error;
-
-	ls->ls_kobj.kset = dlm_kset;
-	ls->ls_kobj.ktype = &dlm_ktype;
-	return 0;
-}
-
 static int do_uevent(struct dlm_ls *ls, int in)
 {
 	int error;
@@ -545,13 +528,12 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 		goto out_delist;
 	}
 
-	error = kobject_setup(ls);
-	if (error)
-		goto out_stop;
-
-	error = kobject_register(&ls->ls_kobj);
+	ls->ls_kobj.kset = dlm_kset;
+	error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
+				     "%s", ls->ls_name);
 	if (error)
 		goto out_stop;
+	kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
 
 	/* let kobject handle freeing of ls if there's an error */
 	do_unreg = 1;
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index e5a4fbf7265f..a7336b909c61 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,19 +195,12 @@ int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
 {
 	int error;
 
-	error = kobject_set_name(&ls->kobj, "%s", "lock_module");
-	if (error) {
-		log_error("can't set kobj name %d", error);
-		return error;
-	}
-
 	ls->kobj.kset = gdlm_kset;
-	ls->kobj.ktype = &gdlm_ktype;
-	ls->kobj.parent = fskobj;
-
-	error = kobject_register(&ls->kobj);
+	error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
+				     "lock_module");
 	if (error)
 		log_error("can't register kobj %d", error);
+	kobject_uevent(&ls->kobj, KOBJ_ADD);
 
 	return error;
 }
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 44cfaae92e76..8d9cd5bd5845 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -494,13 +494,8 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	int error;
 
 	sdp->sd_kobj.kset = gfs2_kset;
-	sdp->sd_kobj.ktype = &gfs2_ktype;
-
-	error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
-	if (error)
-		goto fail;
-
-	error = kobject_register(&sdp->sd_kobj);
+	error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
+				     "%s", sdp->sd_table_name);
 	if (error)
 		goto fail;
 
@@ -520,6 +515,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_args;
 
+	kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
 	return 0;
 
 fail_args:
-- 
cgit v1.2.3


From a5815ddf26aa8208d4ad79b4fba5e6bf7d5ba688 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 17 Dec 2007 23:05:35 -0700
Subject: Kobject: convert fs/char_dev.c to use kobject_init/add_ng()

This converts the code to use the new kobject functions, cleaning up the
logic in doing so.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/char_dev.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index c3bfa76765c4..b2dd5a036631 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -510,9 +510,8 @@ struct cdev *cdev_alloc(void)
 {
 	struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
 	if (p) {
-		p->kobj.ktype = &ktype_cdev_dynamic;
 		INIT_LIST_HEAD(&p->list);
-		kobject_init(&p->kobj);
+		kobject_init_ng(&p->kobj, &ktype_cdev_dynamic);
 	}
 	return p;
 }
@@ -529,8 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops)
 {
 	memset(cdev, 0, sizeof *cdev);
 	INIT_LIST_HEAD(&cdev->list);
-	cdev->kobj.ktype = &ktype_cdev_default;
-	kobject_init(&cdev->kobj);
+	kobject_init_ng(&cdev->kobj, &ktype_cdev_default);
 	cdev->ops = fops;
 }
 
-- 
cgit v1.2.3


From edfaa7c36574f1bf09c65ad602412db9da5f96bf Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Mon, 21 May 2007 22:08:01 +0200
Subject: Driver core: convert block from raw kobjects to core devices

This moves the block devices to /sys/class/block. It will create a
flat list of all block devices, with the disks and partitions in one
directory. For compatibility /sys/block is created and contains symlinks
to the disks.

  /sys/class/block
  |-- sda -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda
  |-- sda1 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda1
  |-- sda10 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda10
  |-- sda5 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda5
  |-- sda6 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda6
  |-- sda7 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda7
  |-- sda8 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda8
  |-- sda9 -> ../../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/sda9
  `-- sr0 -> ../../devices/pci0000:00/0000:00:1f.2/host1/target1:0:0/1:0:0:0/block/sr0

  /sys/block/
  |-- sda -> ../devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda
  `-- sr0 -> ../devices/pci0000:00/0000:00:1f.2/host1/target1:0:0/1:0:0:0/block/sr0

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/block_dev.c        |   8 +-
 fs/partitions/check.c | 315 +++++++++++++++++---------------------------------
 2 files changed, 109 insertions(+), 214 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c55221..e48a630ae266 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -738,9 +738,9 @@ EXPORT_SYMBOL(bd_release);
 static struct kobject *bdev_get_kobj(struct block_device *bdev)
 {
 	if (bdev->bd_contains != bdev)
-		return kobject_get(&bdev->bd_part->kobj);
+		return kobject_get(&bdev->bd_part->dev.kobj);
 	else
-		return kobject_get(&bdev->bd_disk->kobj);
+		return kobject_get(&bdev->bd_disk->dev.kobj);
 }
 
 static struct kobject *bdev_get_holder(struct block_device *bdev)
@@ -1176,7 +1176,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 				ret = -ENXIO;
 				goto out_first;
 			}
-			kobject_get(&p->kobj);
+			kobject_get(&p->dev.kobj);
 			bdev->bd_part = p;
 			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
 		}
@@ -1299,7 +1299,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		module_put(owner);
 
 		if (bdev->bd_contains != bdev) {
-			kobject_put(&bdev->bd_part->kobj);
+			kobject_put(&bdev->bd_part->dev.kobj);
 			bdev->bd_part = NULL;
 		}
 		bdev->bd_disk = NULL;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9184215f3ef3..97f3f5f064ee 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,96 +195,45 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	return ERR_PTR(res);
 }
 
-/*
- * sysfs bindings for partitions
- */
-
-struct part_attribute {
-	struct attribute attr;
-	ssize_t (*show)(struct hd_struct *,char *);
-	ssize_t (*store)(struct hd_struct *,const char *, size_t);
-};
-
-static ssize_t 
-part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
+static ssize_t part_start_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
 {
-	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
-	struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
-	ssize_t ret = 0;
-	if (part_attr->show)
-		ret = part_attr->show(p, page);
-	return ret;
-}
-static ssize_t
-part_attr_store(struct kobject * kobj, struct attribute * attr,
-		const char *page, size_t count)
-{
-	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
-	struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
-	ssize_t ret = 0;
+	struct hd_struct *p = dev_to_part(dev);
 
-	if (part_attr->store)
-		ret = part_attr->store(p, page, count);
-	return ret;
+	return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
 }
 
-static struct sysfs_ops part_sysfs_ops = {
-	.show	=	part_attr_show,
-	.store	=	part_attr_store,
-};
-
-static ssize_t part_uevent_store(struct hd_struct * p,
-				 const char *page, size_t count)
+static ssize_t part_size_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
 {
-	kobject_uevent(&p->kobj, KOBJ_ADD);
-	return count;
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
-static ssize_t part_dev_read(struct hd_struct * p, char *page)
-{
-	struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj);
-	dev_t dev = MKDEV(disk->major, disk->first_minor + p->partno); 
-	return print_dev_t(page, dev);
-}
-static ssize_t part_start_read(struct hd_struct * p, char *page)
-{
-	return sprintf(page, "%llu\n",(unsigned long long)p->start_sect);
-}
-static ssize_t part_size_read(struct hd_struct * p, char *page)
-{
-	return sprintf(page, "%llu\n",(unsigned long long)p->nr_sects);
-}
-static ssize_t part_stat_read(struct hd_struct * p, char *page)
+
+static ssize_t part_stat_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
 {
-	return sprintf(page, "%8u %8llu %8u %8llu\n",
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8llu %8u %8llu\n",
 		       p->ios[0], (unsigned long long)p->sectors[0],
 		       p->ios[1], (unsigned long long)p->sectors[1]);
 }
-static struct part_attribute part_attr_uevent = {
-	.attr = {.name = "uevent", .mode = S_IWUSR },
-	.store	= part_uevent_store
-};
-static struct part_attribute part_attr_dev = {
-	.attr = {.name = "dev", .mode = S_IRUGO },
-	.show	= part_dev_read
-};
-static struct part_attribute part_attr_start = {
-	.attr = {.name = "start", .mode = S_IRUGO },
-	.show	= part_start_read
-};
-static struct part_attribute part_attr_size = {
-	.attr = {.name = "size", .mode = S_IRUGO },
-	.show	= part_size_read
-};
-static struct part_attribute part_attr_stat = {
-	.attr = {.name = "stat", .mode = S_IRUGO },
-	.show	= part_stat_read
-};
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
+static ssize_t part_fail_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
 
-static ssize_t part_fail_store(struct hd_struct * p,
+	return sprintf(buf, "%d\n", p->make_it_fail);
+}
+
+static ssize_t part_fail_store(struct device *dev,
+			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
+	struct hd_struct *p = dev_to_part(dev);
 	int i;
 
 	if (count > 0 && sscanf(buf, "%d", &i) > 0)
@@ -292,49 +241,52 @@ static ssize_t part_fail_store(struct hd_struct * p,
 
 	return count;
 }
-static ssize_t part_fail_read(struct hd_struct * p, char *page)
-{
-	return sprintf(page, "%d\n", p->make_it_fail);
-}
-static struct part_attribute part_attr_fail = {
-	.attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR },
-	.store	= part_fail_store,
-	.show	= part_fail_read
-};
+#endif
 
+static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
+static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+static struct device_attribute dev_attr_fail =
+	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 #endif
 
-static struct attribute * default_attrs[] = {
-	&part_attr_uevent.attr,
-	&part_attr_dev.attr,
-	&part_attr_start.attr,
-	&part_attr_size.attr,
-	&part_attr_stat.attr,
+static struct attribute *part_attrs[] = {
+	&dev_attr_start.attr,
+	&dev_attr_size.attr,
+	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-	&part_attr_fail.attr,
+	&dev_attr_fail.attr,
 #endif
-	NULL,
+	NULL
 };
 
-extern struct kset *block_kset;
+static struct attribute_group part_attr_group = {
+	.attrs = part_attrs,
+};
 
-static void part_release(struct kobject *kobj)
+static struct attribute_group *part_attr_groups[] = {
+	&part_attr_group,
+	NULL
+};
+
+static void part_release(struct device *dev)
 {
-	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+	struct hd_struct *p = dev_to_part(dev);
 	kfree(p);
 }
 
-struct kobj_type ktype_part = {
+struct device_type part_type = {
+	.name		= "partition",
+	.groups		= part_attr_groups,
 	.release	= part_release,
-	.default_attrs	= default_attrs,
-	.sysfs_ops	= &part_sysfs_ops,
 };
 
 static inline void partition_sysfs_add_subdir(struct hd_struct *p)
 {
 	struct kobject *k;
 
-	k = kobject_get(&p->kobj);
+	k = kobject_get(&p->dev.kobj);
 	p->holder_dir = kobject_create_and_add("holders", k);
 	kobject_put(k);
 }
@@ -343,7 +295,7 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
 	struct kobject *k;
 
-	k = kobject_get(&disk->kobj);
+	k = kobject_get(&disk->dev.kobj);
 	disk->holder_dir = kobject_create_and_add("holders", k);
 	disk->slave_dir = kobject_create_and_add("slaves", k);
 	kobject_put(k);
@@ -352,6 +304,7 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 void delete_partition(struct gendisk *disk, int part)
 {
 	struct hd_struct *p = disk->part[part-1];
+
 	if (!p)
 		return;
 	if (!p->nr_sects)
@@ -361,113 +314,55 @@ void delete_partition(struct gendisk *disk, int part)
 	p->nr_sects = 0;
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
-	sysfs_remove_link(&p->kobj, "subsystem");
 	kobject_unregister(p->holder_dir);
-	kobject_uevent(&p->kobj, KOBJ_REMOVE);
-	kobject_del(&p->kobj);
-	kobject_put(&p->kobj);
+	device_del(&p->dev);
+	put_device(&p->dev);
 }
 
 void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
+	int err;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
-	
+
 	p->start_sect = start;
 	p->nr_sects = len;
 	p->partno = part;
 	p->policy = disk->policy;
 
-	if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1]))
-		kobject_set_name(&p->kobj, "%sp%d",
-				 kobject_name(&disk->kobj), part);
+	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
+		snprintf(p->dev.bus_id, BUS_ID_SIZE,
+		"%sp%d", disk->dev.bus_id, part);
 	else
-		kobject_set_name(&p->kobj, "%s%d",
-				 kobject_name(&disk->kobj),part);
-	p->kobj.parent = &disk->kobj;
-	p->kobj.ktype = &ktype_part;
-	kobject_init(&p->kobj);
-	kobject_add(&p->kobj);
-	if (!disk->part_uevent_suppress)
-		kobject_uevent(&p->kobj, KOBJ_ADD);
-	sysfs_create_link(&p->kobj, &block_kset->kobj, "subsystem");
+		snprintf(p->dev.bus_id, BUS_ID_SIZE,
+			 "%s%d", disk->dev.bus_id, part);
+
+	device_initialize(&p->dev);
+	p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
+	p->dev.class = &block_class;
+	p->dev.type = &part_type;
+	p->dev.parent = &disk->dev;
+	disk->part[part-1] = p;
+
+	/* delay uevent until 'holders' subdir is created */
+	p->dev.uevent_suppress = 1;
+	device_add(&p->dev);
+	partition_sysfs_add_subdir(p);
+	p->dev.uevent_suppress = 0;
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		static struct attribute addpartattr = {
 			.name = "whole_disk",
 			.mode = S_IRUSR | S_IRGRP | S_IROTH,
 		};
-
-		sysfs_create_file(&p->kobj, &addpartattr);
+		err = sysfs_create_file(&p->dev.kobj, &addpartattr);
 	}
-	partition_sysfs_add_subdir(p);
-	disk->part[part-1] = p;
-}
 
-static char *make_block_name(struct gendisk *disk)
-{
-	char *name;
-	static char *block_str = "block:";
-	int size;
-	char *s;
-
-	size = strlen(block_str) + strlen(disk->disk_name) + 1;
-	name = kmalloc(size, GFP_KERNEL);
-	if (!name)
-		return NULL;
-	strcpy(name, block_str);
-	strcat(name, disk->disk_name);
-	/* ewww... some of these buggers have / in name... */
-	s = strchr(name, '/');
-	if (s)
-		*s = '!';
-	return name;
-}
-
-static int disk_sysfs_symlinks(struct gendisk *disk)
-{
-	struct device *target = get_device(disk->driverfs_dev);
-	int err;
-	char *disk_name = NULL;
-
-	if (target) {
-		disk_name = make_block_name(disk);
-		if (!disk_name) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-
-		err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
-		if (err)
-			goto err_out_disk_name;
-
-		err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
-		if (err)
-			goto err_out_dev_link;
-	}
-
-	err = sysfs_create_link(&disk->kobj, &block_kset->kobj,
-				"subsystem");
-	if (err)
-		goto err_out_disk_name_lnk;
-
-	kfree(disk_name);
-
-	return 0;
-
-err_out_disk_name_lnk:
-	if (target) {
-		sysfs_remove_link(&target->kobj, disk_name);
-err_out_dev_link:
-		sysfs_remove_link(&disk->kobj, "device");
-err_out_disk_name:
-		kfree(disk_name);
-err_out:
-		put_device(target);
-	}
-	return err;
+	/* suppress uevent if the disk supresses it */
+	if (!disk->dev.uevent_suppress)
+		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -479,19 +374,29 @@ void register_disk(struct gendisk *disk)
 	struct hd_struct *p;
 	int err;
 
-	kobject_set_name(&disk->kobj, "%s", disk->disk_name);
-	/* ewww... some of these buggers have / in name... */
-	s = strchr(disk->kobj.k_name, '/');
+	disk->dev.parent = disk->driverfs_dev;
+	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+
+	strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
+	/* ewww... some of these buggers have / in the name... */
+	s = strchr(disk->dev.bus_id, '/');
 	if (s)
 		*s = '!';
-	if ((err = kobject_add(&disk->kobj)))
+
+	/* delay uevents, until we scanned partition table */
+	disk->dev.uevent_suppress = 1;
+
+	if (device_add(&disk->dev))
 		return;
-	err = disk_sysfs_symlinks(disk);
+#ifndef CONFIG_SYSFS_DEPRECATED
+	err = sysfs_create_link(block_depr, &disk->dev.kobj,
+				kobject_name(&disk->dev.kobj));
 	if (err) {
-		kobject_del(&disk->kobj);
+		device_del(&disk->dev);
 		return;
 	}
- 	disk_sysfs_add_subdirs(disk);
+#endif
+	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
 	if (disk->minors == 1)
@@ -505,25 +410,23 @@ void register_disk(struct gendisk *disk)
 	if (!bdev)
 		goto exit;
 
-	/* scan partition table, but suppress uevents */
 	bdev->bd_invalidated = 1;
-	disk->part_uevent_suppress = 1;
 	err = blkdev_get(bdev, FMODE_READ, 0);
-	disk->part_uevent_suppress = 0;
 	if (err < 0)
 		goto exit;
 	blkdev_put(bdev);
 
 exit:
-	/* announce disk after possible partitions are already created */
-	kobject_uevent(&disk->kobj, KOBJ_ADD);
+	/* announce disk after possible partitions are created */
+	disk->dev.uevent_suppress = 0;
+	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
 	for (i = 1; i < disk->minors; i++) {
 		p = disk->part[i-1];
 		if (!p || !p->nr_sects)
 			continue;
-		kobject_uevent(&p->kobj, KOBJ_ADD);
+		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
 	}
 }
 
@@ -602,19 +505,11 @@ void del_gendisk(struct gendisk *disk)
 	disk_stat_set_all(disk, 0);
 	disk->stamp = 0;
 
-	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
 	kobject_unregister(disk->holder_dir);
 	kobject_unregister(disk->slave_dir);
-	if (disk->driverfs_dev) {
-		char *disk_name = make_block_name(disk);
-		sysfs_remove_link(&disk->kobj, "device");
-		if (disk_name) {
-			sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name);
-			kfree(disk_name);
-		}
-		put_device(disk->driverfs_dev);
-		disk->driverfs_dev = NULL;
-	}
-	sysfs_remove_link(&disk->kobj, "subsystem");
-	kobject_del(&disk->kobj);
+	disk->driverfs_dev = NULL;
+#ifndef CONFIG_SYSFS_DEPRECATED
+	sysfs_remove_link(block_depr, disk->dev.bus_id);
+#endif
+	device_del(&disk->dev);
 }
-- 
cgit v1.2.3


From f9cb074bff8e762ef24c44678a5a7d907f82fbeb Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 17 Dec 2007 23:05:35 -0700
Subject: Kobject: rename kobject_init_ng() to kobject_init()

Now that the old kobject_init() function is gone, rename
kobject_init_ng() to kobject_init() to clean up the namespace.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/char_dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index b2dd5a036631..2c7a8b5b4598 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -511,7 +511,7 @@ struct cdev *cdev_alloc(void)
 	struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
 	if (p) {
 		INIT_LIST_HEAD(&p->list);
-		kobject_init_ng(&p->kobj, &ktype_cdev_dynamic);
+		kobject_init(&p->kobj, &ktype_cdev_dynamic);
 	}
 	return p;
 }
@@ -528,7 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops)
 {
 	memset(cdev, 0, sizeof *cdev);
 	INIT_LIST_HEAD(&cdev->list);
-	kobject_init_ng(&cdev->kobj, &ktype_cdev_default);
+	kobject_init(&cdev->kobj, &ktype_cdev_default);
 	cdev->ops = fops;
 }
 
-- 
cgit v1.2.3


From 197b12d6796a3bca187f22a8978a33d51e2bcd79 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 20 Dec 2007 08:13:05 -0800
Subject: Kobject: convert fs/* from kobject_unregister() to kobject_put()

There is no need for kobject_unregister() anymore, thanks to Kay's
kobject cleanup changes, so replace all instances of it with
kobject_put().


Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/configfs/mount.c         | 6 +++---
 fs/debugfs/inode.c          | 4 ++--
 fs/dlm/lockspace.c          | 4 ++--
 fs/ecryptfs/main.c          | 4 ++--
 fs/fuse/inode.c             | 6 +++---
 fs/gfs2/locking/dlm/sysfs.c | 2 +-
 fs/gfs2/sys.c               | 4 ++--
 fs/partitions/check.c       | 6 +++---
 8 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 54bf0db0d4b0..de3b31d0a37d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -150,7 +150,7 @@ static int __init configfs_init(void)
 	err = register_filesystem(&configfs_fs_type);
 	if (err) {
 		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
-		kobject_unregister(config_kobj);
+		kobject_put(config_kobj);
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
 		goto out;
@@ -159,7 +159,7 @@ static int __init configfs_init(void)
 	err = configfs_inode_init();
 	if (err) {
 		unregister_filesystem(&configfs_fs_type);
-		kobject_unregister(config_kobj);
+		kobject_put(config_kobj);
 		kmem_cache_destroy(configfs_dir_cachep);
 		configfs_dir_cachep = NULL;
 	}
@@ -170,7 +170,7 @@ out:
 static void __exit configfs_exit(void)
 {
 	unregister_filesystem(&configfs_fs_type);
-	kobject_unregister(config_kobj);
+	kobject_put(config_kobj);
 	kmem_cache_destroy(configfs_dir_cachep);
 	configfs_dir_cachep = NULL;
 	configfs_inode_exit();
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 97f6381c36c2..d26e2826ba5b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -438,7 +438,7 @@ static int __init debugfs_init(void)
 
 	retval = register_filesystem(&debug_fs_type);
 	if (retval)
-		kobject_unregister(debug_kobj);
+		kobject_put(debug_kobj);
 	return retval;
 }
 
@@ -446,7 +446,7 @@ static void __exit debugfs_exit(void)
 {
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	unregister_filesystem(&debug_fs_type);
-	kobject_unregister(debug_kobj);
+	kobject_put(debug_kobj);
 }
 
 core_initcall(debugfs_init);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index b750f13d0328..5c108c49cb8c 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -579,7 +579,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	kfree(ls->ls_rsbtbl);
  out_lsfree:
 	if (do_unreg)
-		kobject_unregister(&ls->ls_kobj);
+		kobject_put(&ls->ls_kobj);
 	else
 		kfree(ls);
  out:
@@ -728,7 +728,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	dlm_clear_members(ls);
 	dlm_clear_members_gone(ls);
 	kfree(ls->ls_node_array);
-	kobject_unregister(&ls->ls_kobj);
+	kobject_put(&ls->ls_kobj);
 	/* The ls structure will be freed when the kobject is done with */
 
 	mutex_lock(&ls_lock);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f1332107bbd..0249aa4ae181 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -767,7 +767,7 @@ static int do_sysfs_registration(void)
 	if (rc) {
 		printk(KERN_ERR
 		       "Unable to create ecryptfs version attributes\n");
-		kobject_unregister(ecryptfs_kobj);
+		kobject_put(ecryptfs_kobj);
 	}
 out:
 	return rc;
@@ -776,7 +776,7 @@ out:
 static void do_sysfs_unregistration(void)
 {
 	sysfs_remove_group(ecryptfs_kobj, &attr_group);
-	kobject_unregister(ecryptfs_kobj);
+	kobject_put(ecryptfs_kobj);
 }
 
 static int __init ecryptfs_init(void)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e6e23a2ad4b3..e5e80d1a4687 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -810,15 +810,15 @@ static int fuse_sysfs_init(void)
 	return 0;
 
  out_fuse_unregister:
-	kobject_unregister(fuse_kobj);
+	kobject_put(fuse_kobj);
  out_err:
 	return err;
 }
 
 static void fuse_sysfs_cleanup(void)
 {
-	kobject_unregister(connections_kobj);
-	kobject_unregister(fuse_kobj);
+	kobject_put(connections_kobj);
+	kobject_put(fuse_kobj);
 }
 
 static int __init fuse_init(void)
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a7336b909c61..a87b09839761 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -207,7 +207,7 @@ int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
 
 void gdlm_kobject_release(struct gdlm_ls *ls)
 {
-	kobject_unregister(&ls->kobj);
+	kobject_put(&ls->kobj);
 }
 
 int gdlm_sysfs_init(void)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 8d9cd5bd5845..3a3176b846f3 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -525,7 +525,7 @@ fail_counters:
 fail_lockstruct:
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
-	kobject_unregister(&sdp->sd_kobj);
+	kobject_put(&sdp->sd_kobj);
 fail:
 	fs_err(sdp, "error %d adding sysfs files", error);
 	return error;
@@ -537,7 +537,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
-	kobject_unregister(&sdp->sd_kobj);
+	kobject_put(&sdp->sd_kobj);
 }
 
 int gfs2_sys_init(void)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 97f3f5f064ee..739da701ae7b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -314,7 +314,7 @@ void delete_partition(struct gendisk *disk, int part)
 	p->nr_sects = 0;
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
-	kobject_unregister(p->holder_dir);
+	kobject_put(p->holder_dir);
 	device_del(&p->dev);
 	put_device(&p->dev);
 }
@@ -505,8 +505,8 @@ void del_gendisk(struct gendisk *disk)
 	disk_stat_set_all(disk, 0);
 	disk->stamp = 0;
 
-	kobject_unregister(disk->holder_dir);
-	kobject_unregister(disk->slave_dir);
+	kobject_put(disk->holder_dir);
+	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
 	sysfs_remove_link(block_depr, disk->dev.bus_id);
-- 
cgit v1.2.3


From cc7e79b168a552152299bd8a8254dc099aacc993 Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Fri, 5 Oct 2007 00:27:58 -0400
Subject: [GFS2] Handle multiple glock demote requests

Fix a race condition where multiple glock demote requests are sent to
a node back-to-back. This patch does a check inside handle_callback()
to see whether a demote request is in progress. If true, it sets a flag
to make sure run_queue() will loop again to handle the new request,
instead of erroneously setting gl_demote_state to a different state.

Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 15 ++++++++++++++-
 fs/gfs2/incore.h |  2 ++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a37efe4aae6f..104e83ff874f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -567,7 +567,10 @@ static int rq_demote(struct gfs2_glock *gl)
 		gfs2_demote_wake(gl);
 		return 0;
 	}
+
 	set_bit(GLF_LOCK, &gl->gl_flags);
+	set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+
 	if (gl->gl_demote_state == LM_ST_UNLOCKED ||
 	    gl->gl_state != LM_ST_EXCLUSIVE) {
 		spin_unlock(&gl->gl_spin);
@@ -576,7 +579,9 @@ static int rq_demote(struct gfs2_glock *gl)
 		spin_unlock(&gl->gl_spin);
 		gfs2_glock_xmote_th(gl, NULL);
 	}
+
 	spin_lock(&gl->gl_spin);
+	clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
 
 	return 0;
 }
@@ -606,6 +611,11 @@ static void run_queue(struct gfs2_glock *gl)
 
 		} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
 			blocked = rq_demote(gl);
+			if (gl->gl_waiters2 && !blocked) {
+				set_bit(GLF_DEMOTE, &gl->gl_flags);
+				gl->gl_demote_state = LM_ST_UNLOCKED;
+			}
+			gl->gl_waiters2 = 0;
 		} else if (!list_empty(&gl->gl_waiters3)) {
 			gh = list_entry(gl->gl_waiters3.next,
 					struct gfs2_holder, gh_list);
@@ -722,7 +732,10 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 		}
 	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
 			gl->gl_demote_state != state) {
-		gl->gl_demote_state = LM_ST_UNLOCKED;
+		if (test_bit(GLF_DEMOTE_IN_PROGRESS,  &gl->gl_flags)) 
+			gl->gl_waiters2 = 1;
+		else 
+			gl->gl_demote_state = LM_ST_UNLOCKED;
 	}
 	spin_unlock(&gl->gl_spin);
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eaddfb5a8e6f..662182bfbff7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -171,6 +171,7 @@ enum {
 	GLF_DEMOTE		= 3,
 	GLF_PENDING_DEMOTE	= 4,
 	GLF_DIRTY		= 5,
+	GLF_DEMOTE_IN_PROGRESS	= 6,
 };
 
 struct gfs2_glock {
@@ -190,6 +191,7 @@ struct gfs2_glock {
 	struct list_head gl_holders;
 	struct list_head gl_waiters1;	/* HIF_MUTEX */
 	struct list_head gl_waiters3;	/* HIF_PROMOTE */
+	int gl_waiters2;		/* GIF_DEMOTE */
 
 	const struct gfs2_glock_operations *gl_ops;
 
-- 
cgit v1.2.3


From 51ff87bdd9f21a5d3672517b75d25ab5842d94a8 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 15 Oct 2007 14:42:35 +0100
Subject: [GFS2] Clean up internal read function

As requested by Christoph, this patch cleans up GFS2's internal
read function so that it no longer uses the do_generic_mapping_read
function. This function is obsolete and GFS2 is the last user of it.

As a side effect, the internal read code gets smaller and easier
to read, and gfs2_readpage is split into two: one function has the locking
and the other function has the rest of the logic.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
---
 fs/gfs2/inode.c       |   1 -
 fs/gfs2/ops_address.c | 151 +++++++++++++++++++++++++++++++++-----------------
 fs/gfs2/ops_address.h |   3 +
 fs/gfs2/ops_file.c    |  45 ---------------
 fs/gfs2/ops_file.h    |  24 --------
 fs/gfs2/ops_inode.h   |   4 ++
 fs/gfs2/quota.c       |   7 +--
 fs/gfs2/rgrp.c        |   2 +-
 8 files changed, 111 insertions(+), 126 deletions(-)
 delete mode 100644 fs/gfs2/ops_file.h

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5f6dc32946cd..ad0fe373dca5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -31,7 +31,6 @@
 #include "log.h"
 #include "meta_io.h"
 #include "ops_address.h"
-#include "ops_file.h"
 #include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 9679f8b9870d..9bb24b1d9c05 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -20,6 +20,7 @@
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/swap.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -32,7 +33,6 @@
 #include "quota.h"
 #include "trans.h"
 #include "rgrp.h"
-#include "ops_file.h"
 #include "super.h"
 #include "util.h"
 #include "glops.h"
@@ -231,62 +231,115 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 
 
 /**
- * gfs2_readpage - readpage with locking
- * @file: The file to read a page for. N.B. This may be NULL if we are
- * reading an internal file.
+ * __gfs2_readpage - readpage
+ * @file: The file to read a page for
  * @page: The page to read
  *
- * Returns: errno
+ * This is the core of gfs2's readpage. Its used by the internal file
+ * reading code as in that case we already hold the glock. Also its
+ * called by gfs2_readpage() once the required lock has been granted.
+ *
  */
 
-static int gfs2_readpage(struct file *file, struct page *page)
+static int __gfs2_readpage(void *file, struct page *page)
 {
 	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-	struct gfs2_file *gf = NULL;
-	struct gfs2_holder gh;
 	int error;
-	int do_unlock = 0;
-
-	if (likely(file != &gfs2_internal_file_sentinel)) {
-		if (file) {
-			gf = file->private_data;
-			if (test_bit(GFF_EXLOCK, &gf->f_flags))
-				/* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
-				goto skip_lock;
-		}
-		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
-		do_unlock = 1;
-		error = gfs2_glock_nq_atime(&gh);
-		if (unlikely(error))
-			goto out_unlock;
-	}
 
-skip_lock:
 	if (gfs2_is_stuffed(ip)) {
 		error = stuffed_readpage(ip, page);
 		unlock_page(page);
-	} else
+	} else {
 		error = mpage_readpage(page, gfs2_get_block);
+	}
 
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = -EIO;
+		return -EIO;
 
-	if (do_unlock) {
-		gfs2_glock_dq_m(1, &gh);
-		gfs2_holder_uninit(&gh);
+	return error;
+}
+
+/**
+ * gfs2_readpage - read a page of a file
+ * @file: The file to read
+ * @page: The page of the file
+ *
+ * This deals with the locking required. If the GFF_EXLOCK flags is set
+ * then we already hold the glock (due to page fault) and thus we call
+ * __gfs2_readpage() directly. Otherwise we use a trylock in order to
+ * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
+ * in the event that we are unable to get the lock.
+ */
+
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+	struct gfs2_holder gh;
+	int error;
+
+	if (file) {
+		struct gfs2_file *gf = file->private_data;
+		if (test_bit(GFF_EXLOCK, &gf->f_flags))
+			return __gfs2_readpage(file, page);
 	}
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
+	error = gfs2_glock_nq_atime(&gh);
+	if (unlikely(error)) {
+		unlock_page(page);
+		goto out;
+	}
+	error = __gfs2_readpage(file, page);
+	gfs2_glock_dq(&gh);
 out:
-	return error;
-out_unlock:
-	unlock_page(page);
+	gfs2_holder_uninit(&gh);
 	if (error == GLR_TRYFAILED) {
-		error = AOP_TRUNCATED_PAGE;
 		yield();
+		return AOP_TRUNCATED_PAGE;
 	}
-	if (do_unlock)
-		gfs2_holder_uninit(&gh);
-	goto out;
+	return error;
+}
+
+/**
+ * gfs2_internal_read - read an internal file
+ * @ip: The gfs2 inode
+ * @ra_state: The readahead state (or NULL for no readahead)
+ * @buf: The buffer to fill
+ * @pos: The file position
+ * @size: The amount to read
+ *
+ */
+
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+                       char *buf, loff_t *pos, unsigned size)
+{
+	struct address_space *mapping = ip->i_inode.i_mapping;
+	unsigned long index = *pos / PAGE_CACHE_SIZE;
+	unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+	unsigned copied = 0;
+	unsigned amt;
+	struct page *page;
+	void *p;
+
+	do {
+		amt = size - copied;
+		if (offset + size > PAGE_CACHE_SIZE)
+			amt = PAGE_CACHE_SIZE - offset;
+		page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		p = kmap_atomic(page, KM_USER0);
+		memcpy(buf + copied, p + offset, amt);
+		kunmap_atomic(p, KM_USER0);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		copied += amt;
+		index++;
+		offset = 0;
+	} while(copied < size);
+	(*pos) += size;
+	return size;
 }
 
 /**
@@ -314,21 +367,19 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 	int ret = 0;
 	int do_unlock = 0;
 
-	if (likely(file != &gfs2_internal_file_sentinel)) {
-		if (file) {
-			struct gfs2_file *gf = file->private_data;
-			if (test_bit(GFF_EXLOCK, &gf->f_flags))
-				goto skip_lock;
-		}
-		gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
-				 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
-		do_unlock = 1;
-		ret = gfs2_glock_nq_atime(&gh);
-		if (ret == GLR_TRYFAILED)
-			goto out_noerror;
-		if (unlikely(ret))
-			goto out_unlock;
+	if (file) {
+		struct gfs2_file *gf = file->private_data;
+		if (test_bit(GFF_EXLOCK, &gf->f_flags))
+			goto skip_lock;
 	}
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
+			 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
+	do_unlock = 1;
+	ret = gfs2_glock_nq_atime(&gh);
+	if (ret == GLR_TRYFAILED)
+		goto out_noerror;
+	if (unlikely(ret))
+		goto out_unlock;
 skip_lock:
 	if (!gfs2_is_stuffed(ip))
 		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index fa1b5b3d28b9..e8fe83fcd583 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -18,5 +18,8 @@ extern const struct address_space_operations gfs2_file_aops;
 extern int gfs2_get_block(struct inode *inode, sector_t lblock,
 			  struct buffer_head *bh_result, int create);
 extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+			      struct file_ra_state *ra_state,
+			      char *buf, loff_t *pos, unsigned size);
 
 #endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bb11fd6752d3..a729c86b8be1 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -33,7 +33,6 @@
 #include "lm.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_file.h"
 #include "ops_vm.h"
 #include "quota.h"
 #include "rgrp.h"
@@ -41,50 +40,6 @@
 #include "util.h"
 #include "eaops.h"
 
-/*
- * Most fields left uninitialised to catch anybody who tries to
- * use them. f_flags set to prevent file_accessed() from touching
- * any other part of this. Its use is purely as a flag so that we
- * know (in readpage()) whether or not do to locking.
- */
-struct file gfs2_internal_file_sentinel = {
-	.f_flags = O_NOATIME|O_RDONLY,
-};
-
-static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
-			   unsigned long offset, unsigned long size)
-{
-	char *kaddr;
-	unsigned long count = desc->count;
-
-	if (size > count)
-		size = count;
-
-	kaddr = kmap(page);
-	memcpy(desc->arg.data, kaddr + offset, size);
-	kunmap(page);
-
-	desc->count = count - size;
-	desc->written += size;
-	desc->arg.buf += size;
-	return size;
-}
-
-int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
-		       char *buf, loff_t *pos, unsigned size)
-{
-	struct inode *inode = &ip->i_inode;
-	read_descriptor_t desc;
-	desc.written = 0;
-	desc.arg.data = buf;
-	desc.count = size;
-	desc.error = 0;
-	do_generic_mapping_read(inode->i_mapping, ra_state,
-				&gfs2_internal_file_sentinel, pos, &desc,
-				gfs2_read_actor);
-	return desc.written ? desc.written : desc.error;
-}
-
 /**
  * gfs2_llseek - seek to a location in a file
  * @file: the file
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
deleted file mode 100644
index 7e5d8ec9c846..000000000000
--- a/fs/gfs2/ops_file.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_FILE_DOT_H__
-#define __OPS_FILE_DOT_H__
-
-#include <linux/fs.h>
-struct gfs2_inode;
-
-extern struct file gfs2_internal_file_sentinel;
-extern int gfs2_internal_read(struct gfs2_inode *ip,
-			      struct file_ra_state *ra_state,
-			      char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_inode_flags(struct inode *inode);
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-
-#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index 34f0caac1a03..edb519cb05ee 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -16,5 +16,9 @@ extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
 extern const struct inode_operations gfs2_dev_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
+extern void gfs2_set_inode_flags(struct inode *inode);
 
 #endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index addb51e0f135..4996f0ef3007 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -59,7 +59,6 @@
 #include "super.h"
 #include "trans.h"
 #include "inode.h"
-#include "ops_file.h"
 #include "ops_address.h"
 #include "util.h"
 
@@ -793,11 +792,9 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
 	struct gfs2_holder i_gh;
 	struct gfs2_quota_host q;
 	char buf[sizeof(struct gfs2_quota)];
-	struct file_ra_state ra_state;
 	int error;
 	struct gfs2_quota_lvb *qlvb;
 
-	file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
 restart:
 	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
 	if (error)
@@ -820,8 +817,8 @@ restart:
 
 		memset(buf, 0, sizeof(struct gfs2_quota));
 		pos = qd2offset(qd);
-		error = gfs2_internal_read(ip, &ra_state, buf,
-					   &pos, sizeof(struct gfs2_quota));
+		error = gfs2_internal_read(ip, NULL, buf, &pos,
+					   sizeof(struct gfs2_quota));
 		if (error < 0)
 			goto fail_gunlock;
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 708c287e1d0e..09848aac45f6 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -25,10 +25,10 @@
 #include "rgrp.h"
 #include "super.h"
 #include "trans.h"
-#include "ops_file.h"
 #include "util.h"
 #include "log.h"
 #include "inode.h"
+#include "ops_address.h"
 
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
-- 
cgit v1.2.3


From 3cc3f710ce0effe397b830826a1a081fa81f11c7 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 15 Oct 2007 15:40:33 +0100
Subject: [GFS2] Use ->page_mkwrite() for mmap()

This cleans up the mmap() code path for GFS2 by implementing the
page_mkwrite function for GFS2. We are thus able to use the
generic filemap_fault function for our ->fault() implementation.

This now means that shared writable mappings will be much more
efficiently shared across the cluster if there is a reasonable
proportion of read activity (the greater the proportion, the better).

As a side effect, it also reduces the size of the code, removes
special cases from readpage and readpages, and makes the code
path easier to follow.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile      |   2 +-
 fs/gfs2/glops.c       |   9 +--
 fs/gfs2/incore.h      |   8 ---
 fs/gfs2/ops_address.c |  45 +++-----------
 fs/gfs2/ops_file.c    | 131 +++++++++++++++++++++++++++++++++++---
 fs/gfs2/ops_vm.c      | 169 --------------------------------------------------
 fs/gfs2/ops_vm.h      |  18 ------
 7 files changed, 131 insertions(+), 251 deletions(-)
 delete mode 100644 fs/gfs2/ops_vm.c
 delete mode 100644 fs/gfs2/ops_vm.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 04ad0caebedb..8fff11058cee 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
-	ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
 obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4670dcb2a877..110f03d66f4b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -86,15 +86,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
 	if (!ip || !S_ISREG(inode->i_mode))
 		return;
 
-	if (!test_bit(GIF_PAGED, &ip->i_flags))
-		return;
-
 	unmap_shared_mapping_range(inode->i_mapping, 0, 0);
-
 	if (test_bit(GIF_SW_PAGED, &ip->i_flags))
 		set_bit(GLF_DIRTY, &gl->gl_flags);
 
-	clear_bit(GIF_SW_PAGED, &ip->i_flags);
 }
 
 /**
@@ -234,10 +229,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 			set_bit(GIF_INVALID, &ip->i_flags);
 	}
 
-	if (ip && S_ISREG(ip->i_inode.i_mode)) {
+	if (ip && S_ISREG(ip->i_inode.i_mode))
 		truncate_inode_pages(ip->i_inode.i_mapping, 0);
-		clear_bit(GIF_PAGED, &ip->i_flags);
-	}
 }
 
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 662182bfbff7..55c72f01cf31 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -241,7 +241,6 @@ struct gfs2_alloc {
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
-	GIF_PAGED		= 2,
 	GIF_SW_PAGED		= 3,
 };
 
@@ -289,19 +288,12 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode)
 	return container_of(inode, struct gfs2_inode, i_inode);
 }
 
-/* To be removed? */
 static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
 {
 	return inode->i_sb->s_fs_info;
 }
 
-enum {
-	GFF_DID_DIRECT_ALLOC	= 0,
-	GFF_EXLOCK = 1,
-};
-
 struct gfs2_file {
-	unsigned long f_flags;		/* GFF_... */
 	struct mutex f_fl_mutex;
 	struct gfs2_holder f_fl_gh;
 };
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 9bb24b1d9c05..1696e5d9d112 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -265,9 +265,7 @@ static int __gfs2_readpage(void *file, struct page *page)
  * @file: The file to read
  * @page: The page of the file
  *
- * This deals with the locking required. If the GFF_EXLOCK flags is set
- * then we already hold the glock (due to page fault) and thus we call
- * __gfs2_readpage() directly. Otherwise we use a trylock in order to
+ * This deals with the locking required. We use a trylock in order to
  * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
  * in the event that we are unable to get the lock.
  */
@@ -278,12 +276,6 @@ static int gfs2_readpage(struct file *file, struct page *page)
 	struct gfs2_holder gh;
 	int error;
 
-	if (file) {
-		struct gfs2_file *gf = file->private_data;
-		if (test_bit(GFF_EXLOCK, &gf->f_flags))
-			return __gfs2_readpage(file, page);
-	}
-
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
 	error = gfs2_glock_nq_atime(&gh);
 	if (unlikely(error)) {
@@ -354,9 +346,8 @@ int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
  * 2. We don't handle stuffed files here we let readpage do the honours.
  * 3. mpage_readpages() does most of the heavy lifting in the common case.
  * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
- * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
- *    well as read-ahead.
  */
+
 static int gfs2_readpages(struct file *file, struct address_space *mapping,
 			  struct list_head *pages, unsigned nr_pages)
 {
@@ -364,40 +355,20 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_holder gh;
-	int ret = 0;
-	int do_unlock = 0;
+	int ret;
 
-	if (file) {
-		struct gfs2_file *gf = file->private_data;
-		if (test_bit(GFF_EXLOCK, &gf->f_flags))
-			goto skip_lock;
-	}
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
-			 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
-	do_unlock = 1;
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
 	ret = gfs2_glock_nq_atime(&gh);
-	if (ret == GLR_TRYFAILED)
-		goto out_noerror;
 	if (unlikely(ret))
-		goto out_unlock;
-skip_lock:
+		goto out_uninit;
 	if (!gfs2_is_stuffed(ip))
 		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
-
-	if (do_unlock) {
-		gfs2_glock_dq_m(1, &gh);
-		gfs2_holder_uninit(&gh);
-	}
-out:
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
 		ret = -EIO;
 	return ret;
-out_noerror:
-	ret = 0;
-out_unlock:
-	if (do_unlock)
-		gfs2_holder_uninit(&gh);
-	goto out;
 }
 
 /**
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index a729c86b8be1..6f3aeb059c61 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -33,7 +33,6 @@
 #include "lm.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_vm.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
@@ -169,7 +168,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	if (put_user(fsflags, ptr))
 		error = -EFAULT;
 
-	gfs2_glock_dq_m(1, &gh);
+	gfs2_glock_dq(&gh);
 	gfs2_holder_uninit(&gh);
 	return error;
 }
@@ -293,6 +292,125 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	return -ENOTTY;
 }
 
+/**
+ * gfs2_allocate_page_backing - Use bmap to allocate blocks
+ * @page: The (locked) page to allocate backing for
+ *
+ * We try to allocate all the blocks required for the page in
+ * one go. This might fail for various reasons, so we keep
+ * trying until all the blocks to back this page are allocated.
+ * If some of the blocks are already allocated, thats ok too.
+ */
+
+static int gfs2_allocate_page_backing(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head bh;
+	unsigned long size = PAGE_CACHE_SIZE;
+	u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	do {
+		bh.b_state = 0;
+		bh.b_size = size;
+		gfs2_block_map(inode, lblock, 1, &bh);
+		if (!buffer_mapped(&bh))
+			return -EIO;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> inode->i_blkbits);
+	} while(size > 0);
+	return 0;
+}
+
+/**
+ * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
+ * @vma: The virtual memory area
+ * @page: The page which is about to become writable
+ *
+ * When the page becomes writable, we need to ensure that we have
+ * blocks allocated on disk to back that page.
+ */
+
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned long last_index;
+	u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
+	unsigned int data_blocks, ind_blocks, rblocks;
+	int alloc_required = 0;
+	struct gfs2_holder gh;
+	struct gfs2_alloc *al;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
+	ret = gfs2_glock_nq_atime(&gh);
+	if (ret)
+		goto out;
+
+	set_bit(GIF_SW_PAGED, &ip->i_flags);
+	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
+	if (ret || !alloc_required)
+		goto out_unlock;
+
+	ip->i_alloc.al_requested = 0;
+	al = gfs2_alloc_get(ip);
+	ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (ret)
+		goto out_alloc_put;
+	ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+	if (ret)
+		goto out_quota_unlock;
+	al->al_requested = data_blocks + ind_blocks;
+	ret = gfs2_inplace_reserve(ip);
+	if (ret)
+		goto out_quota_unlock;
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+	ret = gfs2_trans_begin(sdp, rblocks, 0);
+	if (ret)
+		goto out_trans_fail;
+
+	lock_page(page);
+	ret = -EINVAL;
+	last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
+	if (page->index > last_index)
+		goto out_unlock_page;
+	if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
+		goto out_unlock_page;
+	if (gfs2_is_stuffed(ip)) {
+		ret = gfs2_unstuff_dinode(ip, page);
+		if (ret)
+			goto out_unlock_page;
+	}
+	ret = gfs2_allocate_page_backing(page);
+
+out_unlock_page:
+	unlock_page(page);
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	gfs2_inplace_release(ip);
+out_quota_unlock:
+	gfs2_quota_unlock(ip);
+out_alloc_put:
+	gfs2_alloc_put(ip);
+out_unlock:
+	gfs2_glock_dq(&gh);
+out:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static struct vm_operations_struct gfs2_vm_ops = {
+	.fault = filemap_fault,
+	.page_mkwrite = gfs2_page_mkwrite,
+};
+
 
 /**
  * gfs2_mmap -
@@ -315,14 +433,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 		return error;
 	}
 
-	/* This is VM_MAYWRITE instead of VM_WRITE because a call
-	   to mprotect() can turn on VM_WRITE later. */
-
-	if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
-	    (VM_MAYSHARE | VM_MAYWRITE))
-		vma->vm_ops = &gfs2_vm_ops_sharewrite;
-	else
-		vma->vm_ops = &gfs2_vm_ops_private;
+	vma->vm_ops = &gfs2_vm_ops;
 
 	gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
deleted file mode 100644
index 927d739d4685..000000000000
--- a/fs/gfs2/ops_vm.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "bmap.h"
-#include "glock.h"
-#include "inode.h"
-#include "ops_vm.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-
-static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
-
-	set_bit(GIF_PAGED, &ip->i_flags);
-	return filemap_fault(vma, vmf);
-}
-
-static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned long index = page->index;
-	u64 lblock = index << (PAGE_CACHE_SHIFT -
-				    sdp->sd_sb.sb_bsize_shift);
-	unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
-	struct gfs2_alloc *al;
-	unsigned int data_blocks, ind_blocks;
-	unsigned int x;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-
-	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto out;
-
-	error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
-	if (error)
-		goto out_gunlock_q;
-
-	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-
-	al->al_requested = data_blocks + ind_blocks;
-
-	error = gfs2_inplace_reserve(ip);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_trans_begin(sdp, al->al_rgd->rd_length +
-				 ind_blocks + RES_DINODE +
-				 RES_STATFS + RES_QUOTA, 0);
-	if (error)
-		goto out_ipres;
-
-	if (gfs2_is_stuffed(ip)) {
-		error = gfs2_unstuff_dinode(ip, NULL);
-		if (error)
-			goto out_trans;
-	}
-
-	for (x = 0; x < blocks; ) {
-		u64 dblock;
-		unsigned int extlen;
-		int new = 1;
-
-		error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
-		if (error)
-			goto out_trans;
-
-		lblock += extlen;
-		x += extlen;
-	}
-
-	gfs2_assert_warn(sdp, al->al_alloced);
-
-out_trans:
-	gfs2_trans_end(sdp);
-out_ipres:
-	gfs2_inplace_release(ip);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
-						struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct gfs2_file *gf = file->private_data;
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	struct gfs2_holder i_gh;
-	int alloc_required;
-	int error;
-	int ret = 0;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-	if (error)
-		goto out;
-
-	set_bit(GIF_PAGED, &ip->i_flags);
-	set_bit(GIF_SW_PAGED, &ip->i_flags);
-
-	error = gfs2_write_alloc_required(ip,
-					(u64)vmf->pgoff << PAGE_CACHE_SHIFT,
-					PAGE_CACHE_SIZE, &alloc_required);
-	if (error) {
-		ret = VM_FAULT_OOM; /* XXX: are these right? */
-		goto out_unlock;
-	}
-
-	set_bit(GFF_EXLOCK, &gf->f_flags);
-	ret = filemap_fault(vma, vmf);
-	clear_bit(GFF_EXLOCK, &gf->f_flags);
-	if (ret & VM_FAULT_ERROR)
-		goto out_unlock;
-
-	if (alloc_required) {
-		/* XXX: do we need to drop page lock around alloc_page_backing?*/
-		error = alloc_page_backing(ip, vmf->page);
-		if (error) {
-			/*
-			 * VM_FAULT_LOCKED should always be the case for
-			 * filemap_fault, but it may not be in a future
-			 * implementation.
-			 */
-			if (ret & VM_FAULT_LOCKED)
-				unlock_page(vmf->page);
-			page_cache_release(vmf->page);
-			ret = VM_FAULT_OOM;
-			goto out_unlock;
-		}
-		set_page_dirty(vmf->page);
-	}
-
-out_unlock:
-	gfs2_glock_dq_uninit(&i_gh);
-out:
-	return ret;
-}
-
-struct vm_operations_struct gfs2_vm_ops_private = {
-	.fault = gfs2_private_fault,
-};
-
-struct vm_operations_struct gfs2_vm_ops_sharewrite = {
-	.fault = gfs2_sharewrite_fault,
-};
-
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
deleted file mode 100644
index 4ae8f43ed5e3..000000000000
--- a/fs/gfs2/ops_vm.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_VM_DOT_H__
-#define __OPS_VM_DOT_H__
-
-#include <linux/mm.h>
-
-extern struct vm_operations_struct gfs2_vm_ops_private;
-extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
-
-#endif /* __OPS_VM_DOT_H__ */
-- 
cgit v1.2.3


From f91a0d3e24e4b0198be5fae20d45a35c40d1efce Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 15 Oct 2007 16:29:05 +0100
Subject: [GFS2] Remove useless i_cache from inodes

The i_cache was designed to keep references to the indirect blocks
used during block mapping so that they didn't have to be looked
up continually. The idea failed because there are too many places
where the i_cache needs to be freed, and this has in the past been
the cause of many bugs.

In addition there was no performance benefit being gained since the
disk blocks in question were cached anyway. So this patch removes
it in order to simplify the code to prepare for other changes which
would otherwise have had to add further support for this feature.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c       | 21 +------------
 fs/gfs2/incore.h      |  2 --
 fs/gfs2/inode.c       | 13 ++++-----
 fs/gfs2/log.c         |  6 ++--
 fs/gfs2/log.h         |  2 +-
 fs/gfs2/main.c        |  1 -
 fs/gfs2/meta_io.c     | 81 +++++++--------------------------------------------
 fs/gfs2/meta_io.h     |  1 -
 fs/gfs2/ops_address.c |  1 -
 fs/gfs2/super.c       |  1 -
 10 files changed, 19 insertions(+), 110 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 110f03d66f4b..ba124230393b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,7 +56,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 		bd = list_entry(head->next, struct gfs2_bufdata,
 				bd_ail_gl_list);
 		bh = bd->bd_bh;
-		gfs2_remove_from_ail(NULL, bd);
+		gfs2_remove_from_ail(bd);
 		bd->bd_bh = NULL;
 		bh->b_private = NULL;
 		bd->bd_blkno = bh->b_blocknr;
@@ -286,23 +286,6 @@ static int inode_go_lock(struct gfs2_holder *gh)
 	return error;
 }
 
-/**
- * inode_go_unlock - operation done before an inode lock is unlocked by a
- *		     process
- * @gl: the glock
- * @flags:
- *
- */
-
-static void inode_go_unlock(struct gfs2_holder *gh)
-{
-	struct gfs2_glock *gl = gh->gh_gl;
-	struct gfs2_inode *ip = gl->gl_object;
-
-	if (ip)
-		gfs2_meta_cache_flush(ip);
-}
-
 /**
  * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
  * @gl: the glock
@@ -377,7 +360,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
 
 	if (gl->gl_state != LM_ST_UNLOCKED &&
 	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
 		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
 		error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -437,7 +419,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_inval = inode_go_inval,
 	.go_demote_ok = inode_go_demote_ok,
 	.go_lock = inode_go_lock,
-	.go_unlock = inode_go_unlock,
 	.go_type = LM_TYPE_INODE,
 	.go_min_hold_time = HZ / 10,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 55c72f01cf31..5662ff9f86e1 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -275,8 +275,6 @@ struct gfs2_inode {
 	spinlock_t i_spin;
 	struct rw_semaphore i_rw_mutex;
 	unsigned long i_last_pfault;
-
-	struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
 };
 
 /*
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index ad0fe373dca5..af493fc6c8ce 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -293,11 +293,6 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	return 0;
 }
 
-static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh)
-{
-	ip->i_cache[0] = bh;
-}
-
 /**
  * gfs2_inode_refresh - Refresh the incore copy of the dinode
  * @ip: The GFS2 inode
@@ -965,7 +960,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
 	int error;
 	u64 generation;
-	struct buffer_head *bh=NULL;
+	struct buffer_head *bh = NULL;
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -1002,8 +997,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
-	gfs2_inode_bh(GFS2_I(inode), bh);
-
 	error = gfs2_inode_refresh(GFS2_I(inode));
 	if (error)
 		goto fail_gunlock2;
@@ -1020,6 +1013,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock2;
 
+	if (bh)
+		brelse(bh);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	return inode;
@@ -1031,6 +1026,8 @@ fail_gunlock2:
 fail_gunlock:
 	gfs2_glock_dq(ghs);
 fail:
+	if (bh)
+		brelse(bh);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 7df702473252..70b404d2774b 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -68,14 +68,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
  *
  */
 
-void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 {
 	bd->bd_ail = NULL;
 	list_del_init(&bd->bd_ail_st_list);
 	list_del_init(&bd->bd_ail_gl_list);
 	atomic_dec(&bd->bd_gl->gl_ail_count);
-	if (mapping)
-		gfs2_meta_cache_flush(GFS2_I(mapping->host));
 	brelse(bd->bd_bh);
 }
 
@@ -248,7 +246,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 		bd = list_entry(head->prev, struct gfs2_bufdata,
 				bd_ail_st_list);
 		gfs2_assert(sdp, bd->bd_ail == ai);
-		gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd);
+		gfs2_remove_from_ail(bd);
 	}
 }
 
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index dae282400627..24e7161486e2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -59,7 +59,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real);
 void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd);
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 
 void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7ecfe0d3a491..653fd5a6203a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -31,7 +31,6 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
 	inode_init_once(&ip->i_inode);
 	spin_lock_init(&ip->i_spin);
 	init_rwsem(&ip->i_rw_mutex);
-	memset(ip->i_cache, 0, sizeof(ip->i_cache));
 }
 
 static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4da423985e4f..01ef90253ed1 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -317,7 +317,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	}
 	if (bd) {
 		if (bd->bd_ail) {
-			gfs2_remove_from_ail(NULL, bd);
+			gfs2_remove_from_ail(bd);
 			bh->b_private = NULL;
 			bd->bd_bh = NULL;
 			bd->bd_blkno = bh->b_blocknr;
@@ -357,32 +357,6 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	}
 }
 
-/**
- * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
- * @ip: The GFS2 inode
- *
- * This releases buffers that are in the most-recently-used array of
- * blocks used for indirect block addressing for this inode.
- */
-
-void gfs2_meta_cache_flush(struct gfs2_inode *ip)
-{
-	struct buffer_head **bh_slot;
-	unsigned int x;
-
-	spin_lock(&ip->i_spin);
-
-	for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
-		bh_slot = &ip->i_cache[x];
-		if (*bh_slot) {
-			brelse(*bh_slot);
-			*bh_slot = NULL;
-		}
-	}
-
-	spin_unlock(&ip->i_spin);
-}
-
 /**
  * gfs2_meta_indirect_buffer - Get a metadata buffer
  * @ip: The GFS2 inode
@@ -391,8 +365,6 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
  * @new: Non-zero if we may create a new buffer
  * @bhp: the buffer is returned here
  *
- * Try to use the gfs2_inode's MRU metadata tree cache.
- *
  * Returns: errno
  */
 
@@ -401,58 +373,25 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_glock *gl = ip->i_gl;
-	struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
-	int in_cache = 0;
-
-	BUG_ON(!gl);
-	BUG_ON(!sdp);
-
-	spin_lock(&ip->i_spin);
-	if (*bh_slot && (*bh_slot)->b_blocknr == num) {
-		bh = *bh_slot;
-		get_bh(bh);
-		in_cache = 1;
-	}
-	spin_unlock(&ip->i_spin);
-
-	if (!bh)
-		bh = getbuf(gl, num, CREATE);
-
-	if (!bh)
-		return -ENOBUFS;
+	struct buffer_head *bh;
+	int ret = 0;
 
 	if (new) {
-		if (gfs2_assert_warn(sdp, height))
-			goto err;
-		meta_prep_new(bh);
+		BUG_ON(height == 0);
+		bh = gfs2_meta_new(gl, num);
 		gfs2_trans_add_bh(ip->i_gl, bh, 1);
 		gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 		gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
 	} else {
 		u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
-		if (!buffer_uptodate(bh)) {
-			ll_rw_block(READ_META, 1, &bh);
-			if (gfs2_meta_wait(sdp, bh))
-				goto err;
+		ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+		if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
+			brelse(bh);
+			ret = -EIO;
 		}
-		if (gfs2_metatype_check(sdp, bh, mtype))
-			goto err;
-	}
-
-	if (!in_cache) {
-		spin_lock(&ip->i_spin);
-		if (*bh_slot)
-			brelse(*bh_slot);
-		*bh_slot = bh;
-		get_bh(bh);
-		spin_unlock(&ip->i_spin);
 	}
-
 	*bhp = bh;
-	return 0;
-err:
-	brelse(bh);
-	return -EIO;
+	return ret;
 }
 
 /**
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b7048222ebb4..73e3b1c76fe1 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,7 +56,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
 
 void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
 
-void gfs2_meta_cache_flush(struct gfs2_inode *ip);
 int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 			      int new, struct buffer_head **bhp);
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 1696e5d9d112..4c4ef7f59909 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -154,7 +154,6 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
 	error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
 	if (done_trans)
 		gfs2_trans_end(sdp);
-	gfs2_meta_cache_flush(ip);
 	return error;
 
 out_ignore:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index dd3e737f528e..5183dfb9342a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -543,7 +543,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 	if (error)
 		return error;
 
-	gfs2_meta_cache_flush(ip);
 	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
 	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
-- 
cgit v1.2.3


From e7e36f143565d14950055c893cfaf4400ad64d34 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 16 Oct 2007 11:47:04 +0100
Subject: [GFS2] Remove unused field in struct gfs2_inode

Removes a field that is not used.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h    | 1 -
 fs/gfs2/ops_super.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 5662ff9f86e1..e53da7d4cfff 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -274,7 +274,6 @@ struct gfs2_inode {
 
 	spinlock_t i_spin;
 	struct rw_semaphore i_rw_mutex;
-	unsigned long i_last_pfault;
 };
 
 /*
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 950f31460e8b..5e524217944a 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -487,7 +487,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 	if (ip) {
 		ip->i_flags = 0;
 		ip->i_gl = NULL;
-		ip->i_last_pfault = jiffies;
 	}
 	return &ip->i_inode;
 }
-- 
cgit v1.2.3


From bf36a713169432643d4fc7eeb4e0ace96d791d26 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 17 Oct 2007 08:35:19 +0100
Subject: [GFS2] Add gfs2_is_writeback()

This adds a function "gfs2_is_writeback()" along the lines of the
existing "gfs2_is_jdata()" in order to clean up the code and make
the various tests for the inode mode more obvious. It also fixes
the PageChecked() logic where we were resetting the flag too early
in the case of an error path.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        |  6 ++----
 fs/gfs2/incore.h      |  2 +-
 fs/gfs2/inode.h       |  6 ++++++
 fs/gfs2/ops_address.c | 10 ++++------
 4 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93fa427bb5f5..1cfd493e30fb 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -59,7 +59,6 @@ struct strip_mine {
 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 			       u64 block, struct page *page)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct buffer_head *bh;
 	int release = 0;
@@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 	set_buffer_uptodate(bh);
 	if (!gfs2_is_jdata(ip))
 		mark_buffer_dirty(bh);
-	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+	if (!gfs2_is_writeback(ip))
 		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	if (release) {
@@ -879,7 +878,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	loff_t from = inode->i_size;
 	unsigned long index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -931,7 +929,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 		err = 0;
 	}
 
-	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+	if (!gfs2_is_writeback(ip))
 		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	zero_user_page(page, offset, length, KM_USER0);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e53da7d4cfff..82dfe9bd270b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -285,7 +285,7 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode)
 	return container_of(inode, struct gfs2_inode, i_inode);
 }
 
-static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
+static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode)
 {
 	return inode->i_sb->s_fs_info;
 }
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 351ac87ab384..bed3dc212a18 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -20,6 +20,12 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
 	return ip->i_di.di_flags & GFS2_DIF_JDATA;
 }
 
+static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
+}
+
 static inline int gfs2_is_dir(const struct gfs2_inode *ip)
 {
 	return S_ISDIR(ip->i_inode.i_mode);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4c4ef7f59909..ed154af86171 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -138,12 +138,11 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
 		return 0; /* don't care */
 	}
 
-	if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) &&
-	    PageChecked(page)) {
-		ClearPageChecked(page);
+	if (PageChecked(page)) {
 		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
 		if (error)
 			goto out_ignore;
+		ClearPageChecked(page);
 		if (!page_has_buffers(page)) {
 			create_empty_buffers(page, inode->i_sb->s_blocksize,
 					     (1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -180,9 +179,8 @@ static int gfs2_writepages(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
 
-	if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip))
+	if (gfs2_is_writeback(ip))
 		return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
 
 	return generic_writepages(mapping, wbc);
@@ -606,7 +604,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	if (gfs2_is_stuffed(ip))
 		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
 
-	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+	if (!gfs2_is_writeback(ip))
 		gfs2_page_add_databufs(ip, page, from, to);
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-- 
cgit v1.2.3


From 5561093e2cac9f7d2a77e39cc689b8d2b7f9b2bc Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 17 Oct 2007 08:47:38 +0100
Subject: [GFS2] Introduce gfs2_set_aops()

Just like ext3 we now have three sets of address space operations
to cover the cases of writeback, ordered and journalled data
writes. This means that the individual operations can now become
less complicated as we are able to remove some of the tests for
file data mode from the code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c       |  4 ++-
 fs/gfs2/inode.h       |  6 ++++
 fs/gfs2/ops_address.c | 78 +++++++++++++++++++++++++++++++++------------------
 fs/gfs2/ops_address.h |  2 +-
 fs/gfs2/ops_file.c    | 13 ++++++++-
 5 files changed, 72 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index af493fc6c8ce..532784eb5ba4 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -136,7 +136,6 @@ void gfs2_set_iop(struct inode *inode)
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
 		inode->i_fop = &gfs2_file_fops;
-		inode->i_mapping->a_ops = &gfs2_file_aops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
 		inode->i_fop = &gfs2_dir_fops;
@@ -290,6 +289,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_entries = be32_to_cpu(str->di_entries);
 
 	di->di_eattr = be64_to_cpu(str->di_eattr);
+	if (S_ISREG(ip->i_inode.i_mode))
+		gfs2_set_aops(&ip->i_inode);
+
 	return 0;
 }
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index bed3dc212a18..d44650662615 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -26,6 +26,12 @@ static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
 	return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
 }
 
+static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
+}
+
 static inline int gfs2_is_dir(const struct gfs2_inode *ip)
 {
 	return S_ISDIR(ip->i_inode.i_mode);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ed154af86171..207014f363d8 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -162,28 +162,18 @@ out_ignore:
 }
 
 /**
- * gfs2_writepages - Write a bunch of dirty pages back to disk
+ * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
  * @mapping: The mapping to write
  * @wbc: Write-back control
  *
- * For journaled files and/or ordered writes this just falls back to the
- * kernel's default writepages path for now. We will probably want to change
- * that eventually (i.e. when we look at allocate on flush).
- *
- * For the data=writeback case though we can already ignore buffer heads
+ * For the data=writeback case we can already ignore buffer heads
  * and write whole extents at once. This is a big reduction in the
  * number of I/O requests we send and the bmap calls we make in this case.
  */
-static int gfs2_writepages(struct address_space *mapping,
-			   struct writeback_control *wbc)
+static int gfs2_writeback_writepages(struct address_space *mapping,
+				     struct writeback_control *wbc)
 {
-	struct inode *inode = mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	if (gfs2_is_writeback(ip))
-		return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
-
-	return generic_writepages(mapping, wbc);
+	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
 }
 
 /**
@@ -644,11 +634,7 @@ failed:
  
 static int gfs2_set_page_dirty(struct page *page)
 {
-	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-
-	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
-		SetPageChecked(page);
+	SetPageChecked(page);
 	return __set_page_dirty_buffers(page);
 }
 
@@ -738,13 +724,9 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
 {
 	/*
 	 * Should we return an error here? I can't see that O_DIRECT for
-	 * a journaled file makes any sense. For now we'll silently fall
-	 * back to buffered I/O, likewise we do the same for stuffed
-	 * files since they are (a) small and (b) unaligned.
+	 * a stuffed file makes any sense. For now we'll silently fall
+	 * back to buffered I/O
 	 */
-	if (gfs2_is_jdata(ip))
-		return 0;
-
 	if (gfs2_is_stuffed(ip))
 		return 0;
 
@@ -855,9 +837,22 @@ cannot_release:
 	return 0;
 }
 
-const struct address_space_operations gfs2_file_aops = {
+static const struct address_space_operations gfs2_writeback_aops = {
+	.writepage = gfs2_writepage,
+	.writepages = gfs2_writeback_writepages,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.sync_page = block_sync_page,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.direct_IO = gfs2_direct_IO,
+};
+
+static const struct address_space_operations gfs2_ordered_aops = {
 	.writepage = gfs2_writepage,
-	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
 	.sync_page = block_sync_page,
@@ -870,3 +865,30 @@ const struct address_space_operations gfs2_file_aops = {
 	.direct_IO = gfs2_direct_IO,
 };
 
+static const struct address_space_operations gfs2_jdata_aops = {
+	.writepage = gfs2_writepage,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.sync_page = block_sync_page,
+	.write_begin = gfs2_write_begin,
+	.write_end = gfs2_write_end,
+	.set_page_dirty = gfs2_set_page_dirty,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+};
+
+void gfs2_set_aops(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (gfs2_is_writeback(ip))
+		inode->i_mapping->a_ops = &gfs2_writeback_aops;
+	else if (gfs2_is_ordered(ip))
+		inode->i_mapping->a_ops = &gfs2_ordered_aops;
+	else if (gfs2_is_jdata(ip))
+		inode->i_mapping->a_ops = &gfs2_jdata_aops;
+	else
+		BUG();
+}
+
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index e8fe83fcd583..d3b76d0cdc81 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -14,12 +14,12 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 
-extern const struct address_space_operations gfs2_file_aops;
 extern int gfs2_get_block(struct inode *inode, sector_t lblock,
 			  struct buffer_head *bh_result, int create);
 extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
 			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
 
 #endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 6f3aeb059c61..ad5daaa6babc 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -38,6 +38,7 @@
 #include "trans.h"
 #include "util.h"
 #include "eaops.h"
+#include "ops_address.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
@@ -245,7 +246,16 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 		if (error)
 			goto out;
 	}
-
+	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
+		if (flags & GFS2_DIF_JDATA)
+			gfs2_log_flush(sdp, ip->i_gl);
+		error = filemap_fdatawrite(inode->i_mapping);
+		if (error)
+			goto out;
+		error = filemap_fdatawait(inode->i_mapping);
+		if (error)
+			goto out;
+	}
 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
 	if (error)
 		goto out;
@@ -257,6 +267,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	gfs2_set_inode_flags(inode);
+	gfs2_set_aops(inode);
 out_trans_end:
 	gfs2_trans_end(sdp);
 out:
-- 
cgit v1.2.3


From 9ff8ec32e58875022447af619bec6e5aee7c77e4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 28 Sep 2007 13:49:05 +0100
Subject: [GFS2] Split gfs2_writepage into three cases

This patch splits gfs2_writepage into separate functions for each of
the three cases: writeback, ordered and journalled. As a result
it becomes a lot easier to see what each one is doing. The common
code is moved into gfs2_writepage_common.

This fixes a performance bug where we were doing more work than
strictly required in the ordered write case.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c        |  17 ++++----
 fs/gfs2/ops_address.c | 112 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 101 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c27cea761c6..e901f8f7d650 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -556,17 +556,20 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 	lock_buffer(bd->bd_bh);
 	gfs2_log_lock(sdp);
-	if (!list_empty(&bd->bd_list_tr))
-		goto out;
-	tr->tr_touched = 1;
-	if (gfs2_is_jdata(ip)) {
-		tr->tr_num_buf++;
-		list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+	if (tr) {
+		if (!list_empty(&bd->bd_list_tr))
+			goto out;
+		tr->tr_touched = 1;
+		if (gfs2_is_jdata(ip)) {
+			tr->tr_num_buf++;
+			list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+		}
 	}
 	if (!list_empty(&le->le_list))
 		goto out;
 
-	__glock_lo_add(sdp, &bd->bd_gl->gl_le);
+	if (tr)
+		__glock_lo_add(sdp, &bd->bd_gl->gl_le);
 	if (gfs2_is_jdata(ip)) {
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 207014f363d8..4bf73ed945ae 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -103,16 +103,15 @@ static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
 }
 
 /**
- * gfs2_writepage - Write complete page
- * @page: Page to write
- *
- * Returns: errno
+ * gfs2_writepage_common - Common bits of writepage
+ * @page: The page to be written
+ * @wbc: The writeback control
  *
- * Some of this is copied from block_write_full_page() although we still
- * call it to do most of the work.
+ * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
  */
 
-static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int gfs2_writepage_common(struct page *page,
+				 struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -120,23 +119,94 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
 	loff_t i_size = i_size_read(inode);
 	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
-	int error;
-	int done_trans = 0;
+	int ret = -EIO;
 
-	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
-		unlock_page(page);
-		return -EIO;
-	}
+	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+		goto out;
+	ret = 0;
 	if (current->journal_info)
-		goto out_ignore;
-
+		goto redirty;
 	/* Is the page fully outside i_size? (truncate in progress) */
-        offset = i_size & (PAGE_CACHE_SIZE-1);
+	offset = i_size & (PAGE_CACHE_SIZE-1);
 	if (page->index > end_index || (page->index == end_index && !offset)) {
 		page->mapping->a_ops->invalidatepage(page, 0);
-		unlock_page(page);
-		return 0; /* don't care */
+		goto out;
 	}
+	return 1;
+redirty:
+	redirty_page_for_writepage(wbc, page);
+out:
+	unlock_page(page);
+	return 0;
+}
+
+/**
+ * gfs2_writeback_writepage - Write page for writeback mappings
+ * @page: The page
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_writeback_writepage(struct page *page,
+				    struct writeback_control *wbc)
+{
+	int ret;
+
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret <= 0)
+		return ret;
+
+	ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
+	if (ret == -EAGAIN)
+		ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+	return ret;
+}
+
+/**
+ * gfs2_ordered_writepage - Write page for ordered data files
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ */
+
+static int gfs2_ordered_writepage(struct page *page,
+				  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int ret;
+
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret <= 0)
+		return ret;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				     (1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+	gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
+	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+
+/**
+ * gfs2_jdata_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ */
+
+static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int error;
+	int done_trans = 0;
+
+	error = gfs2_writepage_common(page, wbc);
+	if (error <= 0)
+		return error;
 
 	if (PageChecked(page)) {
 		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
@@ -838,7 +908,7 @@ cannot_release:
 }
 
 static const struct address_space_operations gfs2_writeback_aops = {
-	.writepage = gfs2_writepage,
+	.writepage = gfs2_writeback_writepage,
 	.writepages = gfs2_writeback_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
@@ -852,7 +922,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
-	.writepage = gfs2_writepage,
+	.writepage = gfs2_ordered_writepage,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
 	.sync_page = block_sync_page,
@@ -866,7 +936,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
-	.writepage = gfs2_writepage,
+	.writepage = gfs2_jdata_writepage,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
 	.sync_page = block_sync_page,
-- 
cgit v1.2.3


From b8e7cbb65bcc99630e123422c6829ce3c0fcdf14 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 17 Oct 2007 09:04:24 +0100
Subject: [GFS2] Add writepages for GFS2 jdata

This patch resolves a lock ordering issue where we had been getting
a transaction lock in the wrong order with respect to the page lock.
By using writepages rather than just writepage, it is then possible
to start a transaction before locking the page, thus matching the
locking order elsewhere in the code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c         |   2 +-
 fs/gfs2/ops_address.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 206 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 70b404d2774b..1e1fe8def375 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -650,7 +650,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
 		get_bh(bh);
 		gfs2_log_unlock(sdp);
 		lock_buffer(bh);
-		if (test_clear_buffer_dirty(bh)) {
+		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
 			bh->b_end_io = end_buffer_write_sync;
 			submit_bh(WRITE, bh);
 		} else {
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4bf73ed945ae..48913e569907 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -21,6 +21,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
 #include <linux/swap.h>
+#include <linux/pagevec.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -188,6 +189,34 @@ static int gfs2_ordered_writepage(struct page *page,
 	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
 }
 
+/**
+ * __gfs2_jdata_writepage - The core of jdata writepage
+ * @page: The page to write
+ * @wbc: The writeback control
+ *
+ * This is shared between writepage and writepages and implements the
+ * core of the writepage operation. If a transaction is required then
+ * PageChecked will have been set and the transaction will have
+ * already been started before this is called.
+ */
+
+static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		if (!page_has_buffers(page)) {
+			create_empty_buffers(page, inode->i_sb->s_blocksize,
+					     (1 << BH_Dirty)|(1 << BH_Uptodate));
+		}
+		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+	}
+	return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+}
+
 /**
  * gfs2_jdata_writepage - Write complete page
  * @page: Page to write
@@ -199,7 +228,6 @@ static int gfs2_ordered_writepage(struct page *page,
 static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 	int done_trans = 0;
@@ -209,18 +237,14 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
 		return error;
 
 	if (PageChecked(page)) {
+		if (wbc->sync_mode != WB_SYNC_ALL)
+			goto out_ignore;
 		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
 		if (error)
 			goto out_ignore;
-		ClearPageChecked(page);
-		if (!page_has_buffers(page)) {
-			create_empty_buffers(page, inode->i_sb->s_blocksize,
-					     (1 << BH_Dirty)|(1 << BH_Uptodate));
-		}
-		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
 		done_trans = 1;
 	}
-	error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+	error = __gfs2_jdata_writepage(page, wbc);
 	if (done_trans)
 		gfs2_trans_end(sdp);
 	return error;
@@ -246,6 +270,178 @@ static int gfs2_writeback_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
 }
 
+/**
+ * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
+ * @mapping: The mapping
+ * @wbc: The writeback control
+ * @pvec: The vector of pages
+ * @nr_pages: The number of pages to write
+ * @end: End page index; pages beyond it are skipped unless range_cyclic
+ *
+ * Returns: non-zero if loop should terminate, zero otherwise
+ */
+
+static int gfs2_write_jdata_pagevec(struct address_space *mapping,
+				    struct writeback_control *wbc,
+				    struct pagevec *pvec,
+				    int nr_pages, pgoff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
+	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int i;
+	int ret;
+
+	ret = gfs2_trans_begin(sdp, nrblocks, 0);
+	if (ret < 0)
+		return ret;
+
+	for(i = 0; i < nr_pages; i++) {
+		struct page *page = pvec->pages[i];
+
+		lock_page(page);
+
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			continue;
+		}
+
+		if (!wbc->range_cyclic && page->index > end) {
+			ret = 1;
+			unlock_page(page);
+			continue;
+		}
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+
+		if (PageWriteback(page) ||
+		    !clear_page_dirty_for_io(page)) {
+			unlock_page(page);
+			continue;
+		}
+
+		/* Is the page fully outside i_size? (truncate in progress) */
+		if (page->index > end_index || (page->index == end_index && !offset)) {
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+			continue;
+		}
+
+		ret = __gfs2_jdata_writepage(page, wbc);
+
+		if (ret || (--(wbc->nr_to_write) <= 0))
+			ret = 1;
+		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+			wbc->encountered_congestion = 1;
+			ret = 1;
+		}
+
+	}
+	gfs2_trans_end(sdp);
+	return ret;
+}
+
+/**
+ * gfs2_write_cache_jdata - Like write_cache_pages but different
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ *
+ * Returns: 0, or a negative error if a transaction could not be started
+ *
+ * The reason that we use our own function here is that we need to
+ * start transactions before we grab page locks. This allows us
+ * to get the ordering right.
+ */
+
+static int gfs2_write_cache_jdata(struct address_space *mapping,
+				  struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;
+	int scanned = 0;
+	int range_whole = 0;
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		scanned = 1;
+	}
+
+retry:
+	 while (!done && (index <= end) &&
+		(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					       PAGECACHE_TAG_DIRTY,
+					       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		scanned = 1;
+		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+		if (ret)
+			done = 1;
+		if (ret > 0)
+			ret = 0;
+
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+	return ret;
+}
+
+
+/**
+ * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ * 
+ */
+
+static int gfs2_jdata_writepages(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	int ret;
+
+	ret = gfs2_write_cache_jdata(mapping, wbc);
+	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
+		gfs2_log_flush(sdp, ip->i_gl);
+		ret = gfs2_write_cache_jdata(mapping, wbc);
+	}
+	return ret;
+}
+
 /**
  * stuffed_readpage - Fill in a Linux page with stuffed file data
  * @ip: the inode
@@ -937,6 +1133,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 
 static const struct address_space_operations gfs2_jdata_aops = {
 	.writepage = gfs2_jdata_writepage,
+	.writepages = gfs2_jdata_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
 	.sync_page = block_sync_page,
-- 
cgit v1.2.3


From c41d4f09f13671f98ba4b82fdc94420cdc09be08 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 17 Oct 2007 14:05:41 +0100
Subject: [GFS2] Don't hold page lock when starting transaction

This is an addendum to the new AOPs work which moves the point
at which we take the page lock so that we don't get it until
the last possible moment. This resolves a conflict between
starting transactions and the page lock.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 51 +++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 48913e569907..ae782d2cbdec 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -657,18 +657,10 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(error))
 		goto out_uninit;
 
-	error = -ENOMEM;
-	page = __grab_cache_page(mapping, index);
-	*pagep = page;
-	if (!page)
-		goto out_unlock;
-
 	gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
-
 	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
 	if (error)
-		goto out_putpage;
-
+		goto out_unlock;
 
 	ip->i_alloc.al_requested = 0;
 	if (alloc_required) {
@@ -699,40 +691,47 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	if (error)
 		goto out_trans_fail;
 
+	error = -ENOMEM;
+	page = __grab_cache_page(mapping, index);
+	*pagep = page;
+	if (unlikely(!page))
+		goto out_endtrans;
+
 	if (gfs2_is_stuffed(ip)) {
+		error = 0;
 		if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
 			error = gfs2_unstuff_dinode(ip, page);
 			if (error == 0)
 				goto prepare_write;
-		} else if (!PageUptodate(page))
+		} else if (!PageUptodate(page)) {
 			error = stuffed_readpage(ip, page);
+		}
 		goto out;
 	}
 
 prepare_write:
 	error = block_prepare_write(page, from, to, gfs2_get_block);
-
 out:
-	if (error) {
-		gfs2_trans_end(sdp);
+	if (error == 0)
+		return 0;
+
+	page_cache_release(page);
+	if (pos + len > ip->i_inode.i_size)
+		vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+out_endtrans:
+	gfs2_trans_end(sdp);
 out_trans_fail:
-		if (alloc_required) {
-			gfs2_inplace_release(ip);
+	if (alloc_required) {
+		gfs2_inplace_release(ip);
 out_qunlock:
-			gfs2_quota_unlock(ip);
+		gfs2_quota_unlock(ip);
 out_alloc_put:
-			gfs2_alloc_put(ip);
-		}
-out_putpage:
-		page_cache_release(page);
-		if (pos + len > ip->i_inode.i_size)
-			vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+		gfs2_alloc_put(ip);
+	}
 out_unlock:
-		gfs2_glock_dq_m(1, &ip->i_gh);
+	gfs2_glock_dq(&ip->i_gh);
 out_uninit:
-		gfs2_holder_uninit(&ip->i_gh);
-	}
-
+	gfs2_holder_uninit(&ip->i_gh);
 	return error;
 }
 
-- 
cgit v1.2.3


From 47e83b509127f5e83ae5d93afd5c7cb9241acc38 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 18 Oct 2007 11:15:50 +0100
Subject: [GFS2] Use correct include file in ops_address.c

Something changed in the upstream kernel, and it needs this
one-liner to allow ops_address.c to build.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ae782d2cbdec..7353933483bb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -20,7 +20,7 @@
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
-#include <linux/swap.h>
+#include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
 #include "gfs2.h"
-- 
cgit v1.2.3


From 60b0d0877986b8fa70148f06055422d2ed858e88 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 31 Oct 2007 14:24:33 +0000
Subject: [GFS2] Remove unused variables

These haven't been used for some time, remove them.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 4 ----
 fs/gfs2/super.c  | 4 ----
 2 files changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 82dfe9bd270b..f7a50fed4b52 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -412,9 +412,6 @@ struct gfs2_args {
 struct gfs2_tune {
 	spinlock_t gt_spin;
 
-	unsigned int gt_ilimit;
-	unsigned int gt_ilimit_tries;
-	unsigned int gt_ilimit_min;
 	unsigned int gt_demote_secs; /* Cache retention for unheld glock */
 	unsigned int gt_incore_log_blocks;
 	unsigned int gt_log_flush_secs;
@@ -434,7 +431,6 @@ struct gfs2_tune {
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_new_files_directio;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
-	unsigned int gt_lockdump_size;
 	unsigned int gt_stall_secs; /* Detects trouble! */
 	unsigned int gt_complain_secs;
 	unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5183dfb9342a..26edb7f9f4b8 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -51,9 +51,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_ilimit = 100;
-	gt->gt_ilimit_tries = 3;
-	gt->gt_ilimit_min = 1;
 	gt->gt_demote_secs = 300;
 	gt->gt_incore_log_blocks = 1024;
 	gt->gt_log_flush_secs = 60;
@@ -71,7 +68,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_new_files_jdata = 0;
 	gt->gt_new_files_directio = 0;
 	gt->gt_max_readahead = 1 << 18;
-	gt->gt_lockdump_size = 131072;
 	gt->gt_stall_secs = 600;
 	gt->gt_complain_secs = 10;
 	gt->gt_reclaim_limit = 5000;
-- 
cgit v1.2.3


From c2932e03dbcfe7ea9052953dbd5f3157183c1e9b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 1 Nov 2007 09:26:54 +0000
Subject: [GFS2] Remove "reclaim limit"

This call to reclaim glocks is not needed, and in particular we don't want it
in the fast path for locking glocks. The limit was entirely arbitrary anyway
and we can't expect users to adjust things like this, the remaining code will
do the right thing on its own.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 9 ---------
 fs/gfs2/incore.h | 1 -
 fs/gfs2/super.c  | 1 -
 fs/gfs2/sys.c    | 2 --
 4 files changed, 13 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 104e83ff874f..159a5479c4e4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -507,21 +507,12 @@ static int rq_mutex(struct gfs2_holder *gh)
 static int rq_promote(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
-	struct gfs2_sbd *sdp = gl->gl_sbd;
 
 	if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
 		if (list_empty(&gl->gl_holders)) {
 			gl->gl_req_gh = gh;
 			set_bit(GLF_LOCK, &gl->gl_flags);
 			spin_unlock(&gl->gl_spin);
-
-			if (atomic_read(&sdp->sd_reclaim_count) >
-			    gfs2_tune_get(sdp, gt_reclaim_limit) &&
-			    !(gh->gh_flags & LM_FLAG_PRIORITY)) {
-				gfs2_reclaim_glock(sdp);
-				gfs2_reclaim_glock(sdp);
-			}
-
 			gfs2_glock_xmote_th(gh->gh_gl, gh);
 			spin_lock(&gl->gl_spin);
 		}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f7a50fed4b52..089dba412cc0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -433,7 +433,6 @@ struct gfs2_tune {
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
 	unsigned int gt_stall_secs; /* Detects trouble! */
 	unsigned int gt_complain_secs;
-	unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
 	unsigned int gt_statfs_quantum;
 	unsigned int gt_statfs_slow;
 };
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 26edb7f9f4b8..548cc8ba0703 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -70,7 +70,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_max_readahead = 1 << 18;
 	gt->gt_stall_secs = 600;
 	gt->gt_complain_secs = 10;
-	gt->gt_reclaim_limit = 5000;
 	gt->gt_statfs_quantum = 30;
 	gt->gt_statfs_slow = 0;
 }
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 06e0b7768d97..1359198aed63 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -433,7 +433,6 @@ TUNE_ATTR(quota_quantum, 0);
 TUNE_ATTR(atime_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
-TUNE_ATTR(reclaim_limit, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(new_files_directio, 0);
@@ -456,7 +455,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_atime_quantum.attr,
 	&tune_attr_max_readahead.attr,
 	&tune_attr_complain_secs.attr,
-	&tune_attr_reclaim_limit.attr,
 	&tune_attr_statfs_slow.attr,
 	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_quota_cache_secs.attr,
-- 
cgit v1.2.3


From 52d4c74b08bf859f698ddb4e8a43c0dc8d4a0685 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 1 Nov 2007 09:34:14 +0000
Subject: [GFS2] Add sync_page to metadata address space operations

This set of address space operations was missing a sync_page
operation.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/meta_io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 01ef90253ed1..4b1aced9023d 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -50,6 +50,7 @@ static int gfs2_aspace_writepage(struct page *page,
 static const struct address_space_operations aspace_aops = {
 	.writepage = gfs2_aspace_writepage,
 	.releasepage = gfs2_releasepage,
+	.sync_page = block_sync_page,
 };
 
 /**
-- 
cgit v1.2.3


From 3042a2ccd68d2b609d283219e51cba363aa35c1d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 2 Nov 2007 08:39:34 +0000
Subject: [GFS2] Reorder writeback for glock sync

Previously we were doing (write data, wait for data, write metadata, wait
for metadata). After this patch we do (write metadata, write data, wait for
data, wait for metadata) which should be more efficient.

Also I noticed that the drop_bh and xmote_bh functions were almost
identical. In fact the only difference was a single test, and that
test is such that in the drop_bh case, it would always evaluate to
the correct result. As such we can use the xmote_bh functions in
all the places where we were using the drop_bh function and remove
the drop_bh functions.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 13 +++++-----
 fs/gfs2/glops.c | 80 ++++++++++++---------------------------------------------
 2 files changed, 22 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 159a5479c4e4..e668808b127f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -947,8 +947,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	unsigned int ret;
 
-	if (glops->go_drop_th)
-		glops->go_drop_th(gl);
+	if (glops->go_xmote_th)
+		glops->go_xmote_th(gl);
 
 	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
 	gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -1252,12 +1252,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 	list_del_init(&gh->gh_list);
 
 	if (list_empty(&gl->gl_holders)) {
-		spin_unlock(&gl->gl_spin);
-
-		if (glops->go_unlock)
+		if (glops->go_unlock) {
+			spin_unlock(&gl->gl_spin);
 			glops->go_unlock(gh);
-
-		spin_lock(&gl->gl_spin);
+			spin_lock(&gl->gl_spin);
+		}
 		gl->gl_stamp = jiffies;
 	}
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index ba124230393b..c663b7a0f410 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -138,43 +138,33 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
 static void inode_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_inode *ip = gl->gl_object;
+	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	int error;
+
+	if (gl->gl_state != LM_ST_UNLOCKED)
+		gfs2_pte_inval(gl);
+	if (gl->gl_state != LM_ST_EXCLUSIVE)
+		return;
 
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
 		ip = NULL;
 
 	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		if (ip && !gfs2_is_jdata(ip))
-			filemap_fdatawrite(ip->i_inode.i_mapping);
 		gfs2_log_flush(gl->gl_sbd, gl);
-		if (ip && gfs2_is_jdata(ip))
-			filemap_fdatawrite(ip->i_inode.i_mapping);
-		gfs2_meta_sync(gl);
+		filemap_fdatawrite(metamapping);
 		if (ip) {
 			struct address_space *mapping = ip->i_inode.i_mapping;
-			int error = filemap_fdatawait(mapping);
+			filemap_fdatawrite(mapping);
+			error = filemap_fdatawait(mapping);
 			mapping_set_error(mapping, error);
 		}
+		error = filemap_fdatawait(metamapping);
+		mapping_set_error(metamapping, error);
 		clear_bit(GLF_DIRTY, &gl->gl_flags);
 		gfs2_ail_empty_gl(gl);
 	}
 }
 
-/**
- * inode_go_xmote_th - promote/demote a glock
- * @gl: the glock
- * @state: the requested state
- * @flags:
- *
- */
-
-static void inode_go_xmote_th(struct gfs2_glock *gl)
-{
-	if (gl->gl_state != LM_ST_UNLOCKED)
-		gfs2_pte_inval(gl);
-	if (gl->gl_state == LM_ST_EXCLUSIVE)
-		inode_go_sync(gl);
-}
-
 /**
  * inode_go_xmote_bh - After promoting/demoting a glock
  * @gl: the glock
@@ -195,22 +185,6 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
 	}
 }
 
-/**
- * inode_go_drop_th - unlock a glock
- * @gl: the glock
- *
- * Invoked from rq_demote().
- * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
- * is being purged from our node's glock cache; we're dropping lock.
- */
-
-static void inode_go_drop_th(struct gfs2_glock *gl)
-{
-	gfs2_pte_inval(gl);
-	if (gl->gl_state == LM_ST_EXCLUSIVE)
-		inode_go_sync(gl);
-}
-
 /**
  * inode_go_inval - prepare a inode glock to be released
  * @gl: the glock
@@ -326,14 +300,14 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 }
 
 /**
- * trans_go_xmote_th - promote/demote the transaction glock
+ * trans_go_sync - promote/demote the transaction glock
  * @gl: the glock
  * @state: the requested state
  * @flags:
  *
  */
 
-static void trans_go_xmote_th(struct gfs2_glock *gl)
+static void trans_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 
@@ -376,24 +350,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
 	}
 }
 
-/**
- * trans_go_drop_th - unlock the transaction glock
- * @gl: the glock
- *
- * We want to sync the device even with localcaching.  Remember
- * that localcaching journal replay only marks buffers dirty.
- */
-
-static void trans_go_drop_th(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-
-	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		gfs2_meta_syncfs(sdp);
-		gfs2_log_shutdown(sdp);
-	}
-}
-
 /**
  * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
  * @gl: the glock
@@ -408,14 +364,12 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
 
 const struct gfs2_glock_operations gfs2_meta_glops = {
 	.go_xmote_th = meta_go_sync,
-	.go_drop_th = meta_go_sync,
 	.go_type = LM_TYPE_META,
 };
 
 const struct gfs2_glock_operations gfs2_inode_glops = {
-	.go_xmote_th = inode_go_xmote_th,
+	.go_xmote_th = inode_go_sync,
 	.go_xmote_bh = inode_go_xmote_bh,
-	.go_drop_th = inode_go_drop_th,
 	.go_inval = inode_go_inval,
 	.go_demote_ok = inode_go_demote_ok,
 	.go_lock = inode_go_lock,
@@ -425,7 +379,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_xmote_th = meta_go_sync,
-	.go_drop_th = meta_go_sync,
 	.go_inval = meta_go_inval,
 	.go_demote_ok = rgrp_go_demote_ok,
 	.go_lock = rgrp_go_lock,
@@ -435,9 +388,8 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
-	.go_xmote_th = trans_go_xmote_th,
+	.go_xmote_th = trans_go_sync,
 	.go_xmote_bh = trans_go_xmote_bh,
-	.go_drop_th = trans_go_drop_th,
 	.go_type = LM_TYPE_NONDISK,
 };
 
-- 
cgit v1.2.3


From e589665eb97b297412fb16b4c1737a01a91db903 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 2 Nov 2007 09:14:31 +0000
Subject: [GFS2] Remove flags no longer required

The HIF_MUTEX and HIF_PROMOTE flags were set on the glock holders
depending upon which of the two waiters lists they were going to
be queued upon. They were then tested when the holders were taken
off the lists to ensure that the right type of holder was being
dequeued.

Since we are already using separate lists, there doesn't seem a
lot of point having these flags as well, and since setting them
and testing them is in the fast path for locking and unlocking
glock, this patch removes them.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 17 ++---------------
 fs/gfs2/incore.h |  4 ----
 2 files changed, 2 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e668808b127f..5fbd9d34ce23 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -594,12 +594,7 @@ static void run_queue(struct gfs2_glock *gl)
 		if (!list_empty(&gl->gl_waiters1)) {
 			gh = list_entry(gl->gl_waiters1.next,
 					struct gfs2_holder, gh_list);
-
-			if (test_bit(HIF_MUTEX, &gh->gh_iflags))
-				blocked = rq_mutex(gh);
-			else
-				gfs2_assert_warn(gl->gl_sbd, 0);
-
+			blocked = rq_mutex(gh);
 		} else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
 			blocked = rq_demote(gl);
 			if (gl->gl_waiters2 && !blocked) {
@@ -610,12 +605,7 @@ static void run_queue(struct gfs2_glock *gl)
 		} else if (!list_empty(&gl->gl_waiters3)) {
 			gh = list_entry(gl->gl_waiters3.next,
 					struct gfs2_holder, gh_list);
-
-			if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
-				blocked = rq_promote(gh);
-			else
-				gfs2_assert_warn(gl->gl_sbd, 0);
-
+			blocked = rq_promote(gh);
 		} else
 			break;
 
@@ -636,7 +626,6 @@ static void gfs2_glmutex_lock(struct gfs2_glock *gl)
 	struct gfs2_holder gh;
 
 	gfs2_holder_init(gl, 0, 0, &gh);
-	set_bit(HIF_MUTEX, &gh.gh_iflags);
 	if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
 		BUG();
 
@@ -1160,8 +1149,6 @@ restart:
 		return -EIO;
 	}
 
-	set_bit(HIF_PROMOTE, &gh->gh_iflags);
-
 	spin_lock(&gl->gl_spin);
 	add_to_queue(gh);
 	run_queue(gl);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 089dba412cc0..478023e9fda6 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -141,10 +141,6 @@ struct gfs2_glock_operations {
 };
 
 enum {
-	/* Actions */
-	HIF_MUTEX		= 0,
-	HIF_PROMOTE		= 1,
-
 	/* States */
 	HIF_HOLDER		= 6,
 	HIF_FIRST		= 7,
-- 
cgit v1.2.3


From c7227e46423a57b4df27a2d75b5869bd3ae654d0 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Fri, 2 Nov 2007 09:37:15 -0500
Subject: [GFS2] Given device ID rather than s_id in "id" sysfs file

This patch changes the /sys/fs/gfs2/<s_id>/id file to give the device
id "major:minor" rather than the s_id.  That enables gfs2_tool to
match devices properly (by id, not name) when locating the tuning files.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 1359198aed63..65dd0657e1f8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -32,7 +32,8 @@ spinlock_t gfs2_sys_margs_lock;
 
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
+	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
+			MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
 }
 
 static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
-- 
cgit v1.2.3


From 8cbc4342478311c2a85260a7ca54d96cb7f71f7b Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Nov 2007 09:03:56 -0600
Subject: [GFS2] check kthread_should_stop when waiting

Use wait_event_interruptible() in the lock_dlm thread instead
of an open coded equivalent, and include a kthread_should_stop()
check in the wait test so we don't miss a kthread_stop().

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/thread.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index bd938f06481d..521694fc19d6 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -273,18 +273,13 @@ static int gdlm_thread(void *data, int blist)
 	struct gdlm_ls *ls = (struct gdlm_ls *) data;
 	struct gdlm_lock *lp = NULL;
 	uint8_t complete, blocking, submit, drop;
-	DECLARE_WAITQUEUE(wait, current);
 
 	/* Only thread1 is allowed to do blocking callbacks since gfs
 	   may wait for a completion callback within a blocking cb. */
 
 	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&ls->thread_wait, &wait);
-		if (no_work(ls, blist))
-			schedule();
-		remove_wait_queue(&ls->thread_wait, &wait);
-		set_current_state(TASK_RUNNING);
+		wait_event_interruptible(ls->thread_wait,
+				!no_work(ls, blist) || kthread_should_stop());
 
 		complete = blocking = submit = drop = 0;
 
-- 
cgit v1.2.3


From 2bcd610d2fdea608a8fdac32788fc35a32a2327c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 8 Nov 2007 14:25:12 +0000
Subject: [GFS2] Don't add glocks to the journal

The only reason for adding glocks to the journal was to keep track
of which locks required a log flush prior to release. We add a
flag to the glock to allow this check to be made in a simpler way.

This reduces the size of a glock (by 12 bytes on i386, 24 on x86_64)
and means that we can avoid extra work during the journal flush.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      |  3 ---
 fs/gfs2/incore.h     |  4 +---
 fs/gfs2/inode.c      |  3 ++-
 fs/gfs2/log.c        | 15 +++++---------
 fs/gfs2/log.h        |  9 +++++++-
 fs/gfs2/lops.c       | 58 +++++-----------------------------------------------
 fs/gfs2/ops_fstype.c |  1 -
 fs/gfs2/trans.c      |  5 -----
 fs/gfs2/trans.h      |  1 -
 9 files changed, 21 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5fbd9d34ce23..d83df6888402 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -346,7 +346,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
 	gl->gl_aspace = NULL;
-	lops_init_le(&gl->gl_le, &gfs2_glock_lops);
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 
 	/* If this glock protects actual on-disk data or metadata blocks,
@@ -1900,8 +1899,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
 	print_dbg(gi, "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
 	print_dbg(gi, "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
 	print_dbg(gi, "  object = %s\n", (gl->gl_object) ? "yes" : "no");
-	print_dbg(gi, "  le = %s\n",
-		   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
 	print_dbg(gi, "  reclaim = %s\n",
 		   (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
 	if (gl->gl_aspace)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 478023e9fda6..911822d1e4c0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -168,6 +168,7 @@ enum {
 	GLF_PENDING_DEMOTE	= 4,
 	GLF_DIRTY		= 5,
 	GLF_DEMOTE_IN_PROGRESS	= 6,
+	GLF_LFLUSH		= 7,
 };
 
 struct gfs2_glock {
@@ -208,7 +209,6 @@ struct gfs2_glock {
 	struct gfs2_sbd *gl_sbd;
 
 	struct inode *gl_aspace;
-	struct gfs2_log_element gl_le;
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
@@ -584,13 +584,11 @@ struct gfs2_sbd {
 	unsigned int sd_log_commited_databuf;
 	unsigned int sd_log_commited_revoke;
 
-	unsigned int sd_log_num_gl;
 	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
 	unsigned int sd_log_num_rg;
 	unsigned int sd_log_num_databuf;
 
-	struct list_head sd_log_le_gl;
 	struct list_head sd_log_le_buf;
 	struct list_head sd_log_le_revoke;
 	struct list_head sd_log_le_rg;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 532784eb5ba4..92959d093adf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -362,7 +362,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	if (error)
 		goto out_rg_gunlock;
 
-	gfs2_trans_add_gl(ip->i_gl);
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+	set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
 
 	gfs2_free_di(rgd, ip);
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 1e1fe8def375..d24684330bc3 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -692,20 +692,16 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
  *
  */
 
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
 	struct gfs2_ail *ai;
 
 	down_write(&sdp->sd_log_flush_lock);
 
-	if (gl) {
-		gfs2_log_lock(sdp);
-		if (list_empty(&gl->gl_le.le_list)) {
-			gfs2_log_unlock(sdp);
-			up_write(&sdp->sd_log_flush_lock);
-			return;
-		}
-		gfs2_log_unlock(sdp);
+	/* Log might have been flushed while we waited for the flush lock */
+	if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
+		up_write(&sdp->sd_log_flush_lock);
+		return;
 	}
 
 	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
@@ -823,7 +819,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 	down_write(&sdp->sd_log_flush_lock);
 
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
-	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 24e7161486e2..4babd430b722 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -57,7 +57,14 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
 struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real);
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+
+static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
+{
+	if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
+		__gfs2_log_flush(sbd, gl);
+}
+
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
 void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index e901f8f7d650..fae59d69d01a 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -87,6 +87,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	}
 	bd->bd_ail = ai;
 	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
 }
@@ -124,49 +125,6 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
 	return bh;
 }
 
-static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
-	struct gfs2_glock *gl;
-	struct gfs2_trans *tr = current->journal_info;
-
-	tr->tr_touched = 1;
-
-	gl = container_of(le, struct gfs2_glock, gl_le);
-	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
-		return;
-
-	if (!list_empty(&le->le_list))
-		return;
-
-	gfs2_glock_hold(gl);
-	set_bit(GLF_DIRTY, &gl->gl_flags);
-	sdp->sd_log_num_gl++;
-	list_add(&le->le_list, &sdp->sd_log_le_gl);
-}
-
-static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
-{
-	gfs2_log_lock(sdp);
-	__glock_lo_add(sdp, le);
-	gfs2_log_unlock(sdp);
-}
-
-static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-{
-	struct list_head *head = &sdp->sd_log_le_gl;
-	struct gfs2_glock *gl;
-
-	while (!list_empty(head)) {
-		gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
-		list_del_init(&gl->gl_le.le_list);
-		sdp->sd_log_num_gl--;
-
-		gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
-		gfs2_glock_put(gl);
-	}
-	gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
-}
-
 static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
 	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
@@ -182,7 +140,8 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	list_add(&bd->bd_list_tr, &tr->tr_list_buf);
 	if (!list_empty(&le->le_list))
 		goto out;
-	__glock_lo_add(sdp, &bd->bd_gl->gl_le);
+	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	gfs2_meta_check(sdp, bd->bd_bh);
 	gfs2_pin(sdp, bd->bd_bh);
 	sdp->sd_log_num_buf++;
@@ -568,8 +527,8 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	if (!list_empty(&le->le_list))
 		goto out;
 
-	if (tr)
-		__glock_lo_add(sdp, &bd->bd_gl->gl_le);
+	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	if (gfs2_is_jdata(ip)) {
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
@@ -776,12 +735,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 }
 
 
-const struct gfs2_log_operations gfs2_glock_lops = {
-	.lo_add = glock_lo_add,
-	.lo_after_commit = glock_lo_after_commit,
-	.lo_name = "glock",
-};
-
 const struct gfs2_log_operations gfs2_buf_lops = {
 	.lo_add = buf_lo_add,
 	.lo_incore_commit = buf_lo_incore_commit,
@@ -819,7 +772,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
 };
 
 const struct gfs2_log_operations *gfs2_log_ops[] = {
-	&gfs2_glock_lops,
 	&gfs2_databuf_lops,
 	&gfs2_buf_lops,
 	&gfs2_rg_lops,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 17de58e83d92..52aaba96d5da 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -77,7 +77,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	spin_lock_init(&sdp->sd_log_lock);
 
-	INIT_LIST_HEAD(&sdp->sd_log_le_gl);
 	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
 	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 717983e2c2ae..73e5d92a657c 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -114,11 +114,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 		gfs2_log_flush(sdp, NULL);
 }
 
-void gfs2_trans_add_gl(struct gfs2_glock *gl)
-{
-	lops_add(gl->gl_sbd, &gl->gl_le);
-}
-
 /**
  * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
  * @gl: the glock the buffer belongs to
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 043d5f4b9c4c..e826f0dab80a 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -30,7 +30,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 
 void gfs2_trans_end(struct gfs2_sbd *sdp);
 
-void gfs2_trans_add_gl(struct gfs2_glock *gl);
 void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
-- 
cgit v1.2.3


From fd041f0b4045db8646b36d393cbb274db60649f5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 8 Nov 2007 14:55:03 +0000
Subject: [GFS2] Use atomic_t for journal free blocks counter

This patch changes the counter which keeps track of the free
blocks in the journal to an atomic_t in preparation for the
following patch which will update the log reservation code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |  2 +-
 fs/gfs2/log.c        | 26 +++++++++++++-------------
 fs/gfs2/ops_fstype.c |  4 ++--
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 911822d1e4c0..7ae0206e9a61 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -595,7 +595,7 @@ struct gfs2_sbd {
 	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
 
-	unsigned int sd_log_blks_free;
+	atomic_t sd_log_blks_free;
 	struct mutex sd_log_reserve_mutex;
 
 	u64 sd_log_sequence;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index d24684330bc3..9192398408f2 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -301,7 +301,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 
 	mutex_lock(&sdp->sd_log_reserve_mutex);
 	gfs2_log_lock(sdp);
-	while(sdp->sd_log_blks_free <= (blks + reserved_blks)) {
+	while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
 		gfs2_log_unlock(sdp);
 		gfs2_ail1_empty(sdp, 0);
 		gfs2_log_flush(sdp, NULL);
@@ -310,7 +310,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 			gfs2_ail1_start(sdp, 0);
 		gfs2_log_lock(sdp);
 	}
-	sdp->sd_log_blks_free -= blks;
+	atomic_sub(blks, &sdp->sd_log_blks_free);
 	gfs2_log_unlock(sdp);
 	mutex_unlock(&sdp->sd_log_reserve_mutex);
 
@@ -330,9 +330,9 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
 {
 
 	gfs2_log_lock(sdp);
-	sdp->sd_log_blks_free += blks;
+	atomic_add(blks, &sdp->sd_log_blks_free);
 	gfs2_assert_withdraw(sdp,
-			     sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+			     atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
 	gfs2_log_unlock(sdp);
 	up_read(&sdp->sd_log_flush_lock);
 }
@@ -559,8 +559,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
 	ail2_empty(sdp, new_tail);
 
 	gfs2_log_lock(sdp);
-	sdp->sd_log_blks_free += dist;
-	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+	atomic_add(dist, &sdp->sd_log_blks_free);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
 	gfs2_log_unlock(sdp);
 
 	sdp->sd_log_tail = new_tail;
@@ -733,7 +733,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 		log_flush_commit(sdp);
 	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
 		gfs2_log_lock(sdp);
-		sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */
+		atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
 		gfs2_log_unlock(sdp);
 		log_write_header(sdp, 0, PULL);
 	}
@@ -773,12 +773,12 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
 	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
 	reserved = calc_reserved(sdp);
-	old = sdp->sd_log_blks_free;
-	sdp->sd_log_blks_free += tr->tr_reserved -
-				 (reserved - sdp->sd_log_blks_reserved);
+	old = atomic_read(&sdp->sd_log_blks_free);
+	atomic_add(tr->tr_reserved - (reserved - sdp->sd_log_blks_reserved),
+		   &sdp->sd_log_blks_free);
 
-	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
-	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <=
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) >= old);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
 			     sdp->sd_jdesc->jd_blocks);
 
 	sdp->sd_log_blks_reserved = reserved;
@@ -831,7 +831,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
 			 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
 
-	gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
+	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
 	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
 	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52aaba96d5da..1bba6ac0bcac 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -339,7 +339,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
-		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
 	} else {
 		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
 			fs_err(sdp, "can't mount journal #%u\n",
@@ -376,7 +376,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			       sdp->sd_jdesc->jd_jid, error);
 			goto fail_jinode_gh;
 		}
-		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
 	}
 
 	if (sdp->sd_lockstruct.ls_first) {
-- 
cgit v1.2.3


From ec69b188837a347769e187997d040e84a683b38a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 9 Nov 2007 10:01:41 +0000
Subject: [GFS2] Move gfs2_logd into log.c

This means that we can mark gfs2_ail1_empty static and prepares
the way for further changes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/daemon.c | 50 --------------------------------------------------
 fs/gfs2/daemon.h |  1 -
 fs/gfs2/log.c    | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/gfs2/log.h    |  3 +--
 4 files changed, 56 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3731ab0771d5..e51991947d2c 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -82,56 +82,6 @@ int gfs2_recoverd(void *data)
 	return 0;
 }
 
-/**
- * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
- * @sdp: Pointer to GFS2 superblock
- *
- * Also, periodically check to make sure that we're using the most recent
- * journal index.
- */
-
-int gfs2_logd(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-	struct gfs2_holder ji_gh;
-	unsigned long t;
-	int need_flush;
-
-	while (!kthread_should_stop()) {
-		/* Advance the log tail */
-
-		t = sdp->sd_log_flush_time +
-		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
-
-		gfs2_ail1_empty(sdp, DIO_ALL);
-		gfs2_log_lock(sdp);
-		need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
-		gfs2_log_unlock(sdp);
-		if (need_flush || time_after_eq(jiffies, t)) {
-			gfs2_log_flush(sdp, NULL);
-			sdp->sd_log_flush_time = jiffies;
-		}
-
-		/* Check for latest journal index */
-
-		t = sdp->sd_jindex_refresh_time +
-		    gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
-
-		if (time_after_eq(jiffies, t)) {
-			if (!gfs2_jindex_hold(sdp, &ji_gh))
-				gfs2_glock_dq_uninit(&ji_gh);
-			sdp->sd_jindex_refresh_time = jiffies;
-		}
-
-		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
-	return 0;
-}
-
 /**
  * gfs2_quotad - Write cached quota changes into the quota file
  * @sdp: Pointer to GFS2 superblock
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 0de9b3557955..4be084fb6a62 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -12,7 +12,6 @@
 
 int gfs2_glockd(void *data);
 int gfs2_recoverd(void *data);
-int gfs2_logd(void *data);
 int gfs2_quotad(void *data);
 
 #endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9192398408f2..e88a684b2209 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -16,6 +16,8 @@
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -26,6 +28,7 @@
 #include "meta_io.h"
 #include "util.h"
 #include "dir.h"
+#include "super.h"
 
 #define PULL 1
 
@@ -208,7 +211,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
 	gfs2_log_unlock(sdp);
 }
 
-int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
 {
 	struct gfs2_ail *ai, *s;
 	int ret;
@@ -859,3 +862,54 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 	}
 }
 
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+
+int gfs2_logd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_holder ji_gh;
+	unsigned long t;
+	int need_flush;
+
+	while (!kthread_should_stop()) {
+		/* Advance the log tail */
+
+		t = sdp->sd_log_flush_time +
+		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+
+		gfs2_ail1_empty(sdp, DIO_ALL);
+		gfs2_log_lock(sdp);
+		need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
+		gfs2_log_unlock(sdp);
+		if (need_flush || time_after_eq(jiffies, t)) {
+			gfs2_log_flush(sdp, NULL);
+			sdp->sd_log_flush_time = jiffies;
+		}
+
+		/* Check for latest journal index */
+
+		t = sdp->sd_jindex_refresh_time +
+		    gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
+
+		if (time_after_eq(jiffies, t)) {
+			if (!gfs2_jindex_hold(sdp, &ji_gh))
+				gfs2_glock_dq_uninit(&ji_gh);
+			sdp->sd_jindex_refresh_time = jiffies;
+		}
+
+		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+		if (freezing(current))
+			refrigerator();
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 4babd430b722..771152816508 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,8 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 			    unsigned int ssize);
 
-int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
-
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
 void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
 void gfs2_log_incr_head(struct gfs2_sbd *sdp);
@@ -70,5 +68,6 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 
 void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+int gfs2_logd(void *data);
 
 #endif /* __LOG_DOT_H__ */
-- 
cgit v1.2.3


From e35b921185728850c5db3b5d5b356178f931a157 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 9 Nov 2007 10:07:21 +0000
Subject: [GFS2] Don't periodically update the jindex

We only care about the content of the jindex in two cases,
one is when we mount the fs and the other is when we need
to recover another journal. In both cases we have to update
the jindex anyway, so there is no point in updating it
periodically between times, so this removes it to simplify
gfs2_logd.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  1 -
 fs/gfs2/log.c    | 13 -------------
 fs/gfs2/super.c  |  1 -
 fs/gfs2/sys.c    |  2 --
 4 files changed, 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7ae0206e9a61..330f4c73d0e7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -411,7 +411,6 @@ struct gfs2_tune {
 	unsigned int gt_demote_secs; /* Cache retention for unheld glock */
 	unsigned int gt_incore_log_blocks;
 	unsigned int gt_log_flush_secs;
-	unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
 
 	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e88a684b2209..4dcc7a8cda22 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -28,7 +28,6 @@
 #include "meta_io.h"
 #include "util.h"
 #include "dir.h"
-#include "super.h"
 
 #define PULL 1
 
@@ -874,7 +873,6 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 int gfs2_logd(void *data)
 {
 	struct gfs2_sbd *sdp = data;
-	struct gfs2_holder ji_gh;
 	unsigned long t;
 	int need_flush;
 
@@ -893,17 +891,6 @@ int gfs2_logd(void *data)
 			sdp->sd_log_flush_time = jiffies;
 		}
 
-		/* Check for latest journal index */
-
-		t = sdp->sd_jindex_refresh_time +
-		    gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
-
-		if (time_after_eq(jiffies, t)) {
-			if (!gfs2_jindex_hold(sdp, &ji_gh))
-				gfs2_glock_dq_uninit(&ji_gh);
-			sdp->sd_jindex_refresh_time = jiffies;
-		}
-
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
 		if (freezing(current))
 			refrigerator();
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 548cc8ba0703..2e74792ee487 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -54,7 +54,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_demote_secs = 300;
 	gt->gt_incore_log_blocks = 1024;
 	gt->gt_log_flush_secs = 60;
-	gt->gt_jindex_refresh_secs = 60;
 	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
 	gt->gt_quotad_secs = 5;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 65dd0657e1f8..7f828a2cc858 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -428,7 +428,6 @@ TUNE_ATTR_2(name, name##_store)
 TUNE_ATTR(demote_secs, 0);
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
-TUNE_ATTR(jindex_refresh_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
 TUNE_ATTR(atime_quantum, 0);
@@ -450,7 +449,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_demote_secs.attr,
 	&tune_attr_incore_log_blocks.attr,
 	&tune_attr_log_flush_secs.attr,
-	&tune_attr_jindex_refresh_secs.attr,
 	&tune_attr_quota_warn_period.attr,
 	&tune_attr_quota_quantum.attr,
 	&tune_attr_atime_quantum.attr,
-- 
cgit v1.2.3


From 0b7580c786a5feda6291fe68ead3a1b92b6b35b8 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Di Nitto <fabbione@ubuntu.com>
Date: Thu, 15 Nov 2007 13:48:52 +0000
Subject: [GFS2] Check for installation of mount helpers for DLM mounts

The patch is a fix to abort mount if the mount.gfs* and possibly the
umount.* helpers are missing from /sbin.

While we do what we can to guarantee that they are installed properly in
userland (CVS HEAD), we want to make sure that mount still aborts properly.

The only sign of missing helpers is that lock_dlm will receive no mount options
at all. According to David the problem does not exist for lock_nolock as the
helpers are not required.

The patch has been tested for both gfs and gfs2 and it works as expected. The
lack of mount.gfs* will generate an error that is propagated to mount:

root@node1:~# mount -t  gfs2 /dev/nbd2 /mnt/
mount: wrong fs type, bad option, bad superblock on /dev/nbd2,
       missing codepage or helper program, or other error
       In some cases useful info is found in syslog - try
       dmesg | tail  or so

[ 3513.303346] GFS2: fsid=: Trying to join cluster "lock_dlm", "gutsy:gfs2"
[ 3513.304546] DLM/GFS2/GFS ERROR: (u)mount helpers are not installed properly!
[ 3513.306290] GFS2: fsid=: can't mount proto=lock_dlm, table=gutsy:gfs2, hostdata=

Note that it will also prevent mount from hanging or failing silently
or with strange errors that would require the cluster to be rebooted/restarted
before you can actually mount the filesystem again.

Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/mount.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 41c5b04caaba..ab301023094f 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -67,6 +67,12 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
 	memset(data, 0, 256);
 	strncpy(data, data_arg, 255);
 
+	if (!strlen(data)) {
+		printk(KERN_ERR
+		       "DLM/GFS2/GFS ERROR: (u)mount helpers are not installed!\n");
+		return -EINVAL;
+	}
+
 	for (options = data; (x = strsep(&options, ":")); ) {
 		if (!*x)
 			continue;
-- 
cgit v1.2.3


From 00c134756c5ad570a1ad3d6f93a67fc9c25a67ea Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 15 Nov 2007 09:01:13 -0600
Subject: [GFS2] tidy up error message

Print error with log_error() to be consistent with others.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/mount.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index ab301023094f..f2efff424224 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -68,8 +68,7 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
 	strncpy(data, data_arg, 255);
 
 	if (!strlen(data)) {
-		printk(KERN_ERR
-		       "DLM/GFS2/GFS ERROR: (u)mount helpers are not installed!\n");
+		log_error("no mount options, (u)mount helpers not installed");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 1a2781cfa5ed8eb82bb311d684f268c1822dae69 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Di Nitto <fabbione@ubuntu.com>
Date: Fri, 16 Nov 2007 09:50:40 +0000
Subject: [GFS2] Fix runtime issue with UP kernels

The issue is indeed UP vs SMP and it is totally random.

spin_is_locked() is a bad assertion because there is no correct answer on UP.
On UP, spin_is_locked() has to return either one value or another, always.

This means that in my setup I am lucky enough to trigger the issue and you
are lucky enough not to.

The attached patch removes the bogus calls to BUG_ON and, according to David
(in CC, and thanks for the long explanation of the problem), we can rely upon
things like lockdep to find the problems that they might be trying to catch.

Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 3 ---
 fs/gfs2/log.c   | 2 --
 2 files changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d83df6888402..a7f3c462d4fe 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -217,7 +217,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 	if (atomic_dec_and_test(&gl->gl_ref)) {
 		hlist_del(&gl->gl_list);
 		write_unlock(gl_lock_addr(gl->gl_hash));
-		BUG_ON(spin_is_locked(&gl->gl_spin));
 		gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
 		gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
 		gfs2_assert(sdp, list_empty(&gl->gl_holders));
@@ -460,7 +459,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
 
 static void gfs2_demote_wake(struct gfs2_glock *gl)
 {
-	BUG_ON(!spin_is_locked(&gl->gl_spin));
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
         clear_bit(GLF_DEMOTE, &gl->gl_flags);
         smp_mb__after_clear_bit();
@@ -680,7 +678,6 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
 	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
 	run_queue(gl);
-	BUG_ON(!spin_is_locked(&gl->gl_spin));
 	spin_unlock(&gl->gl_spin);
 }
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4dcc7a8cda22..96dcf050e6c9 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -92,8 +92,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 	struct buffer_head *bh;
 	int retry;
 
-	BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
-
 	do {
 		retry = 0;
 
-- 
cgit v1.2.3


From 002ef1dc63ded14507c110d3cf83d0c3f51374ab Mon Sep 17 00:00:00 2001
From: Ryan O'Hara <rohara@redhat.com>
Date: Wed, 21 Nov 2007 11:54:54 -0600
Subject: [GFS2] remove unnecessary permission checks

Remove read/write permission() checks from xattr operations.
VFS layer is already handling permission for xattrs via the
xattr_permission() call, so there is no need for gfs2 to
check permissions. Furthermore, using permission() for SELinux
xattrs ops is incorrect.

Signed-off-by: Ryan O'Hara <rohara@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/eaops.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index aa8dbf303f6d..ef91b6e893a0 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -59,9 +59,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
 static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	struct inode *inode = &ip->i_inode;
-	int error = permission(inode, MAY_READ, NULL);
-	if (error)
-		return error;
 
 	return gfs2_ea_get_i(ip, er);
 }
@@ -70,14 +67,6 @@ static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	struct inode *inode = &ip->i_inode;
 
-	if (S_ISREG(inode->i_mode) ||
-	    (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
-		int error = permission(inode, MAY_WRITE, NULL);
-		if (error)
-			return error;
-	} else
-		return -EPERM;
-
 	return gfs2_ea_set_i(ip, er);
 }
 
@@ -85,14 +74,6 @@ static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	struct inode *inode = &ip->i_inode;
 
-	if (S_ISREG(inode->i_mode) ||
-	    (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
-		int error = permission(inode, MAY_WRITE, NULL);
-		if (error)
-			return error;
-	} else
-		return -EPERM;
-
 	return gfs2_ea_remove_i(ip, er);
 }
 
@@ -108,8 +89,6 @@ static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	     GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
 		return -EOPNOTSUPP;
 
-
-
 	return gfs2_ea_get_i(ip, er);
 }
 
@@ -173,9 +152,6 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	struct inode *inode = &ip->i_inode;
-	int error = permission(inode, MAY_READ, NULL);
-	if (error)
-		return error;
 
 	return gfs2_ea_get_i(ip, er);
 }
@@ -183,9 +159,6 @@ static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	struct inode *inode = &ip->i_inode;
-	int error = permission(inode, MAY_WRITE, NULL);
-	if (error)
-		return error;
 
 	return gfs2_ea_set_i(ip, er);
 }
@@ -193,9 +166,6 @@ static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	struct inode *inode = &ip->i_inode;
-	int error = permission(inode, MAY_WRITE, NULL);
-	if (error)
-		return error;
 
 	return gfs2_ea_remove_i(ip, er);
 }
-- 
cgit v1.2.3


From 6a69a23f7df18f39e4a084e10b62ff4a144b05a5 Mon Sep 17 00:00:00 2001
From: Fabio Massimo Di Nitto <fabbione@ubuntu.com>
Date: Tue, 27 Nov 2007 06:16:42 +0100
Subject: [GFS2] Fix build warnings

Hi Steven,

Steven Whitehouse wrote:
> Hi,
>
> Now in the -nmw git tree. Thanks,
>
> Steve.
>
> On Wed, 2007-11-21 at 11:54 -0600, Ryan O'Hara wrote:

this patch introduces a bunch of build warnings by leaving around

struct inode *inode = &ip->i_inode;

The patch in attachment cleans them up. Please apply.

Signed-off-by: Fabio Massimo Di Nitto <fabbione@ubuntu.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/eaops.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index ef91b6e893a0..14fbd95fd664 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -58,22 +58,16 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
 
 static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
-	struct inode *inode = &ip->i_inode;
-
 	return gfs2_ea_get_i(ip, er);
 }
 
 static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
-	struct inode *inode = &ip->i_inode;
-
 	return gfs2_ea_set_i(ip, er);
 }
 
 static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
-	struct inode *inode = &ip->i_inode;
-
 	return gfs2_ea_remove_i(ip, er);
 }
 
@@ -151,22 +145,16 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 
 static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
-	struct inode *inode = &ip->i_inode;
-
 	return gfs2_ea_get_i(ip, er);
 }
 
 static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
-	struct inode *inode = &ip->i_inode;
-
 	return gfs2_ea_set_i(ip, er);
 }
 
 static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
-	struct inode *inode = &ip->i_inode;
-
 	return gfs2_ea_remove_i(ip, er);
 }
 
-- 
cgit v1.2.3


From bcd405599faa16cf32a3d3f1ce6a1e12cb37fede Mon Sep 17 00:00:00 2001
From: "Fabio M. Di Nitto" <fabbione@ubuntu.com>
Date: Wed, 28 Nov 2007 16:22:09 +0100
Subject: [GFS2] Remove unrequired code

Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/eaops.c | 42 ++++++------------------------------------
 1 file changed, 6 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index 14fbd95fd664..f114ba2b3557 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -56,21 +56,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
 	return type;
 }
 
-static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	return gfs2_ea_get_i(ip, er);
-}
-
-static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	return gfs2_ea_set_i(ip, er);
-}
-
-static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	return gfs2_ea_remove_i(ip, er);
-}
-
 static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 {
 	if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
@@ -143,25 +128,10 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	return gfs2_ea_remove_i(ip, er);
 }
 
-static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	return gfs2_ea_get_i(ip, er);
-}
-
-static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	return gfs2_ea_set_i(ip, er);
-}
-
-static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	return gfs2_ea_remove_i(ip, er);
-}
-
 static const struct gfs2_eattr_operations gfs2_user_eaops = {
-	.eo_get = user_eo_get,
-	.eo_set = user_eo_set,
-	.eo_remove = user_eo_remove,
+	.eo_get = gfs2_ea_get_i,
+	.eo_set = gfs2_ea_set_i,
+	.eo_remove = gfs2_ea_remove_i,
 	.eo_name = "user",
 };
 
@@ -173,9 +143,9 @@ const struct gfs2_eattr_operations gfs2_system_eaops = {
 };
 
 static const struct gfs2_eattr_operations gfs2_security_eaops = {
-	.eo_get = security_eo_get,
-	.eo_set = security_eo_set,
-	.eo_remove = security_eo_remove,
+	.eo_get = gfs2_ea_get_i,
+	.eo_set = gfs2_ea_set_i,
+	.eo_remove = gfs2_ea_remove_i,
 	.eo_name = "security",
 };
 
-- 
cgit v1.2.3


From c97bfe4351771675963e02f34d31e206fd2d7150 Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Thu, 29 Nov 2007 17:56:51 -0500
Subject: [GFS2] Remove lock methods for lock_nolock protocol

GFS2 supports two modes of locking: lock_nolock for single-node filesystems
and lock_dlm for cluster-mode locking. The gfs2 lock methods are removed from
the file operations table for the lock_nolock protocol. This allows the VFS to
handle posix lock and flock logic just like other in-tree filesystems, without
duplication.

Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c     | 11 +++++++++--
 fs/gfs2/ops_file.c  | 36 ++++++++++++++++++++++++------------
 fs/gfs2/ops_inode.h |  2 ++
 3 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 92959d093adf..53bca9978fb5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -131,14 +131,21 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
 
 void gfs2_set_iop(struct inode *inode)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	umode_t mode = inode->i_mode;
 
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
-		inode->i_fop = &gfs2_file_fops;
+		if (sdp->sd_args.ar_localflocks)
+			inode->i_fop = &gfs2_file_fops_nolock;
+		else
+			inode->i_fop = &gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
-		inode->i_fop = &gfs2_dir_fops;
+		if (sdp->sd_args.ar_localflocks)
+			inode->i_fop = &gfs2_dir_fops_nolock;
+		else
+			inode->i_fop = &gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index ad5daaa6babc..db76ac1947e7 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -615,15 +615,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 	if (__mandatory_lock(&ip->i_inode))
 		return -ENOLCK;
 
-	if (sdp->sd_args.ar_localflocks) {
-		if (IS_GETLK(cmd)) {
-			posix_test_lock(file, fl);
-			return 0;
-		} else {
-			return posix_lock_file_wait(file, fl);
-		}
-	}
-
 	if (cmd == F_CANCELLK) {
 		/* Hack: */
 		cmd = F_SETLK;
@@ -716,9 +707,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	if (__mandatory_lock(&ip->i_inode))
 		return -ENOLCK;
 
-	if (sdp->sd_args.ar_localflocks)
-		return flock_lock_file_wait(file, fl);
-
 	if (fl->fl_type == F_UNLCK) {
 		do_unflock(file, fl);
 		return 0;
@@ -755,3 +743,27 @@ const struct file_operations gfs2_dir_fops = {
 	.flock		= gfs2_flock,
 };
 
+const struct file_operations gfs2_file_fops_nolock = {
+	.llseek		= gfs2_llseek,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.mmap		= gfs2_mmap,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+	.setlease	= gfs2_setlease,
+};
+
+const struct file_operations gfs2_dir_fops_nolock = {
+	.readdir	= gfs2_readdir,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+};
+
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index edb519cb05ee..fd8cee231e1d 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -18,6 +18,8 @@ extern const struct inode_operations gfs2_symlink_iops;
 extern const struct inode_operations gfs2_dev_iops;
 extern const struct file_operations gfs2_file_fops;
 extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
 
-- 
cgit v1.2.3


From 292c8c14cace19c94c6abe25506310239daf949e Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Thu, 29 Nov 2007 14:13:54 -0600
Subject: [GFS2] patch to check for recursive lock requests in gfs2_rename code
 path

A certain scenario in the rename code path triggers a kernel BUG()
because it accidentally does recursive locking. The first lock is
requested to unlink an already existing inode (replacing a file) and the
second lock is requested when the destination directory needs to allocate
some space. It is rare for both events to happen during the same rename
call, and even rarer for both to try to lock the same rgrp. It is,
however, possible.
https://bugzilla.redhat.com/show_bug.cgi?id=404711

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 09848aac45f6..e0ee195558d3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1063,22 +1063,30 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	int flags = LM_FLAG_TRY;
 	int skipped = 0;
 	int loops = 0;
-	int error;
+	int error, rg_locked;
 
 	/* Try recently successful rgrps */
 
 	rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
 
 	while (rgd) {
-		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
-					   LM_FLAG_TRY, &al->al_rgd_gh);
+		rg_locked = 0;
+
+		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+			rg_locked = 1;
+			error = 0;
+		} else {
+			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+						   LM_FLAG_TRY, &al->al_rgd_gh);
+		}
 		switch (error) {
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
 			if (rgd->rd_flags & GFS2_RDF_CHECK)
 				inode = try_rgrp_unlink(rgd, last_unlinked);
-			gfs2_glock_dq_uninit(&al->al_rgd_gh);
+			if (!rg_locked)
+				gfs2_glock_dq_uninit(&al->al_rgd_gh);
 			if (inode)
 				return inode;
 			rgd = recent_rgrp_next(rgd, 1);
@@ -1098,15 +1106,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	begin = rgd = forward_rgrp_get(sdp);
 
 	for (;;) {
-		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
-					  &al->al_rgd_gh);
+		rg_locked = 0;
+
+		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+			rg_locked = 1;
+			error = 0;
+		} else {
+			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+						   &al->al_rgd_gh);
+		}
 		switch (error) {
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
 			if (rgd->rd_flags & GFS2_RDF_CHECK)
 				inode = try_rgrp_unlink(rgd, last_unlinked);
-			gfs2_glock_dq_uninit(&al->al_rgd_gh);
+			if (!rg_locked)
+				gfs2_glock_dq_uninit(&al->al_rgd_gh);
 			if (inode)
 				return inode;
 			break;
@@ -1213,7 +1229,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 			     al->al_line);
 
 	al->al_rgd = NULL;
-	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+	if (al->al_rgd_gh.gh_gl)
+		gfs2_glock_dq_uninit(&al->al_rgd_gh);
 	if (ip != GFS2_I(sdp->sd_rindex))
 		gfs2_glock_dq_uninit(&al->al_ri_gh);
 }
-- 
cgit v1.2.3


From dbee2199c37336e89060fbe9abdfd1ca8454372a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 30 Nov 2007 08:17:15 +0000
Subject: [GFS2] Remove unused variable

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index db76ac1947e7..2569c13eb108 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -700,7 +700,6 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
-- 
cgit v1.2.3


From 2066b58b0a038d7aedd24133677efb8856cac3a1 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 6 Dec 2007 09:35:25 -0600
Subject: [GFS2] use pid for plock owner for nfs clients

The fl_owner is that of lockd when posix locks arrive from nfs
clients, so it can't be used to distinguish between lock holders.
Use fl_pid as owner instead; it's the pid of the process on the
nfs client.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/plock.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1f7b038530b4..2ebd374b3143 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -89,15 +89,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
 	op->info.number		= name->ln_number;
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
-	op->info.owner		= (__u64)(long) fl->fl_owner;
 	if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+		/* fl_owner is lockd which doesn't distinguish
+		   processes on the nfs client */
+		op->info.owner	= (__u64) fl->fl_pid;
 		xop->callback	= fl->fl_lmops->fl_grant;
 		locks_init_lock(&xop->flc);
 		locks_copy_lock(&xop->flc, fl);
 		xop->fl		= fl;
 		xop->file	= file;
-	} else
+	} else {
+		op->info.owner	= (__u64)(long) fl->fl_owner;
 		xop->callback	= NULL;
+	}
 
 	send_op(op);
 
@@ -203,7 +207,10 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
 	op->info.number		= name->ln_number;
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
-	op->info.owner		= (__u64)(long) fl->fl_owner;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
 
 	send_op(op);
 	wait_event(recv_wq, (op->done != 0));
@@ -242,7 +249,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
 	op->info.number		= name->ln_number;
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
-	op->info.owner		= (__u64)(long) fl->fl_owner;
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+		op->info.owner	= (__u64) fl->fl_pid;
+	else
+		op->info.owner	= (__u64)(long) fl->fl_owner;
 
 	send_op(op);
 	wait_event(recv_wq, (op->done != 0));
-- 
cgit v1.2.3


From e9e1ef2b6ee401d7c1e1eb38052857b4b206d172 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 10 Dec 2007 14:13:27 -0600
Subject: [GFS2] Remove function gfs2_get_block

This patch is just a cleanup.  Function gfs2_get_block() just calls
function gfs2_block_map() with the last two parameters reversed.  By
reordering the parameters of gfs2_block_map() to match, it may be called
directly and function gfs2_get_block() may be eliminated altogether.
Since this function is called for every block operation,
this streamlines the code and makes it a little bit more efficient.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        |  8 ++++----
 fs/gfs2/bmap.h        |  2 +-
 fs/gfs2/log.c         |  2 +-
 fs/gfs2/ops_address.c | 30 +++++++-----------------------
 fs/gfs2/ops_address.h |  2 --
 fs/gfs2/ops_file.c    |  2 +-
 fs/gfs2/quota.c       |  4 ++--
 fs/gfs2/recovery.c    |  2 +-
 8 files changed, 17 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1cfd493e30fb..49486029edc2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -452,8 +452,8 @@ static inline void bmap_unlock(struct inode *inode, int create)
  * Returns: errno
  */
 
-int gfs2_block_map(struct inode *inode, u64 lblock, int create,
-		   struct buffer_head *bh_map)
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+		   struct buffer_head *bh_map, int create)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -559,7 +559,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 	BUG_ON(!new);
 
 	bh.b_size = 1 << (inode->i_blkbits + 5);
-	ret = gfs2_block_map(inode, lblock, create, &bh);
+	ret = gfs2_block_map(inode, lblock, &bh, create);
 	*extlen = bh.b_size >> inode->i_blkbits;
 	*dblock = bh.b_blocknr;
 	if (buffer_new(&bh))
@@ -909,7 +909,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 	err = 0;
 
 	if (!buffer_mapped(bh)) {
-		gfs2_get_block(inode, iblock, bh, 0);
+		gfs2_block_map(inode, iblock, bh, 0);
 		/* unmapped? It's a hole - nothing to do */
 		if (!buffer_mapped(bh))
 			goto unlock;
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index ac2fd04370dc..4e6cde2943bd 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -15,7 +15,7 @@ struct gfs2_inode;
 struct page;
 
 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
+int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
 
 int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 96dcf050e6c9..14333d81cf7d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -344,7 +344,7 @@ static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
 
 	bh_map.b_size = 1 << inode->i_blkbits;
-	error = gfs2_block_map(inode, lbn, 0, &bh_map);
+	error = gfs2_block_map(inode, lbn, &bh_map, 0);
 	if (error || !bh_map.b_blocknr)
 		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error,
 		       (unsigned long long)bh_map.b_blocknr, lbn);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 7353933483bb..8f94e306c862 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -58,22 +58,6 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
 	}
 }
 
-/**
- * gfs2_get_block - Fills in a buffer head with details about a block
- * @inode: The inode
- * @lblock: The block number to look up
- * @bh_result: The buffer head to return the result in
- * @create: Non-zero if we may add block to the file
- *
- * Returns: errno
- */
-
-int gfs2_get_block(struct inode *inode, sector_t lblock,
-	           struct buffer_head *bh_result, int create)
-{
-	return gfs2_block_map(inode, lblock, create, bh_result);
-}
-
 /**
  * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
  * @inode: The inode
@@ -89,7 +73,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 {
 	int error;
 
-	error = gfs2_block_map(inode, lblock, 0, bh_result);
+	error = gfs2_block_map(inode, lblock, bh_result, 0);
 	if (error)
 		return error;
 	if (!buffer_mapped(bh_result))
@@ -100,7 +84,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
 				 struct buffer_head *bh_result, int create)
 {
-	return gfs2_block_map(inode, lblock, 0, bh_result);
+	return gfs2_block_map(inode, lblock, bh_result, 0);
 }
 
 /**
@@ -504,7 +488,7 @@ static int __gfs2_readpage(void *file, struct page *page)
 		error = stuffed_readpage(ip, page);
 		unlock_page(page);
 	} else {
-		error = mpage_readpage(page, gfs2_get_block);
+		error = mpage_readpage(page, gfs2_block_map);
 	}
 
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -598,7 +582,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
  *    Any I/O we ignore at this time will be done via readpage later.
  * 2. We don't handle stuffed files here we let readpage do the honours.
  * 3. mpage_readpages() does most of the heavy lifting in the common case.
- * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
+ * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
  */
 
 static int gfs2_readpages(struct file *file, struct address_space *mapping,
@@ -615,7 +599,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 	if (unlikely(ret))
 		goto out_uninit;
 	if (!gfs2_is_stuffed(ip))
-		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
+		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
 	gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
@@ -710,7 +694,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 prepare_write:
-	error = block_prepare_write(page, from, to, gfs2_get_block);
+	error = block_prepare_write(page, from, to, gfs2_block_map);
 out:
 	if (error == 0)
 		return 0;
@@ -923,7 +907,7 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
 		return 0;
 
 	if (!gfs2_is_stuffed(ip))
-		dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
+		dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
 
 	gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index d3b76d0cdc81..5da21285bba4 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -14,8 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 
-extern int gfs2_get_block(struct inode *inode, sector_t lblock,
-			  struct buffer_head *bh_result, int create);
 extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
 extern int gfs2_internal_read(struct gfs2_inode *ip,
 			      struct file_ra_state *ra_state,
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 2569c13eb108..597f7ff2bc11 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -323,7 +323,7 @@ static int gfs2_allocate_page_backing(struct page *page)
 	do {
 		bh.b_state = 0;
 		bh.b_size = size;
-		gfs2_block_map(inode, lblock, 1, &bh);
+		gfs2_block_map(inode, lblock, &bh, 1);
 		if (!buffer_mapped(&bh))
 			return -EIO;
 		size -= bh.b_size;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 4996f0ef3007..8b4c20c49ca7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -276,7 +276,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 	offset = qd->qd_slot % sdp->sd_qc_per_block;;
 
 	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
-	error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
+	error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
 	if (error)
 		goto fail;
 	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
@@ -645,7 +645,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	}
 
 	if (!buffer_mapped(bh)) {
-		gfs2_get_block(inode, iblock, bh, 1);
+		gfs2_block_map(inode, iblock, bh, 1);
 		if (!buffer_mapped(bh))
 			goto unlock;
 	}
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index beb6c7ac0086..27c994f2d1f0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -391,7 +391,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 	lblock = head->lh_blkno;
 	gfs2_replay_incr_blk(sdp, &lblock);
 	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
-	error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
+	error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
 	if (error)
 		return error;
 	if (!bh_map.b_blocknr) {
-- 
cgit v1.2.3


From da6dd40d59fa9617ed697b90114e197036901632 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Dec 2007 18:49:21 -0600
Subject: [GFS2] Journal extent mapping

This patch saves a little time when gfs2 writes to the journals by
keeping a mapping between logical and physical blocks on disk.
That's better than constantly looking up indirect pointers in
buffers when the journals have several levels of indirection
(which they typically do).

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     | 11 ++++++++-
 fs/gfs2/log.c        | 22 +++++++----------
 fs/gfs2/ops_fstype.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/gfs2/super.c      | 13 ++++++++--
 4 files changed, 97 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 330f4c73d0e7..51166c12c5d7 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -360,8 +360,17 @@ struct gfs2_ail {
 	u64 ai_sync_gen;
 };
 
+struct gfs2_journal_extent {
+	struct list_head extent_list;
+
+	unsigned int lblock; /* First logical block */
+	u64 dblock; /* First disk block */
+	u64 blocks;
+};
+
 struct gfs2_jdesc {
 	struct list_head jd_list;
+	struct list_head extent_list;
 
 	struct inode *jd_inode;
 	unsigned int jd_jid;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 14333d81cf7d..69a583ec43c7 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -339,18 +339,14 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
 
 static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 {
-	struct inode *inode = sdp->sd_jdesc->jd_inode;
-	int error;
-	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-
-	bh_map.b_size = 1 << inode->i_blkbits;
-	error = gfs2_block_map(inode, lbn, &bh_map, 0);
-	if (error || !bh_map.b_blocknr)
-		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error,
-		       (unsigned long long)bh_map.b_blocknr, lbn);
-	gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
-
-	return bh_map.b_blocknr;
+	struct gfs2_journal_extent *je;
+
+	list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
+		if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
+			return je->dblock + lbn;
+	}
+
+	return -1;
 }
 
 /**
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1bba6ac0bcac..0921f17a164c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -21,6 +21,7 @@
 
 #include "gfs2.h"
 #include "incore.h"
+#include "bmap.h"
 #include "daemon.h"
 #include "glock.h"
 #include "glops.h"
@@ -302,6 +303,68 @@ out:
 	return error;
 }
 
+/**
+ * map_journal_extents - create a reusable "extent" mapping from all logical
+ * blocks to all physical blocks for the given journal.  This will save
+ * us time when writing journal blocks.  Most journals will have only one
+ * extent that maps all their logical blocks.  That's because gfs2.mkfs
+ * arranges the journal blocks sequentially to maximize performance.
+ * So the extent would map the first block for the entire file length.
+ * However, gfs2_jadd can happen while file activity is happening, so
+ * those journals may not be sequential.  Less likely is the case where
+ * the users created their own journals by mounting the metafs and
+ * laying it out.  But it's still possible.  These journals might have
+ * several extents.
+ *
+ * TODO: This should be done in bigger chunks rather than one block at a time,
+ *       but since it's only done at mount time, I'm not worried about the
+ *       time it takes.
+ */
+static int map_journal_extents(struct gfs2_sbd *sdp)
+{
+	struct gfs2_jdesc *jd = sdp->sd_jdesc;
+	unsigned int lb;
+	u64 db, prev_db; /* logical block, disk block, prev disk block */
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_journal_extent *jext = NULL;
+	struct buffer_head bh;
+	int rc = 0;
+
+	INIT_LIST_HEAD(&jd->extent_list);
+	prev_db = 0;
+
+	for (lb = 0; lb < ip->i_di.di_size / sdp->sd_sb.sb_bsize; lb++) {
+		bh.b_state = 0;
+		bh.b_blocknr = 0;
+		bh.b_size = 1 << ip->i_inode.i_blkbits;
+		rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
+		db = bh.b_blocknr;
+		if (rc || !db) {
+			printk(KERN_INFO "GFS2 journal mapping error %d: lb="
+			       "%u db=%llu\n", rc, lb, (unsigned long long)db);
+			break;
+		}
+		if (!prev_db || db != prev_db + 1) {
+			jext = kzalloc(sizeof(struct gfs2_journal_extent),
+				       GFP_KERNEL);
+			if (!jext) {
+				printk(KERN_INFO "GFS2 error: out of memory "
+				       "mapping journal extents.\n");
+				rc = -ENOMEM;
+				break;
+			}
+			jext->dblock = db;
+			jext->lblock = lb;
+			jext->blocks = 1;
+			list_add_tail(&jext->extent_list, &jd->extent_list);
+		} else {
+			jext->blocks++;
+		}
+		prev_db = db;
+	}
+	return rc;
+}
+
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct gfs2_holder ji_gh;
@@ -377,6 +440,9 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			goto fail_jinode_gh;
 		}
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+
+		/* Map the extents for this journal's blocks */
+		map_journal_extents(sdp);
 	}
 
 	if (sdp->sd_lockstruct.ls_first) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2e74792ee487..22e09660d648 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -416,8 +416,9 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 
 void gfs2_jindex_free(struct gfs2_sbd *sdp)
 {
-	struct list_head list;
+	struct list_head list, *head;
 	struct gfs2_jdesc *jd;
+	struct gfs2_journal_extent *jext;
 
 	spin_lock(&sdp->sd_jindex_spin);
 	list_add(&list, &sdp->sd_jindex_list);
@@ -427,6 +428,14 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
 
 	while (!list_empty(&list)) {
 		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+		head = &jd->extent_list;
+		while (!list_empty(head)) {
+			jext = list_entry(head->next,
+					  struct gfs2_journal_extent,
+					  extent_list);
+			list_del(&jext->extent_list);
+			kfree(jext);
+		}
 		list_del(&jd->jd_list);
 		iput(jd->jd_inode);
 		kfree(jd);
-- 
cgit v1.2.3


From 0d0868bde33273a200b33e54f4fad6099ad0c566 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Dec 2007 18:51:25 -0600
Subject: [GFS2] Get rid of useless "found" variable in quota.c

This just eliminates an unused variable from the quota code.
Not likely to be a time saver.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8b4c20c49ca7..60cc50fe15b4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -273,7 +273,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 	}
 
 	block = qd->qd_slot / sdp->sd_qc_per_block;
-	offset = qd->qd_slot % sdp->sd_qc_per_block;;
+	offset = qd->qd_slot % sdp->sd_qc_per_block;
 
 	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
 	error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
@@ -1016,7 +1016,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 	struct gfs2_alloc *al = &ip->i_alloc;
 	struct gfs2_quota_data *qd;
 	unsigned int x;
-	unsigned int found = 0;
 
 	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
 		return;
@@ -1029,7 +1028,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 		if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
 		    (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
 			do_qc(qd, change);
-			found++;
 		}
 	}
 }
-- 
cgit v1.2.3


From 5fdc2eeb5d1d3800367f471690b01fcd1fd5b963 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Dec 2007 19:00:16 -0600
Subject: [GFS2] Run through full bitmaps quicker in gfs2_bitfit

This eliminates the unused parameter rgd that was passed into gfs2_bitfit.

This also changes the gfs2_bitfit code that searches for free (or used) blocks.
Before, the code was trying to check for bytes that indicated 4 blocks in
the undesired state.  The problem is, it was spending more time trying to
do this than it was actually saving.  This version only optimizes the case
where we're looking for free blocks, and it checks a machine word at a time.
So on 32-bit machines, it will check 32 bits (16 blocks) and on 64-bit
machines, it will check 64 bits (32 blocks) at a time.  The compiler
optimizes that quite well and we save some time, especially when running
through full bitmaps (like the bitmaps allocated for the journals).

There's probably a more elegant or optimized way to do this, but I haven't
thought of it yet.  I'm open to suggestions.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 54 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e0ee195558d3..d7ff9cf6653f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -126,41 +126,46 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
  * Return: the block number (bitmap buffer scope) that was found
  */
 
-static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
-			    unsigned int buflen, u32 goal,
-			    unsigned char old_state)
+static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
+		       unsigned char old_state)
 {
-	unsigned char *byte, *end, alloc;
+	unsigned char *byte;
 	u32 blk = goal;
-	unsigned int bit;
+	unsigned int bit, bitlong;
+	unsigned long *plong, plong55;
+	static int c = 0;
 
 	byte = buffer + (goal / GFS2_NBBY);
+	plong = buffer + (goal / GFS2_NBBY);
 	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	end = buffer + buflen;
-	alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0;
-
-	while (byte < end) {
-		/* If we're looking for a free block we can eliminate all
-		   bitmap settings with 0x55, which represents four data
-		   blocks in a row.  If we're looking for a data block, we can
-		   eliminate 0x00 which corresponds to four free blocks. */
-		if ((*byte & 0x55) == alloc) {
-			blk += (8 - bit) >> 1;
-
-			bit = 0;
-			byte++;
-
+	bitlong = bit;
+#if BITS_PER_LONG == 32
+	plong55 = 0x55555555;
+#else
+	plong55 = 0x5555555555555555;
+#endif
+	while (byte < buffer + buflen) {
+
+		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
+			plong++;
+			byte += sizeof(unsigned long);
+			blk += sizeof(unsigned long) * GFS2_NBBY;
 			continue;
 		}
-
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+			c++;
 			return blk;
-
+		}
 		bit += GFS2_BIT_SIZE;
 		if (bit >= 8) {
 			bit = 0;
 			byte++;
 		}
+		bitlong += GFS2_BIT_SIZE;
+		if (bitlong >= sizeof(unsigned long) * 8) {
+			bitlong = 0;
+			plong++;
+		}
 
 		blk++;
 	}
@@ -1318,11 +1323,10 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
 		   bitmaps, so we must search the originals for that. */
 		if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
-			blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
+			blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
 					  bi->bi_len, goal, old_state);
 		else
-			blk = gfs2_bitfit(rgd,
-					  bi->bi_bh->b_data + bi->bi_offset,
+			blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
 					  bi->bi_len, goal, old_state);
 		if (blk != BFITNOENT)
 			break;
-- 
cgit v1.2.3


From 398bbe68321947f6763fbc259a01eb548ce19408 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Dec 2007 19:13:54 -0600
Subject: [GFS2] Reorganize function gfs2_glmutex_lock

This patch optimizes the function gfs2_glmutex_lock.
The basic theory is: Why bother initializing a holder, setting up
wait bits and then waiting on them, if you know the glock can be
yours?  So the holder stuff is placed inside the "if" checking if the
glock is locked.  This one needs careful scrutiny because changing
anything to do with locking should strike terror into one's heart.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a7f3c462d4fe..80e09c50590a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -620,26 +620,21 @@ static void run_queue(struct gfs2_glock *gl)
 
 static void gfs2_glmutex_lock(struct gfs2_glock *gl)
 {
-	struct gfs2_holder gh;
-
-	gfs2_holder_init(gl, 0, 0, &gh);
-	if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
-		BUG();
-
 	spin_lock(&gl->gl_spin);
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+		struct gfs2_holder gh;
+
+		gfs2_holder_init(gl, 0, 0, &gh);
+		set_bit(HIF_WAIT, &gh.gh_iflags);
 		list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+		spin_unlock(&gl->gl_spin);
+		wait_on_holder(&gh);
+		gfs2_holder_uninit(&gh);
 	} else {
 		gl->gl_owner_pid = current->pid;
 		gl->gl_ip = (unsigned long)__builtin_return_address(0);
-		clear_bit(HIF_WAIT, &gh.gh_iflags);
-		smp_mb();
-		wake_up_bit(&gh.gh_iflags, HIF_WAIT);
+		spin_unlock(&gl->gl_spin);
 	}
-	spin_unlock(&gl->gl_spin);
-
-	wait_on_holder(&gh);
-	gfs2_holder_uninit(&gh);
 }
 
 /**
-- 
cgit v1.2.3


From b0d5fd307463405fe1f57494fbb37f810715ed6d Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Dec 2007 19:16:09 -0600
Subject: [GFS2] Only fetch the dinode once in block_map

Function gfs2_block_map was often looking up the disk inode twice.
This optimizes it so that it only does so once.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 49486029edc2..224114166529 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -469,6 +469,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
 	struct metapath mp;
 	u64 size;
+	struct buffer_head *dibh = NULL;
 
 	BUG_ON(maxlen == 0);
 
@@ -499,6 +500,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	error = gfs2_meta_inode_buffer(ip, &bh);
 	if (error)
 		goto out_fail;
+	dibh = bh;
+	get_bh(dibh);
 
 	for (x = 0; x < end_of_metadata; x++) {
 		lookup_block(ip, bh, x, &mp, create, &new, &dblock);
@@ -517,13 +520,8 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 		if (boundary)
 			set_buffer_boundary(bh_map);
 		if (new) {
-			struct buffer_head *dibh;
-			error = gfs2_meta_inode_buffer(ip, &dibh);
-			if (!error) {
-				gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-				gfs2_dinode_out(ip, dibh->b_data);
-				brelse(dibh);
-			}
+			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+			gfs2_dinode_out(ip, dibh->b_data);
 			set_buffer_new(bh_map);
 			goto out_brelse;
 		}
@@ -544,6 +542,8 @@ out_brelse:
 out_ok:
 	error = 0;
 out_fail:
+	if (dibh)
+		brelse(dibh);
 	bmap_unlock(inode, create);
 	return error;
 }
-- 
cgit v1.2.3


From 15c7cee7995a9013f1b2f31a15b70e1d2e8ae501 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 11 Dec 2007 19:29:17 -0600
Subject: [GFS2] Function meta_read optimization

This patch optimizes function gfs2_meta_read.  Basically, gfs2_meta_wait
was being called regardless of whether a disk read was requested.
This just pulls that wait into the if that triggers the read.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/meta_io.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4b1aced9023d..3144d35a6261 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -222,13 +222,14 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   struct buffer_head **bhp)
 {
 	*bhp = getbuf(gl, blkno, CREATE);
-	if (!buffer_uptodate(*bhp))
+	if (!buffer_uptodate(*bhp)) {
 		ll_rw_block(READ_META, 1, bhp);
-	if (flags & DIO_WAIT) {
-		int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
-		if (error) {
-			brelse(*bhp);
-			return error;
+		if (flags & DIO_WAIT) {
+			int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+			if (error) {
+				brelse(*bhp);
+				return error;
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From b3513fca7e41965d85125c9770ce5f8fd4ff509a Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 12 Dec 2007 09:24:08 -0600
Subject: [GFS2] Incremental patch to fix compiler warning

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index d7ff9cf6653f..68c4bf363c46 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -133,10 +133,9 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
 	u32 blk = goal;
 	unsigned int bit, bitlong;
 	unsigned long *plong, plong55;
-	static int c = 0;
 
 	byte = buffer + (goal / GFS2_NBBY);
-	plong = buffer + (goal / GFS2_NBBY);
+	plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
 	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
 	bitlong = bit;
 #if BITS_PER_LONG == 32
@@ -152,10 +151,8 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
 			blk += sizeof(unsigned long) * GFS2_NBBY;
 			continue;
 		}
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
-			c++;
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
 			return blk;
-		}
 		bit += GFS2_BIT_SIZE;
 		if (bit >= 8) {
 			bit = 0;
-- 
cgit v1.2.3


From c3f60b6e3a7667f78a63b15cf09655ecfca757fc Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 12 Dec 2007 11:44:41 -0600
Subject: [GFS2] Eliminate the no longer needed sd_statfs_mutex

This patch eliminates the unneeded sd_statfs_mutex mutex but preserves
the ordering as discussed.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     | 1 -
 fs/gfs2/ops_fstype.c | 1 -
 fs/gfs2/super.c      | 4 ----
 3 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 51166c12c5d7..350b5169a9a0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -529,7 +529,6 @@ struct gfs2_sbd {
 	/* StatFS stuff */
 
 	spinlock_t sd_statfs_spin;
-	struct mutex sd_statfs_mutex;
 	struct gfs2_statfs_change_host sd_statfs_master;
 	struct gfs2_statfs_change_host sd_statfs_local;
 	unsigned long sd_statfs_sync_time;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 0921f17a164c..79f9bb365f10 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -60,7 +60,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	mutex_init(&sdp->sd_inum_mutex);
 	spin_lock_init(&sdp->sd_statfs_spin);
-	mutex_init(&sdp->sd_statfs_mutex);
 
 	spin_lock_init(&sdp->sd_rindex_spin);
 	mutex_init(&sdp->sd_rindex_mutex);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 22e09660d648..5d0017d313a3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -688,9 +688,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 	if (error)
 		return;
 
-	mutex_lock(&sdp->sd_statfs_mutex);
 	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-	mutex_unlock(&sdp->sd_statfs_mutex);
 
 	spin_lock(&sdp->sd_statfs_spin);
 	l_sc->sc_total += total;
@@ -738,9 +736,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 	if (error)
 		goto out_bh2;
 
-	mutex_lock(&sdp->sd_statfs_mutex);
 	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-	mutex_unlock(&sdp->sd_statfs_mutex);
 
 	spin_lock(&sdp->sd_statfs_spin);
 	m_sc->sc_total += l_sc->sc_total;
-- 
cgit v1.2.3


From fa3742fa8545df20e54aa0953a1873cca3a9bd92 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 12 Dec 2007 17:52:13 -0600
Subject: [GFS2] Minor correction

This is a small correction to my previously posted patch1.
It just changes a divide to a shift.  It's faster and doesn't
introduce odd dependencies on 32-bit compiles.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 79f9bb365f10..5537798af381 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -332,7 +332,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 	INIT_LIST_HEAD(&jd->extent_list);
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_di.di_size / sdp->sd_sb.sb_bsize; lb++) {
+	for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
-- 
cgit v1.2.3


From ff91cc9bb41b62bc4ea7d5ced396fabf97539df9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 14 Dec 2007 14:04:34 +0000
Subject: [GFS2] Fix log block mapper

A missing offset in the calculation.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 69a583ec43c7..91645259e135 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -343,7 +343,7 @@ static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 
 	list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
 		if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
-			return je->dblock + lbn;
+			return je->dblock + lbn - je->lblock;
 	}
 
 	return -1;
-- 
cgit v1.2.3


From 65a6290998f3d38b5c5e84423ae9e08bdd957095 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 2 Jan 2008 10:16:56 +0000
Subject: [GFS2] Remove unused variable

The go_drop_th function is never called or referenced.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 350b5169a9a0..745dada4085c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -131,7 +131,6 @@ struct gfs2_bufdata {
 struct gfs2_glock_operations {
 	void (*go_xmote_th) (struct gfs2_glock *gl);
 	void (*go_xmote_bh) (struct gfs2_glock *gl);
-	void (*go_drop_th) (struct gfs2_glock *gl);
 	void (*go_inval) (struct gfs2_glock *gl, int flags);
 	int (*go_demote_ok) (struct gfs2_glock *gl);
 	int (*go_lock) (struct gfs2_holder *gh);
-- 
cgit v1.2.3


From e5d9dc278c7f79c220e4506cc1ade2efa2ca73fd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 3 Jan 2008 11:31:38 +0000
Subject: [GFS2] Allow page migration for writeback and ordered pages

To improve performance on NUMA, we use the VM's standard page
migration for writeback and ordered pages. Probably we could
also do the same for journaled data, but that would need a
careful audit of the code, so will be the subject of a later
patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 8f94e306c862..e16ad8104495 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -1098,6 +1098,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
 	.direct_IO = gfs2_direct_IO,
+	.migratepage = buffer_migrate_page,
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
@@ -1112,6 +1113,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
 	.direct_IO = gfs2_direct_IO,
+	.migratepage = buffer_migrate_page,
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
-- 
cgit v1.2.3


From 0811a127cb83ad2e0355e5e3e30164d7ef0f2d65 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 3 Jan 2008 09:24:53 -0600
Subject: [GFS2] Initialize extent_list earlier

Here is a patch for the latest upstream GFS2 code:
The journal extent map needs to be initialized sooner than it
currently is.  Otherwise failed mount attempts (e.g. not enough
journals, etc.) may panic trying to access the uninitialized list.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 1 -
 fs/gfs2/super.c      | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5537798af381..43d511bba52d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -329,7 +329,6 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 	struct buffer_head bh;
 	int rc = 0;
 
-	INIT_LIST_HEAD(&jd->extent_list);
 	prev_db = 0;
 
 	for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5d0017d313a3..ef0562c3bc71 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -387,6 +387,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 		if (!jd)
 			break;
 
+		INIT_LIST_HEAD(&jd->extent_list);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
-- 
cgit v1.2.3


From 9656b2c14c6ee0806c90a6be41dec71117fc8f50 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 8 Jan 2008 08:14:30 +0000
Subject: [GFS2] Fix problems relating to execution of files on GFS2

This patch fixes a couple of problems which affected the execution of files
on GFS2. The first is that there was a corner case where inodes were not
always uptodate at the point at which permission checks were being carried
out. This was resulting in refusal of execute permission, but only on the
first lookup; subsequent requests worked correctly. The second was a problem
relating to incorrect updating of file sizes which was introduced with the
write_begin/end code for GFS2 a little while back.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/ops_address.c | 13 +++++--------
 fs/gfs2/ops_inode.c   | 12 +++++++++++-
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e16ad8104495..37406a379e7a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -848,14 +848,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 
-	if (likely(ret >= 0)) {
-		copied = ret;
-		if  ((pos + copied) > inode->i_size) {
-			di = (struct gfs2_dinode *)dibh->b_data;
-			ip->i_di.di_size = inode->i_size;
-			di->di_size = cpu_to_be64(inode->i_size);
-			mark_inode_dirty(inode);
-		}
+	if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+		di = (struct gfs2_dinode *)dibh->b_data;
+		ip->i_di.di_size = inode->i_size;
+		di->di_size = cpu_to_be64(inode->i_size);
+		mark_inode_dirty(inode);
 	}
 
 	if (inode == sdp->sd_rindex)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 291f0c7eaa3b..8386ab323e33 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -113,8 +113,18 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
 	if (inode && IS_ERR(inode))
 		return ERR_PTR(PTR_ERR(inode));
 
-	if (inode)
+	if (inode) {
+		struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+		struct gfs2_holder gh;
+		int error;
+		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+		if (error) {
+			iput(inode);
+			return ERR_PTR(error);
+		}
+		gfs2_glock_dq_uninit(&gh);
 		return d_splice_alias(inode, dentry);
+	}
 	d_add(dentry, inode);
 
 	return NULL;
-- 
cgit v1.2.3


From ac39aadd0440ae696e6dacaa8006ce1737b17008 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 10 Jan 2008 14:49:43 +0000
Subject: [GFS2] Fix assert in log code

Although the values were all being calculated correctly, there was a
race in the assert due to the way it was using atomic variables. This
changes the value we assert on so that we get the same effect by testing
a different variable. This prevents the assert triggering when it shouldn't.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 91645259e135..161ab6f2058e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -757,7 +757,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	unsigned int reserved;
-	unsigned int old;
+	unsigned int unused;
 
 	gfs2_log_lock(sdp);
 
@@ -769,14 +769,11 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
 	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
 	reserved = calc_reserved(sdp);
-	old = atomic_read(&sdp->sd_log_blks_free);
-	atomic_add(tr->tr_reserved - (reserved - sdp->sd_log_blks_reserved),
-		   &sdp->sd_log_blks_free);
-
-	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) >= old);
+	unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
+	gfs2_assert_withdraw(sdp, unused >= 0);
+	atomic_add(unused, &sdp->sd_log_blks_free);
 	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
 			     sdp->sd_jdesc->jd_blocks);
-
 	sdp->sd_log_blks_reserved = reserved;
 
 	gfs2_log_unlock(sdp);
-- 
cgit v1.2.3


From 6dbd822487d0a9f14432cb4680415b80656b63a2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 10 Jan 2008 15:18:55 +0000
Subject: [GFS2] Reduce inode size by moving i_alloc out of line

It is possible to reduce the size of GFS2 inodes by taking the i_alloc
structure out of the gfs2_inode. This patch allocates the i_alloc
structure whenever it's needed, and frees it afterward. This decreases
the amount of low memory we use at the expense of requiring a memory
allocation for each page or partial page that we write. A quick test
with postmark shows that the overhead is not measurable and I also note
that OCFS2 uses the same approach.

In the future I'd like to solve the problem by shrinking down the size
of the members of the i_alloc structure, but for now, this reduces the
immediate problem of using too much low-memory on x86 and doesn't add
too much overhead.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        |  4 ++--
 fs/gfs2/dir.c         |  4 ++--
 fs/gfs2/eattr.c       |  2 +-
 fs/gfs2/incore.h      |  2 +-
 fs/gfs2/inode.c       |  7 ++++---
 fs/gfs2/main.c        |  1 +
 fs/gfs2/ops_address.c |  5 ++---
 fs/gfs2/ops_file.c    |  6 ++++--
 fs/gfs2/ops_inode.c   |  8 ++++----
 fs/gfs2/quota.c       | 12 ++++++------
 fs/gfs2/rgrp.c        | 20 +++++++++-----------
 fs/gfs2/rgrp.h        |  4 +++-
 12 files changed, 39 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 224114166529..73dfad70de66 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -683,7 +683,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (metadata)
 		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
 
-	error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+	error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
 	if (error)
 		return error;
 
@@ -785,7 +785,7 @@ out_rg_gunlock:
 out_rlist:
 	gfs2_rlist_free(&rlist);
 out:
-	gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+	gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
 	return error;
 }
 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9949bb746a52..57e2ed932adc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1876,7 +1876,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	if (error)
 		goto out;
 
-	error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+	error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
 	if (error)
 		goto out_qs;
 
@@ -1949,7 +1949,7 @@ out_rg_gunlock:
 	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
 out_rlist:
 	gfs2_rlist_free(&rlist);
-	gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+	gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
 out_qs:
 	gfs2_quota_unhold(dip);
 out:
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 2a7435b5c4dc..bee99704ea10 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -1418,7 +1418,7 @@ out:
 static int ea_dealloc_block(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd;
 	struct buffer_head *dibh;
 	int error;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 745dada4085c..4cdda1a3e12c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -264,7 +264,7 @@ struct gfs2_inode {
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
-	struct gfs2_alloc i_alloc;
+	struct gfs2_alloc *i_alloc;
 	u64 i_last_rg_alloc;
 
 	spinlock_t i_spin;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 53bca9978fb5..c84764ad82b3 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -711,9 +711,10 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	int error;
 
-	gfs2_alloc_get(dip);
+	if (gfs2_alloc_get(dip) == NULL)
+		return -ENOMEM;
 
-	dip->i_alloc.al_requested = RES_DINODE;
+	dip->i_alloc->al_requested = RES_DINODE;
 	error = gfs2_inplace_reserve(dip);
 	if (error)
 		goto out;
@@ -900,7 +901,7 @@ fail_end_trans:
 	gfs2_trans_end(sdp);
 
 fail_ipreserv:
-	if (dip->i_alloc.al_rgd)
+	if (dip->i_alloc->al_rgd)
 		gfs2_inplace_release(dip);
 
 fail_quota_locks:
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 653fd5a6203a..88686fcdfb1b 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -31,6 +31,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
 	inode_init_once(&ip->i_inode);
 	spin_lock_init(&ip->i_spin);
 	init_rwsem(&ip->i_rw_mutex);
+	ip->i_alloc = NULL;
 }
 
 static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 37406a379e7a..38dbe99a30ed 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -646,7 +646,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	if (error)
 		goto out_unlock;
 
-	ip->i_alloc.al_requested = 0;
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 
@@ -823,7 +822,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *dibh;
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_dinode *di;
 	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned int to = from + len;
@@ -861,7 +860,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	brelse(dibh);
 	gfs2_trans_end(sdp);
 failed:
-	if (al->al_requested) {
+	if (al) {
 		gfs2_inplace_release(ip);
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 597f7ff2bc11..d7f4726ae0ce 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -364,9 +364,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
 	if (ret || !alloc_required)
 		goto out_unlock;
-
-	ip->i_alloc.al_requested = 0;
+	ret = -ENOMEM;
 	al = gfs2_alloc_get(ip);
+	if (al == NULL)
+		goto out_unlock;
+
 	ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
 	if (ret)
 		goto out_alloc_put;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 8386ab323e33..9f71372c1757 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -61,7 +61,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
 		inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
 		if (!IS_ERR(inode)) {
 			gfs2_trans_end(sdp);
-			if (dip->i_alloc.al_rgd)
+			if (dip->i_alloc->al_rgd)
 				gfs2_inplace_release(dip);
 			gfs2_quota_unlock(dip);
 			gfs2_alloc_put(dip);
@@ -376,7 +376,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	gfs2_trans_end(sdp);
-	if (dip->i_alloc.al_rgd)
+	if (dip->i_alloc->al_rgd)
 		gfs2_inplace_release(dip);
 	gfs2_quota_unlock(dip);
 	gfs2_alloc_put(dip);
@@ -452,7 +452,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
 
 	gfs2_trans_end(sdp);
-	if (dip->i_alloc.al_rgd)
+	if (dip->i_alloc->al_rgd)
 		gfs2_inplace_release(dip);
 	gfs2_quota_unlock(dip);
 	gfs2_alloc_put(dip);
@@ -558,7 +558,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	}
 
 	gfs2_trans_end(sdp);
-	if (dip->i_alloc.al_rgd)
+	if (dip->i_alloc->al_rgd)
 		gfs2_inplace_release(dip);
 	gfs2_quota_unlock(dip);
 	gfs2_alloc_put(dip);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 60cc50fe15b4..a08dabd6ce90 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -453,7 +453,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
 int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_quota_data **qd = al->al_qd;
 	int error;
 
@@ -501,7 +501,7 @@ out:
 void gfs2_quota_unhold(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	unsigned int x;
 
 	gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
@@ -853,7 +853,7 @@ fail:
 int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	unsigned int x;
 	int error = 0;
 
@@ -921,7 +921,7 @@ static int need_sync(struct gfs2_quota_data *qd)
 
 void gfs2_quota_unlock(struct gfs2_inode *ip)
 {
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_quota_data *qda[4];
 	unsigned int count = 0;
 	unsigned int x;
@@ -969,7 +969,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
 int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_quota_data *qd;
 	s64 value;
 	unsigned int x;
@@ -1013,7 +1013,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 		       u32 uid, u32 gid)
 {
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_quota_data *qd;
 	unsigned int x;
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 68c4bf363c46..3552110b2e5f 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -819,11 +819,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 
 struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
 {
-	struct gfs2_alloc *al = &ip->i_alloc;
-
-	/* FIXME: Should assert that the correct locks are held here... */
-	memset(al, 0, sizeof(*al));
-	return al;
+	BUG_ON(ip->i_alloc != NULL);
+	ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL);
+	return ip->i_alloc;
 }
 
 /**
@@ -1061,7 +1059,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	struct inode *inode = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	int flags = LM_FLAG_TRY;
 	int skipped = 0;
 	int loops = 0;
@@ -1176,7 +1174,7 @@ out:
 int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct inode *inode;
 	int error = 0;
 	u64 last_unlinked = NO_BLOCK;
@@ -1222,7 +1220,7 @@ try_again:
 void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 
 	if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
 		fs_warn(sdp, "al_alloced = %u, al_requested = %u "
@@ -1412,7 +1410,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 u64 gfs2_alloc_data(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
@@ -1457,7 +1455,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_alloc *al = ip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 goal, blk;
 	u64 block;
@@ -1503,7 +1501,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	struct gfs2_alloc *al = &dip->i_alloc;
+	struct gfs2_alloc *al = dip->i_alloc;
 	struct gfs2_rgrpd *rgd = al->al_rgd;
 	u32 blk;
 	u64 block;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4c6adfc6f2e..149bb161f4b6 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -32,7 +32,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
 struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 {
-	return; /* So we can see where ip->i_alloc is used */
+	BUG_ON(ip->i_alloc == NULL);
+	kfree(ip->i_alloc);
+	ip->i_alloc = NULL;
 }
 
 int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
-- 
cgit v1.2.3


From 598278bd4808ed81b0e6fa445458a7d549f72a32 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Fri, 11 Jan 2008 13:31:12 -0600
Subject: [GFS2] Remove unneeded i_spin

This patch removes a vestigial variable "i_spin" from the gfs2_inode
structure.  This not only saves us memory (>300000 of these in memory
for the oom test), but it also saves us time because we don't have to
spend time initializing it (i.e. slightly better performance).

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 1 -
 fs/gfs2/main.c   | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4cdda1a3e12c..513aaf0dc0ab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -267,7 +267,6 @@ struct gfs2_inode {
 	struct gfs2_alloc *i_alloc;
 	u64 i_last_rg_alloc;
 
-	spinlock_t i_spin;
 	struct rw_semaphore i_rw_mutex;
 };
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 88686fcdfb1b..9c7765c12d62 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -29,7 +29,6 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
 	struct gfs2_inode *ip = foo;
 
 	inode_init_once(&ip->i_inode);
-	spin_lock_init(&ip->i_spin);
 	init_rwsem(&ip->i_rw_mutex);
 	ip->i_alloc = NULL;
 }
-- 
cgit v1.2.3


From 05220535196d413db434527a3edcba79b7187df8 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Fri, 11 Jan 2008 13:44:50 -0600
Subject: [GFS2] gfs2_alloc_required performance

This is a small I/O performance enhancement to gfs2.  (Actually, it is a rework of
an earlier version I got wrong).  The idea here is to check if the write extends
past the last block in the file.  If so, the function can save itself a lot of
time and trouble because it knows an allocate will be required.  Benchmarks like
iozone should see better performance.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 73dfad70de66..4356cc2fb3f5 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1224,6 +1224,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 		unsigned int shift = sdp->sd_sb.sb_bsize_shift;
 		lblock = offset >> shift;
 		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+		if (lblock_stop > ip->i_di.di_blocks) { /* writing past the
+							   last block */
+			*alloc_required = 1;
+			return 0;
+		}
 	}
 
 	for (; lblock < lblock_stop; lblock += extlen) {
-- 
cgit v1.2.3


From 1af535727bbf68e1da7ac232de47315da4c66ade Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 16 Jan 2008 14:24:05 +0000
Subject: [GFS2] Fix write alloc required shortcut calculation

The comparison was being made against the wrong quantity.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4356cc2fb3f5..e4effc47abfc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1222,10 +1222,10 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 		do_div(lblock_stop, bsize);
 	} else {
 		unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+		u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
 		lblock = offset >> shift;
 		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-		if (lblock_stop > ip->i_di.di_blocks) { /* writing past the
-							   last block */
+		if (lblock_stop > end_of_file) {
 			*alloc_required = 1;
 			return 0;
 		}
-- 
cgit v1.2.3


From 3e5cd0877e6d2f059dc36b8206cb7e93938151db Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 16 Jan 2008 08:45:39 -0600
Subject: [GFS2] Fix typo

This patch fixes a minor typo.  Surprisingly, it still compiled.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/meta_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3144d35a6261..85aea27b4a86 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -284,7 +284,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 		return;
 	}
 
-	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL),
+	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
 	bd->bd_bh = bh;
 	bd->bd_gl = gl;
 
-- 
cgit v1.2.3


From b7fe2e391ee7b711d6dfd6a694d60c4f21113cbb Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 17 Jan 2008 15:12:03 +0000
Subject: [GFS2] Fix page_mkwrite truncation race path

There was a bug in the truncation/invalidation race path for
->page_mkwrite for gfs2. It ought to return 0 so that the effect is the
same as if the page was truncated at any of the other points at which
the page_lock is dropped. This will result in the restart of the whole
page fault path. If it was due to a real truncation (as opposed to an
invalidate because we let a glock go) then the ->fault path will pick
that up when it gets called again.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index d7f4726ae0ce..f4842f2548cd 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -394,6 +394,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
 	if (page->index > last_index)
 		goto out_unlock_page;
+	ret = 0;
 	if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
 		goto out_unlock_page;
 	if (gfs2_is_stuffed(ip)) {
-- 
cgit v1.2.3


From 1b8177ec1e779bcc3ed89419ff7c80dbc3dcc489 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Sat, 19 Jan 2008 21:50:24 -0600
Subject: [GFS2] Lockup on error

I spotted this bug while I was digging around.  Looks like it could cause
a lockup in some rare error condition.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c84764ad82b3..728d3169e7bd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -860,7 +860,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 
 	error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
 	if (alloc_required < 0)
-		goto fail;
+		goto fail_quota_locks;
 	if (alloc_required) {
 		error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
 		if (error)
-- 
cgit v1.2.3


From 7bc5c414fe6627ec518c82d154c796f0981f5b02 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Fri, 18 Jan 2008 14:06:37 -0600
Subject: [GFS2] Allow journal recovery on read-only mount

This patch allows gfs2 to perform journal recovery even if it is mounted
read-only. Strictly speaking, a read-only mount should not be writing to
the filesystem, but we do this only to perform journal recovery. A
read-only mount will fail if we don't recover the dirty journal. Also,
when gfs2 is used as a root filesystem, it will be mounted read-only
before being mounted read-write during the boot sequence. A failed
read-only mount will panic the machine during bootup.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/recovery.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 27c994f2d1f0..b249e294a95b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -504,13 +504,21 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 			if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
 				ro = 1;
 		} else {
-			if (sdp->sd_vfs->s_flags & MS_RDONLY)
-				ro = 1;
+			if (sdp->sd_vfs->s_flags & MS_RDONLY) {
+				/* check if device itself is read-only */
+				ro = bdev_read_only(sdp->sd_vfs->s_bdev);
+				if (!ro) {
+					fs_info(sdp, "recovery required on "
+						"read-only filesystem.\n");
+					fs_info(sdp, "write access will be "
+						"enabled during recovery.\n");
+				}
+			}
 		}
 
 		if (ro) {
-			fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
-				jd->jd_jid);
+			fs_warn(sdp, "jid=%u: Can't replay: read-only block "
+				"device\n", jd->jd_jid);
 			error = -EROFS;
 			goto fail_gunlock_tr;
 		}
-- 
cgit v1.2.3


From 366781c19635d861f43ff5e03388a3873ec912d9 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 25 Jan 2008 10:12:41 +0000
Subject: [CIFS] DFS build fixes

Also includes a few minor changes suggested by Christoph

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 14 ++++++++------
 fs/cifs/cifsglob.h     | 41 ++++++++++++++++++++++++++++++++---------
 fs/cifs/cifsproto.h    |  4 ++--
 fs/cifs/connect.c      | 12 ++++++++----
 fs/cifs/dns_resolve.c  |  5 +++--
 fs/cifs/link.c         | 16 ++++++++--------
 6 files changed, 61 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 15e31f8435ba..413ee2349d1a 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -3,8 +3,9 @@
  *   traversal via DFS junction point
  *
  *   Copyright (c) 2007 Igor Mammedov
+ *   Copyright (C) International Business Machines  Corp., 2008
  *   Author(s): Igor Mammedov (niallain@gmail.com)
- *
+ *		Steve French (sfrench@us.ibm.com)
  *   This program is free software; you can redistribute it and/or
  *   modify it under the terms of the GNU General Public License
  *   as published by the Free Software Foundation; either version
@@ -107,8 +108,9 @@ static char *cifs_get_share_name(const char *node_name)
  * Returns: pointer to new mount options or ERR_PTR.
  * Caller is responcible for freeing retunrned value if it is not error.
  */
-char *compose_mount_options(const char *sb_mountdata, const char *ref_unc,
-				char **devname)
+static char *compose_mount_options(const char *sb_mountdata,
+				   const char *ref_unc,
+				   char **devname)
 {
 	int rc;
 	char *mountdata;
@@ -188,13 +190,13 @@ compose_mount_options_out:
 }
 
 
-struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
+static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 		struct dentry *dentry, char *ref_unc)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct vfsmount *mnt;
 	char *mountdata;
-	char *devname;
+	char *devname = NULL;
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
 	mountdata = compose_mount_options(cifs_sb->mountdata,
@@ -278,7 +280,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 	return err;
 }
 
-void dump_referral(const struct dfs_info3_param *ref)
+static void dump_referral(const struct dfs_info3_param *ref)
 {
 	cFYI(1, ("DFS: ref path: %s", ref->path_name));
 	cFYI(1, ("DFS: node path: %s", ref->node_name));
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1fde2197ad76..5d32d8ddc82e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifsglob.h
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Jeremy Allison (jra@samba.org)
  *
@@ -69,14 +69,6 @@
 #define XATTR_DOS_ATTRIB "user.DOSATTRIB"
 #endif
 
-/*
- * This information is kept on every Server we know about.
- *
- * Some things to note:
- *
- */
-#define SERVER_NAME_LEN_WITH_NULL	(SERVER_NAME_LENGTH + 1)
-
 /*
  * CIFS vfs client Status information (based on what we know.)
  */
@@ -460,6 +452,37 @@ struct dir_notify_req {
        struct file *pfile;
 };
 
+struct dfs_info3_param {
+	int flags; /* DFSREF_REFERRAL_SERVER, DFSREF_STORAGE_SERVER*/
+	int PathConsumed;
+	int server_type;
+	int ref_flag;
+	char *path_name;
+	char *node_name;
+};
+
+static inline void free_dfs_info_param(struct dfs_info3_param *param)
+{
+	if (param) {
+		kfree(param->path_name);
+		kfree(param->node_name);
+		kfree(param);
+	}
+}
+
+static inline void free_dfs_info_array(struct dfs_info3_param *param,
+				       int number_of_items)
+{
+	int i;
+	if ((number_of_items == 0) || (param == NULL))
+		return;
+	for (i = 0; i < number_of_items; i++) {
+		kfree(param[i].path_name);
+		kfree(param[i].node_name);
+	}
+	kfree(param);
+}
+
 #define   MID_FREE 0
 #define   MID_REQUEST_ALLOCATED 1
 #define   MID_REQUEST_SUBMITTED 2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index aaaf748f6a26..2f09f565a3d9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifsproto.h
  *
- *   Copyright (c) International Business Machines  Corp., 2002,2007
+ *   Copyright (c) International Business Machines  Corp., 2002,2008
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -156,7 +156,7 @@ extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
 			const char *old_path,
 			const struct nls_table *nls_codepage,
 			unsigned int *pnum_referrals,
-			unsigned char **preferrals,
+			struct dfs_info3_param **preferrals,
 			int remap);
 extern void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 				 struct super_block *sb, struct smb_vol *vol);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index db3746c891b5..65d0ba72e78f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/connect.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -1410,7 +1410,7 @@ connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
 		    const char *old_path, const struct nls_table *nls_codepage,
 		    int remap)
 {
-	unsigned char *referrals = NULL;
+	struct dfs_info3_param *referrals = NULL;
 	unsigned int num_referrals;
 	int rc = 0;
 
@@ -1429,12 +1429,14 @@ connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
-	     unsigned char **preferrals, int remap)
+	     struct dfs_info3_param **preferrals, int remap)
 {
 	char *temp_unc;
 	int rc = 0;
+	unsigned char *targetUNCs;
 
 	*pnum_referrals = 0;
+	*preferrals = NULL;
 
 	if (pSesInfo->ipc_tid == 0) {
 		temp_unc = kmalloc(2 /* for slashes */ +
@@ -1454,8 +1456,10 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 		kfree(temp_unc);
 	}
 	if (rc == 0)
-		rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals,
+		rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, &targetUNCs,
 				     pnum_referrals, nls_codepage, remap);
+	/* BB map targetUNCs to dfs_info3 structures, here or
+		in CIFSGetDFSRefer BB */
 
 	return rc;
 }
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 777a086abd6f..ef7f43824347 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -64,13 +64,14 @@ struct key_type key_type_dns_resolver = {
  * return 0 on success
  */
 int
-dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) {
+dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
+{
 	int rc = -EAGAIN;
 	struct key *rkey;
 	char *name;
 	int len;
 
-	if ((!ip_addr) || (!unc))
+	if (!ip_addr || !unc)
 		return -EINVAL;
 
 	/* search for server name delimiter */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 11f265726db7..1d6fb01b8e6d 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/link.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2003
+ *   Copyright (C) International Business Machines  Corp., 2002,2008
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -236,8 +236,6 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 	char *full_path = NULL;
 	char *tmp_path = NULL;
 	char *tmpbuffer;
-	unsigned char *referrals = NULL;
-	unsigned int num_referrals = 0;
 	int len;
 	__u16 fid;
 
@@ -297,8 +295,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 				cFYI(1, ("Error closing junction point "
 					 "(open for ioctl)"));
 			}
+			/* BB unwind this long, nested function, or remove BB */
 			if (rc == -EIO) {
 				/* Query if DFS Junction */
+				unsigned int num_referrals = 0;
+				struct dfs_info3_param *refs = NULL;
 				tmp_path =
 					kmalloc(MAX_TREE_SIZE + MAX_PATHCONF + 1,
 						GFP_KERNEL);
@@ -310,7 +311,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 					rc = get_dfs_path(xid, pTcon->ses,
 						tmp_path,
 						cifs_sb->local_nls,
-						&num_referrals, &referrals,
+						&num_referrals, &refs,
 						cifs_sb->mnt_cifs_flags &
 						    CIFS_MOUNT_MAP_SPECIAL_CHR);
 					cFYI(1, ("Get DFS for %s rc = %d ",
@@ -320,14 +321,13 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 					else {
 						cFYI(1, ("num referral: %d",
 							num_referrals));
-						if (referrals) {
-							cFYI(1,("referral string: %s", referrals));
+						if (refs && refs->path_name) {
 							strncpy(tmpbuffer,
-								referrals,
+								refs->path_name,
 								len-1);
 						}
 					}
-					kfree(referrals);
+					kfree(refs);
 					kfree(tmp_path);
 }
 				/* BB add code like else decode referrals
-- 
cgit v1.2.3


From e260be673a15b6125068270e0216a3bfbfc12f87 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:24 +0100
Subject: Preempt-RCU: implementation

This patch implements a new version of RCU which allows its read-side
critical sections to be preempted. It uses a set of counter pairs
to keep track of the read-side critical sections and flips them
when all tasks exit their read-side critical sections. The details
of this implementation can be found in this paper -

	http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf

and the article-

	http://lwn.net/Articles/253651/

This patch was developed as a part of the -rt kernel development and
meant to provide better latencies when read-side critical sections of
RCU don't disable preemption.  As a consequence of keeping track of RCU
readers, the readers have a slight overhead (optimizations in the paper).
This implementation co-exists with the "classic" RCU implementations
and can be switched to at compile time.

Also includes RCU tracing summarized in debugfs.

[ akpm@linux-foundation.org: build fixes on non-preempt architectures ]

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 781b47d2f9f2..b4799efaf9e8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -2130,4 +2130,3 @@ source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 
 endmenu
-
-- 
cgit v1.2.3


From 9745512ce79de686df354dc70a8d1a74d801892d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: sched: latencytop support

LatencyTOP kernel infrastructure; it measures latencies in the
scheduler and tracks them system-wide and per process.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/base.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7411bfb0b7cc..91fa8e6ce8ad 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 }
 #endif
 
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+	int i;
+	struct task_struct *task = m->private;
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < 32; i++) {
+		if (task->latency_record[i].backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li ",
+				task->latency_record[i].count,
+				task->latency_record[i].time,
+				task->latency_record[i].max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				char sym[KSYM_NAME_LEN];
+				char *c;
+				if (!task->latency_record[i].backtrace[q])
+					break;
+				if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+					break;
+				sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+				c = strchr(sym, '+');
+				if (c)
+					*c = 0;
+				seq_printf(m, "%s ", sym);
+			}
+			seq_printf(m, "\n");
+		}
+
+	}
+	return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct task_struct *task = get_proc_task(inode);
+
+	ret = single_open(file, lstats_show_proc, NULL);
+	if (!ret) {
+		m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	struct seq_file *m;
+	struct task_struct *task;
+
+	m = file->private_data;
+	task = m->private;
+	clear_all_latency_tracing(task);
+
+	return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
 /* The badness from the OOM killer */
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
 };
 #endif
 
+
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * Print out various scheduling related per-task fields:
@@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset",     S_IRUGO, cpuset),
 #endif
@@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset",    S_IRUGO, cpuset),
 #endif
-- 
cgit v1.2.3


From 6561168cb442be8d2769dce663870b6a28573e16 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 7 Sep 2007 11:11:10 -0700
Subject: ocfs2_dlm: Call node eviction callbacks from heartbeat handler

With this, a dlm client can take advantage of the group protocol in the dlm
to get full notification whenever a node within the dlm domain leaves
unexpectedly.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf91434..b10f3e313fbf 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
 	if (!dlm_grab(dlm))
 		return;
 
+	/*
+	 * This will notify any dlm users that a node in our domain
+	 * went away without notifying us first.
+	 */
+	if (test_bit(idx, dlm->domain_map))
+		dlm_fire_domain_eviction_callbacks(dlm, idx);
+
 	spin_lock(&dlm->spinlock);
 	__dlm_hb_node_down(dlm, idx);
 	spin_unlock(&dlm->spinlock);
-- 
cgit v1.2.3


From 6f7b056ea9c6fa978c79ca626eff43549df94dbb Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 24 Sep 2007 15:09:41 -0700
Subject: ocfs2: Remove fs dependency on ocfs2_heartbeat module

Now that the dlm exposes domain information to us, we don't need generic
node up / node down callbacks. And since the DLM is only telling us when a
node goes down unexpectedly, we no longer need to optimize away node down
callbacks via the umount map.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/heartbeat.c | 73 ----------------------------------------------------
 fs/ocfs2/heartbeat.h |  2 --
 fs/ocfs2/super.c     |  8 ------
 3 files changed, 83 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240d..6239fc52790c 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
 #include <linux/highmem.h>
 #include <linux/kmod.h>
 
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-
 #include <dlm/dlmapi.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
@@ -48,9 +45,6 @@
 
 #include "buffer_head_io.h"
 
-#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
-#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
-
 static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
 					    int bit);
 static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -87,26 +81,11 @@ static void ocfs2_do_node_down(int node_num,
 		return;
 	}
 
-	if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
-		/* If a node is in the umount map, then we've been
-		 * expecting him to go down and we know ahead of time
-		 * that recovery is not necessary. */
-		ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-		return;
-	}
-
 	ocfs2_recovery_thread(osb, node_num);
 
 	ocfs2_remove_node_from_vote_queues(osb, node_num);
 }
 
-static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
-				  int node_num,
-				  void *data)
-{
-	ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
-}
-
 /* Called from the dlm when it's about to evict a node. We may also
  * get a heartbeat callback later. */
 static void ocfs2_dlm_eviction_cb(int node_num,
@@ -121,27 +100,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
 	ocfs2_do_node_down(node_num, osb);
 }
 
-static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
-				int node_num,
-				void *data)
-{
-	struct ocfs2_super *osb = data;
-
-	BUG_ON(osb->node_num == node_num);
-
-	mlog(0, "node up event for %d\n", node_num);
-	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-
 void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
 {
-	o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
-			    ocfs2_hb_node_down_cb, osb,
-			    OCFS2_HB_NODE_DOWN_PRI);
-
-	o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
-			    ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
-
 	/* Not exactly a heartbeat callback, but leads to essentially
 	 * the same path so we set it up here. */
 	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +109,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
 			      osb);
 }
 
-/* Most functions here are just stubs for now... */
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
-{
-	int status;
-
-	if (ocfs2_mount_local(osb))
-		return 0;
-
-	status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
-	if (status < 0) {
-		mlog_errno(status);
-		o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
-	}
-
-bail:
-	return status;
-}
-
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
-{
-	if (ocfs2_mount_local(osb))
-		return;
-
-	o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
-	o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
-}
-
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 {
 	int ret;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e4..56859211888a 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
 
 void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
 void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
 
 /* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5ee775420665..64b81b341ece 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1117,12 +1117,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
 		goto leave;
 	}
 
-	status = ocfs2_register_hb_callbacks(osb);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	status = ocfs2_dlm_init(osb);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1260,8 +1254,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 		ocfs2_dlm_shutdown(osb);
 	}
 
-	ocfs2_clear_hb_callbacks(osb);
-
 	debugfs_remove(osb->osb_debug_root);
 
 	if (!mnt_err)
-- 
cgit v1.2.3


From 34d024f84345807bf44163fac84e921513dde323 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 24 Sep 2007 15:56:19 -0700
Subject: ocfs2: Remove mount/unmount votes

The node maps that are set/unset by these votes are no longer relevant, thus
we can remove the mount and umount votes. Since those are the last two
remaining votes, we can also remove the entire vote infrastructure.

The vote thread has been renamed to the downconvert thread, and the small
amount of functionality related to managing it has been moved into
fs/ocfs2/dlmglue.c. All references to votes have been removed or updated.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/Makefile               |   3 +-
 fs/ocfs2/cluster/tcp_internal.h |   5 +-
 fs/ocfs2/dcache.c               |   8 +-
 fs/ocfs2/dlmglue.c              | 164 +++++++--
 fs/ocfs2/dlmglue.h              |   5 +-
 fs/ocfs2/heartbeat.c            |   7 -
 fs/ocfs2/inode.c                |  36 +-
 fs/ocfs2/journal.c              |  15 +-
 fs/ocfs2/namei.c                |  10 +-
 fs/ocfs2/ocfs2.h                |  25 +-
 fs/ocfs2/slot_map.c             |  19 -
 fs/ocfs2/slot_map.h             |   2 -
 fs/ocfs2/super.c                |  43 +--
 fs/ocfs2/vote.c                 | 756 ----------------------------------------
 fs/ocfs2/vote.h                 |  48 ---
 15 files changed, 179 insertions(+), 967 deletions(-)
 delete mode 100644 fs/ocfs2/vote.c
 delete mode 100644 fs/ocfs2/vote.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b0..d2057e7fbda7 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -27,8 +27,7 @@ ocfs2-objs := \
 	symlink.o 		\
 	sysfile.o 		\
 	uptodate.o		\
-	ver.o 			\
-	vote.o
+	ver.o
 
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89d..79bd6665b3ca 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
  * locking semantics of the file system using the protocol.  It should 
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 9:
+ * 	- All votes removed
+ *
  * New in version 8:
  * 	- Replace delete inode votes with a cluster lock
  *
@@ -60,7 +63,7 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 8ULL
+#define O2NET_PROTOCOL_VERSION 9ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 9923278ea6d4..b1cc7c381e88 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
 /*
  * Walk the inode alias list, and find a dentry which has a given
  * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
- * is looking for a dentry_lock reference. The vote thread is looking
- * to unhash aliases, so we allow it to skip any that already have
- * that property.
+ * is looking for a dentry_lock reference. The downconvert thread is
+ * looking to unhash aliases, so we allow it to skip any that already
+ * have that property.
  */
 struct dentry *ocfs2_find_local_alias(struct inode *inode,
 				      u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	dl->dl_count = 0;
 	/*
 	 * Does this have to happen below, for all attaches, in case
-	 * the struct inode gets blown away by votes?
+	 * the struct inode gets blown away by the downconvert thread?
 	 */
 	dl->dl_inode = igrab(inode);
 	dl->dl_parent_blkno = parent_blkno;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8f..b3068ade3f7b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
-#include "vote.h"
 
 #include "buffer_head_io.h"
 
@@ -153,10 +152,10 @@ struct ocfs2_lock_res_ops {
 	struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
 
 	/*
-	 * Optionally called in the downconvert (or "vote") thread
-	 * after a successful downconvert. The lockres will not be
-	 * referenced after this callback is called, so it is safe to
-	 * free memory, etc.
+	 * Optionally called in the downconvert thread after a
+	 * successful downconvert. The lockres will not be referenced
+	 * after this callback is called, so it is safe to free
+	 * memory, etc.
 	 *
 	 * The exact semantics of when this is called are controlled
 	 * by ->downconvert_worker()
@@ -310,8 +309,9 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 		"resource %s: %s\n", dlm_errname(_stat), _func,	\
 		_lockres->l_name, dlm_errmsg(_stat));		\
 } while (0)
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
-				 struct ocfs2_lock_res *lockres);
+static int ocfs2_downconvert_thread(void *arg);
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
 static int ocfs2_meta_lock_update(struct inode *inode,
 				  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
@@ -732,7 +732,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 
 	wake_up(&lockres->l_event);
 
-	ocfs2_kick_vote_thread(osb);
+	ocfs2_wake_downconvert_thread(osb);
 }
 
 static void ocfs2_locking_ast(void *opaque)
@@ -1089,7 +1089,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
 	mlog_entry_void();
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	ocfs2_dec_holders(lockres, level);
-	ocfs2_vote_on_unlock(osb, lockres);
+	ocfs2_downconvert_on_unlock(osb, lockres);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 	mlog_exit_void();
 }
@@ -1372,15 +1372,15 @@ int ocfs2_data_lock_with_page(struct inode *inode,
 	return ret;
 }
 
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
-				 struct ocfs2_lock_res *lockres)
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
 {
 	int kick = 0;
 
 	mlog_entry_void();
 
 	/* If we know that another node is waiting on our lock, kick
-	 * the vote thread * pre-emptively when we reach a release
+	 * the downconvert thread pre-emptively when we reach a release
 	 * condition. */
 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
 		switch(lockres->l_blocking) {
@@ -1398,7 +1398,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
 	}
 
 	if (kick)
-		ocfs2_kick_vote_thread(osb);
+		ocfs2_wake_downconvert_thread(osb);
 
 	mlog_exit_void();
 }
@@ -1832,19 +1832,20 @@ bail:
 }
 
 /*
- * This is working around a lock inversion between tasks acquiring DLM locks
- * while holding a page lock and the vote thread which blocks dlm lock acquiry
- * while acquiring page locks.
+ * This is working around a lock inversion between tasks acquiring DLM
+ * locks while holding a page lock and the downconvert thread which
+ * blocks dlm lock acquiry while acquiring page locks.
  *
  * ** These _with_page variantes are only intended to be called from aop
  * methods that hold page locks and return a very specific *positive* error
  * code that aop methods pass up to the VFS -- test for errors with != 0. **
  *
- * The DLM is called such that it returns -EAGAIN if it would have blocked
- * waiting for the vote thread.  In that case we unlock our page so the vote
- * thread can make progress.  Once we've done this we have to return
- * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
- * into the VFS who will then immediately retry the aop call.
+ * The DLM is called such that it returns -EAGAIN if it would have
+ * blocked waiting for the downconvert thread.  In that case we unlock
+ * our page so the downconvert thread can make progress.  Once we've
+ * done this we have to return AOP_TRUNCATED_PAGE so the aop method
+ * that called us can bubble that back up into the VFS who will then
+ * immediately retry the aop call.
  *
  * We do a blocking lock and immediate unlock before returning, though, so that
  * the lock has a great chance of being cached on this node by the time the VFS
@@ -2320,11 +2321,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	/* launch vote thread */
-	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
-	if (IS_ERR(osb->vote_task)) {
-		status = PTR_ERR(osb->vote_task);
-		osb->vote_task = NULL;
+	/* launch downconvert thread */
+	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+	if (IS_ERR(osb->dc_task)) {
+		status = PTR_ERR(osb->dc_task);
+		osb->dc_task = NULL;
 		mlog_errno(status);
 		goto bail;
 	}
@@ -2353,8 +2354,8 @@ local:
 bail:
 	if (status < 0) {
 		ocfs2_dlm_shutdown_debug(osb);
-		if (osb->vote_task)
-			kthread_stop(osb->vote_task);
+		if (osb->dc_task)
+			kthread_stop(osb->dc_task);
 	}
 
 	mlog_exit(status);
@@ -2369,9 +2370,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
 
 	ocfs2_drop_osb_locks(osb);
 
-	if (osb->vote_task) {
-		kthread_stop(osb->vote_task);
-		osb->vote_task = NULL;
+	if (osb->dc_task) {
+		kthread_stop(osb->dc_task);
+		osb->dc_task = NULL;
 	}
 
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2528,7 @@ out:
 
 /* Mark the lockres as being dropped. It will no longer be
  * queued if blocking, but we still may have to wait on it
- * being dequeued from the vote thread before we can consider
+ * being dequeued from the downconvert thread before we can consider
  * it safe to drop. 
  *
  * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2903,7 +2904,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
 
 /*
  * Does the final reference drop on our dentry lock. Right now this
- * happens in the vote thread, but we could choose to simplify the
+ * happens in the downconvert thread, but we could choose to simplify the
  * dlmglue API and push these off to the ocfs2_wq in the future.
  */
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3043,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 	mlog(0, "lockres %s blocked.\n", lockres->l_name);
 
 	/* Detect whether a lock has been marked as going away while
-	 * the vote thread was processing other things. A lock can
+	 * the downconvert thread was processing other things. A lock can
 	 * still be marked with OCFS2_LOCK_FREEING after this check,
 	 * but short circuiting here will still save us some
 	 * performance. */
@@ -3091,13 +3092,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 
 	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
 
-	spin_lock(&osb->vote_task_lock);
+	spin_lock(&osb->dc_task_lock);
 	if (list_empty(&lockres->l_blocked_list)) {
 		list_add_tail(&lockres->l_blocked_list,
 			      &osb->blocked_lock_list);
 		osb->blocked_lock_count++;
 	}
-	spin_unlock(&osb->vote_task_lock);
+	spin_unlock(&osb->dc_task_lock);
 
 	mlog_exit_void();
 }
+
+static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
+{
+	unsigned long processed;
+	struct ocfs2_lock_res *lockres;
+
+	mlog_entry_void();
+
+	spin_lock(&osb->dc_task_lock);
+	/* grab this early so we know to try again if a state change and
+	 * wake happens part-way through our work  */
+	osb->dc_work_sequence = osb->dc_wake_sequence;
+
+	processed = osb->blocked_lock_count;
+	while (processed) {
+		BUG_ON(list_empty(&osb->blocked_lock_list));
+
+		lockres = list_entry(osb->blocked_lock_list.next,
+				     struct ocfs2_lock_res, l_blocked_list);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock(&osb->dc_task_lock);
+
+		BUG_ON(!processed);
+		processed--;
+
+		ocfs2_process_blocked_lock(osb, lockres);
+
+		spin_lock(&osb->dc_task_lock);
+	}
+	spin_unlock(&osb->dc_task_lock);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
+{
+	int empty = 0;
+
+	spin_lock(&osb->dc_task_lock);
+	if (list_empty(&osb->blocked_lock_list))
+		empty = 1;
+
+	spin_unlock(&osb->dc_task_lock);
+	return empty;
+}
+
+static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
+{
+	int should_wake = 0;
+
+	spin_lock(&osb->dc_task_lock);
+	if (osb->dc_work_sequence != osb->dc_wake_sequence)
+		should_wake = 1;
+	spin_unlock(&osb->dc_task_lock);
+
+	return should_wake;
+}
+
+int ocfs2_downconvert_thread(void *arg)
+{
+	int status = 0;
+	struct ocfs2_super *osb = arg;
+
+	/* only quit once we've been asked to stop and there is no more
+	 * work available */
+	while (!(kthread_should_stop() &&
+		ocfs2_downconvert_thread_lists_empty(osb))) {
+
+		wait_event_interruptible(osb->dc_event,
+					 ocfs2_downconvert_thread_should_wake(osb) ||
+					 kthread_should_stop());
+
+		mlog(0, "downconvert_thread: awoken\n");
+
+		ocfs2_downconvert_thread_do_work(osb);
+	}
+
+	osb->dc_task = NULL;
+	return status;
+}
+
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->dc_task_lock);
+	/* make sure the downconvert thread gets a swipe at whatever
+	 * state changes the caller may have made */
+	osb->dc_wake_sequence++;
+	spin_unlock(&osb->dc_task_lock);
+	wake_up(&osb->dc_event);
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e41205..931f6ee55146 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -54,7 +54,7 @@ struct ocfs2_meta_lvb {
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
 /* Instruct the dlm not to queue ourselves on the other node. */
 #define OCFS2_META_LOCK_NOQUEUE		(0x02)
-/* don't block waiting for the vote thread, instead return -EAGAIN */
+/* don't block waiting for the downconvert thread, instead return -EAGAIN */
 #define OCFS2_LOCK_NONBLOCK		(0x04)
 
 int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -112,9 +112,10 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 			       struct ocfs2_lock_res *lockres);
 
-/* for the vote thread */
+/* for the downconvert thread */
 void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 				struct ocfs2_lock_res *lockres);
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 6239fc52790c..c0efd9489fe8 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -41,7 +41,6 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
-#include "vote.h"
 
 #include "buffer_head_io.h"
 
@@ -58,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
 {
 	spin_lock_init(&osb->node_map_lock);
-	ocfs2_node_map_init(&osb->mounted_map);
 	ocfs2_node_map_init(&osb->recovery_map);
-	ocfs2_node_map_init(&osb->umount_map);
 	ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
@@ -82,8 +79,6 @@ static void ocfs2_do_node_down(int node_num,
 	}
 
 	ocfs2_recovery_thread(osb, node_num);
-
-	ocfs2_remove_node_from_vote_queues(osb, node_num);
 }
 
 /* Called from the dlm when it's about to evict a node. We may also
@@ -268,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
 
 	spin_lock(&osb->node_map_lock);
 
-	__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
-
 	if (!test_bit(num, osb->recovery_map.map)) {
 	    __ocfs2_node_map_set_bit(&osb->recovery_map, num);
 	    set = 1;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ebb2bbe30f35..86cf073996b5 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
-#include "vote.h"
 
 #include "buffer_head_io.h"
 
@@ -718,8 +717,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	}
 
 	/* we do this while holding the orphan dir lock because we
-	 * don't want recovery being run from another node to vote for
-	 * an inode delete on us -- this will result in two nodes
+	 * don't want recovery being run from another node to try an
+	 * inode delete underneath us -- this will result in two nodes
 	 * truncating the same file! */
 	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
 	if (status < 0) {
@@ -744,7 +743,7 @@ bail:
 }
 
 /* There is a series of simple checks that should be done before a
- * vote is even considered. Encapsulate those in this function. */
+ * trylock is even considered. Encapsulate those in this function. */
 static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 {
 	int ret = 0;
@@ -758,14 +757,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 		goto bail;
 	}
 
-	/* If we're coming from process_vote we can't go into our own
+	/* If we're coming from downconvert_thread we can't go into our own
 	 * voting [hello, deadlock city!], so unforuntately we just
 	 * have to skip deleting this guy. That's OK though because
 	 * the node who's doing the actual deleting should handle it
 	 * anyway. */
-	if (current == osb->vote_task) {
+	if (current == osb->dc_task) {
 		mlog(0, "Skipping delete of %lu because we're currently "
-		     "in process_vote\n", inode->i_ino);
+		     "in downconvert\n", inode->i_ino);
 		goto bail;
 	}
 
@@ -779,10 +778,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 		goto bail_unlock;
 	}
 
-	/* If we have voted "yes" on the wipe of this inode for
-	 * another node, it will be marked here so we can safely skip
-	 * it. Recovery will cleanup any inodes we might inadvertantly
-	 * skip here. */
+	/* If we have allowed a wipe of this inode for another node, it
+	 * will be marked here so we can safely skip it. Recovery will
+	 * cleanup any inodes we might inadvertently skip here. */
 	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
 		mlog(0, "Skipping delete of %lu because another node "
 		     "has done this for us.\n", inode->i_ino);
@@ -929,7 +927,7 @@ void ocfs2_delete_inode(struct inode *inode)
 
 	/* Lock down the inode. This gives us an up to date view of
 	 * it's metadata (for verification), and allows us to
-	 * serialize delete_inode votes. 
+	 * serialize delete_inode on multiple nodes.
 	 *
 	 * Even though we might be doing a truncate, we don't take the
 	 * allocation lock here as it won't be needed - nobody will
@@ -947,15 +945,15 @@ void ocfs2_delete_inode(struct inode *inode)
 	 * before we go ahead and wipe the inode. */
 	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
 	if (!wipe || status < 0) {
-		/* Error and inode busy vote both mean we won't be
+		/* Error and remote inode busy both mean we won't be
 		 * removing the inode, so they take almost the same
 		 * path. */
 		if (status < 0)
 			mlog_errno(status);
 
-		/* Someone in the cluster has voted to not wipe this
-		 * inode, or it was never completely orphaned. Write
-		 * out the pages and exit now. */
+		/* Someone in the cluster has disallowed a wipe of
+		 * this inode, or it was never completely
+		 * orphaned. Write out the pages and exit now. */
 		ocfs2_cleanup_delete_inode(inode, 1);
 		goto bail_unlock_inode;
 	}
@@ -1008,12 +1006,12 @@ void ocfs2_clear_inode(struct inode *inode)
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
 			"Inode=%lu\n", inode->i_ino);
 
-	/* For remove delete_inode vote, we hold open lock before,
-	 * now it is time to unlock PR and EX open locks. */
+	/* To prevent remote deletes we hold open lock before, now it
+	 * is time to unlock PR and EX open locks. */
 	ocfs2_open_unlock(inode);
 
 	/* Do these before all the other work so that we don't bounce
-	 * the vote thread while waiting to destroy the locks. */
+	 * the downconvert thread while waiting to destroy the locks. */
 	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8d81f6c1b877..f2ebe2eb3c21 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
 #include "localalloc.h"
 #include "slot_map.h"
 #include "super.h"
-#include "vote.h"
 #include "sysfile.h"
 
 #include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
 	     journal->j_trans_id, flushed);
 
-	ocfs2_kick_vote_thread(osb);
+	ocfs2_wake_downconvert_thread(osb);
 	wake_up(&journal->j_checkpointed);
 finally:
 	mlog_exit(status);
@@ -883,8 +882,8 @@ restart:
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
-	 * node(s) may have voted "no" on an inode delete earlier. A
-	 * revote is therefore required. */
+	 * node(s) may have disallowed a previous inode delete. Re-processing
+	 * is therefore required. */
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
 					NULL);
 
@@ -1380,10 +1379,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		iter = oi->ip_next_orphan;
 
 		spin_lock(&oi->ip_lock);
-		/* Delete voting may have set these on the assumption
-		 * that the other node would wipe them successfully.
-		 * If they are still in the node's orphan dir, we need
-		 * to reset that state. */
+		/* The remote delete code may have set these on the
+		 * assumption that the other node would wipe them
+		 * successfully.  If they are still in the node's
+		 * orphan dir, we need to reset that state. */
 		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
 
 		/* Set the proper information to get us going into
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 989ac2718587..6295fd6ae469 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
-#include "vote.h"
 
 #include "buffer_head_io.h"
 
@@ -176,7 +175,7 @@ bail_unlock:
 	/* Don't drop the cluster lock until *after* the d_add --
 	 * unlink on another node will message us to remove that
 	 * dentry under this lock so otherwise we can race this with
-	 * the vote thread and have a stale dentry. */
+	 * the downconvert thread and have a stale dentry. */
 	ocfs2_meta_unlock(dir, 0);
 
 bail:
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	status = ocfs2_remote_dentry_delete(dentry);
 	if (status < 0) {
-		/* This vote should succeed under all normal
+		/* This remote delete should succeed under all normal
 		 * circumstances. */
 		mlog_errno(status);
 		goto leave;
@@ -1031,8 +1030,9 @@ static int ocfs2_rename(struct inode *old_dir,
 
 	/*
 	 * Aside from allowing a meta data update, the locking here
-	 * also ensures that the vote thread on other nodes won't have
-	 * to concurrently downconvert the inode and the dentry locks.
+	 * also ensures that the downconvert thread on other nodes
+	 * won't have to concurrently downconvert the inode and the
+	 * dentry locks.
 	 */
 	status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
 	if (status < 0) {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b0..f8f866144c6a 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -189,9 +189,7 @@ struct ocfs2_super
 	struct ocfs2_slot_info *slot_info;
 
 	spinlock_t node_map_lock;
-	struct ocfs2_node_map mounted_map;
 	struct ocfs2_node_map recovery_map;
-	struct ocfs2_node_map umount_map;
 
 	u64 root_blkno;
 	u64 system_dir_blkno;
@@ -254,28 +252,15 @@ struct ocfs2_super
 
 	wait_queue_head_t recovery_event;
 
-	spinlock_t vote_task_lock;
-	struct task_struct *vote_task;
-	wait_queue_head_t vote_event;
-	unsigned long vote_wake_sequence;
-	unsigned long vote_work_sequence;
+	spinlock_t dc_task_lock;
+	struct task_struct *dc_task;
+	wait_queue_head_t dc_event;
+	unsigned long dc_wake_sequence;
+	unsigned long dc_work_sequence;
 
 	struct list_head blocked_lock_list;
 	unsigned long blocked_lock_count;
 
-	struct list_head vote_list;
-	int vote_count;
-
-	u32 net_key;
-	spinlock_t net_response_lock;
-	unsigned int net_response_ids;
-	struct list_head net_response_list;
-
-	struct o2hb_callback_func osb_hb_up;
-	struct o2hb_callback_func osb_hb_down;
-
-	struct list_head	osb_net_handlers;
-
 	wait_queue_head_t		osb_mount_event;
 
 	/* Truncate log info */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cfa..3a50ce555e64 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
 			      s16 slot_num,
 			      s16 node_num);
 
-/* Use the slot information we've collected to create a map of mounted
- * nodes. Should be holding an EX on super block. assumes slot info is
- * up to date. Note that we call this *after* we find a slot, so our
- * own node should be set in the map too... */
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
-{
-	int i;
-	struct ocfs2_slot_info *si = osb->slot_info;
-
-	spin_lock(&si->si_lock);
-
-	for (i = 0; i < si->si_size; i++)
-		if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
-			ocfs2_node_map_set_bit(osb, &osb->mounted_map,
-					      si->si_global_node_nums[i]);
-
-	spin_unlock(&si->si_lock);
-}
-
 /* post the slot information on disk into our slot_info struct. */
 void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
 {
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031b..1025872aaade 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
 void ocfs2_clear_slot(struct ocfs2_slot_info *si,
 		      s16 slot_num);
 
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
-
 static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
 				      int slot_num)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 64b81b341ece..1996820488cc 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
-#include "vote.h"
 
 #include "buffer_head_io.h"
 
@@ -1123,13 +1122,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
 		goto leave;
 	}
 
-	/* requires vote_thread to be running. */
-	status = ocfs2_register_net_handlers(osb);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	status = ocfs2_super_lock(osb, 1);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1144,8 +1136,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
 		goto leave;
 	}
 
-	ocfs2_populate_mounted_map(osb);
-
 	/* load all node-local system inodes */
 	status = ocfs2_init_local_system_inodes(osb);
 	if (status < 0) {
@@ -1168,15 +1158,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	if (ocfs2_mount_local(osb))
 		goto leave;
 
-	/* This should be sent *after* we recovered our journal as it
-	 * will cause other nodes to unmark us as needing
-	 * recovery. However, we need to send it *before* dropping the
-	 * super block lock as otherwise their recovery threads might
-	 * try to clean us up while we're live! */
-	status = ocfs2_request_mount_vote(osb);
-	if (status < 0)
-		mlog_errno(status);
-
 leave:
 	if (unlock_super)
 		ocfs2_super_unlock(osb, 1);
@@ -1234,10 +1215,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 			mlog_errno(tmp);
 			return;
 		}
-
-		tmp = ocfs2_request_umount_vote(osb);
-		if (tmp < 0)
-			mlog_errno(tmp);
 	}
 
 	if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1248,11 +1225,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_release_system_inodes(osb);
 
-	if (osb->dlm) {
-		ocfs2_unregister_net_handlers(osb);
-
+	if (osb->dlm)
 		ocfs2_dlm_shutdown(osb);
-	}
 
 	debugfs_remove(osb->osb_debug_root);
 
@@ -1336,19 +1310,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->s_sectsize_bits = blksize_bits(sector_size);
 	BUG_ON(!osb->s_sectsize_bits);
 
-	osb->net_response_ids = 0;
-	spin_lock_init(&osb->net_response_lock);
-	INIT_LIST_HEAD(&osb->net_response_list);
-
-	INIT_LIST_HEAD(&osb->osb_net_handlers);
 	init_waitqueue_head(&osb->recovery_event);
-	spin_lock_init(&osb->vote_task_lock);
-	init_waitqueue_head(&osb->vote_event);
-	osb->vote_work_sequence = 0;
-	osb->vote_wake_sequence = 0;
+	spin_lock_init(&osb->dc_task_lock);
+	init_waitqueue_head(&osb->dc_event);
+	osb->dc_work_sequence = 0;
+	osb->dc_wake_sequence = 0;
 	INIT_LIST_HEAD(&osb->blocked_lock_list);
 	osb->blocked_lock_count = 0;
-	INIT_LIST_HEAD(&osb->vote_list);
 	spin_lock_init(&osb->osb_lock);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
@@ -1488,7 +1456,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-	osb->net_key = le32_to_cpu(uuid_net_key);
 
 	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
 	osb->vol_label[63] = '\0';
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2b..000000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.c
- *
- * description here
- *
- * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
-#include <dlm/dlmapi.h>
-
-#define MLOG_MASK_PREFIX ML_VOTE
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "journal.h"
-#include "slot_map.h"
-#include "vote.h"
-
-#include "buffer_head_io.h"
-
-#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
-#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
-struct ocfs2_msg_hdr
-{
-	__be32 h_response_id; /* used to lookup message handle on sending
-			    * node. */
-	__be32 h_request;
-	__be64 h_blkno;
-	__be32 h_generation;
-	__be32 h_node_num;    /* node sending this particular message. */
-};
-
-struct ocfs2_vote_msg
-{
-	struct ocfs2_msg_hdr v_hdr;
-	__be32 v_reserved1;
-} __attribute__ ((packed));
-
-/* Responses are given these values to maintain backwards
- * compatibility with older ocfs2 versions */
-#define OCFS2_RESPONSE_OK		(0)
-#define OCFS2_RESPONSE_BUSY		(-16)
-#define OCFS2_RESPONSE_BAD_MSG		(-22)
-
-struct ocfs2_response_msg
-{
-	struct ocfs2_msg_hdr r_hdr;
-	__be32 r_response;
-} __attribute__ ((packed));
-
-struct ocfs2_vote_work {
-	struct list_head   w_list;
-	struct ocfs2_vote_msg w_msg;
-};
-
-enum ocfs2_vote_request {
-	OCFS2_VOTE_REQ_INVALID = 0,
-	OCFS2_VOTE_REQ_MOUNT,
-	OCFS2_VOTE_REQ_UMOUNT,
-	OCFS2_VOTE_REQ_LAST
-};
-
-static inline int ocfs2_is_valid_vote_request(int request)
-{
-	return OCFS2_VOTE_REQ_INVALID < request &&
-		request < OCFS2_VOTE_REQ_LAST;
-}
-
-typedef void (*ocfs2_net_response_callback)(void *priv,
-					    struct ocfs2_response_msg *resp);
-struct ocfs2_net_response_cb {
-	ocfs2_net_response_callback	rc_cb;
-	void				*rc_priv;
-};
-
-struct ocfs2_net_wait_ctxt {
-	struct list_head        n_list;
-	u32                     n_response_id;
-	wait_queue_head_t       n_event;
-	struct ocfs2_node_map   n_node_map;
-	int                     n_response; /* an agreggate response. 0 if
-					     * all nodes are go, < 0 on any
-					     * negative response from any
-					     * node or network error. */
-	struct ocfs2_net_response_cb *n_callback;
-};
-
-static void ocfs2_process_mount_request(struct ocfs2_super *osb,
-					unsigned int node_num)
-{
-	mlog(0, "MOUNT vote from node %u\n", node_num);
-	/* The other node only sends us this message when he has an EX
-	 * on the superblock, so our recovery threads (if having been
-	 * launched) are waiting on it.*/
-	ocfs2_recovery_map_clear(osb, node_num);
-	ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
-
-	/* We clear the umount map here because a node may have been
-	 * previously mounted, safely unmounted but never stopped
-	 * heartbeating - in which case we'd have a stale entry. */
-	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-
-static void ocfs2_process_umount_request(struct ocfs2_super *osb,
-					 unsigned int node_num)
-{
-	mlog(0, "UMOUNT vote from node %u\n", node_num);
-	ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
-	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
-}
-
-static void ocfs2_process_vote(struct ocfs2_super *osb,
-			       struct ocfs2_vote_msg *msg)
-{
-	int net_status, vote_response;
-	unsigned int node_num;
-	u64 blkno;
-	enum ocfs2_vote_request request;
-	struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
-	struct ocfs2_response_msg response;
-
-	/* decode the network mumbo jumbo into local variables. */
-	request = be32_to_cpu(hdr->h_request);
-	blkno = be64_to_cpu(hdr->h_blkno);
-	node_num = be32_to_cpu(hdr->h_node_num);
-
-	mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
-	     request, (unsigned long long)blkno, node_num);
-
-	if (!ocfs2_is_valid_vote_request(request)) {
-		mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
-		     request, node_num);
-		vote_response = OCFS2_RESPONSE_BAD_MSG;
-		goto respond;
-	}
-
-	vote_response = OCFS2_RESPONSE_OK;
-
-	switch (request) {
-	case OCFS2_VOTE_REQ_UMOUNT:
-		ocfs2_process_umount_request(osb, node_num);
-		goto respond;
-	case OCFS2_VOTE_REQ_MOUNT:
-		ocfs2_process_mount_request(osb, node_num);
-		goto respond;
-	default:
-		/* avoids a gcc warning */
-		break;
-	}
-
-respond:
-	/* Response struture is small so we just put it on the stack
-	 * and stuff it inline. */
-	memset(&response, 0, sizeof(struct ocfs2_response_msg));
-	response.r_hdr.h_response_id = hdr->h_response_id;
-	response.r_hdr.h_blkno = hdr->h_blkno;
-	response.r_hdr.h_generation = hdr->h_generation;
-	response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
-	response.r_response = cpu_to_be32(vote_response);
-
-	net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
-					osb->net_key,
-					&response,
-					sizeof(struct ocfs2_response_msg),
-					node_num,
-					NULL);
-	/* We still want to error print for ENOPROTOOPT here. The
-	 * sending node shouldn't have unregistered his net handler
-	 * without sending an unmount vote 1st */
-	if (net_status < 0
-	    && net_status != -ETIMEDOUT
-	    && net_status != -ENOTCONN)
-		mlog(ML_ERROR, "message to node %u fails with error %d!\n",
-		     node_num, net_status);
-}
-
-static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
-{
-	unsigned long processed;
-	struct ocfs2_lock_res *lockres;
-	struct ocfs2_vote_work *work;
-
-	mlog_entry_void();
-
-	spin_lock(&osb->vote_task_lock);
-	/* grab this early so we know to try again if a state change and
-	 * wake happens part-way through our work  */
-	osb->vote_work_sequence = osb->vote_wake_sequence;
-
-	processed = osb->blocked_lock_count;
-	while (processed) {
-		BUG_ON(list_empty(&osb->blocked_lock_list));
-
-		lockres = list_entry(osb->blocked_lock_list.next,
-				     struct ocfs2_lock_res, l_blocked_list);
-		list_del_init(&lockres->l_blocked_list);
-		osb->blocked_lock_count--;
-		spin_unlock(&osb->vote_task_lock);
-
-		BUG_ON(!processed);
-		processed--;
-
-		ocfs2_process_blocked_lock(osb, lockres);
-
-		spin_lock(&osb->vote_task_lock);
-	}
-
-	while (osb->vote_count) {
-		BUG_ON(list_empty(&osb->vote_list));
-		work = list_entry(osb->vote_list.next,
-				  struct ocfs2_vote_work, w_list);
-		list_del(&work->w_list);
-		osb->vote_count--;
-		spin_unlock(&osb->vote_task_lock);
-
-		ocfs2_process_vote(osb, &work->w_msg);
-		kfree(work);
-
-		spin_lock(&osb->vote_task_lock);
-	}
-	spin_unlock(&osb->vote_task_lock);
-
-	mlog_exit_void();
-}
-
-static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
-{
-	int empty = 0;
-
-	spin_lock(&osb->vote_task_lock);
-	if (list_empty(&osb->blocked_lock_list) &&
-	    list_empty(&osb->vote_list))
-		empty = 1;
-
-	spin_unlock(&osb->vote_task_lock);
-	return empty;
-}
-
-static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
-{
-	int should_wake = 0;
-
-	spin_lock(&osb->vote_task_lock);
-	if (osb->vote_work_sequence != osb->vote_wake_sequence)
-		should_wake = 1;
-	spin_unlock(&osb->vote_task_lock);
-
-	return should_wake;
-}
-
-int ocfs2_vote_thread(void *arg)
-{
-	int status = 0;
-	struct ocfs2_super *osb = arg;
-
-	/* only quit once we've been asked to stop and there is no more
-	 * work available */
-	while (!(kthread_should_stop() &&
-		 ocfs2_vote_thread_lists_empty(osb))) {
-
-		wait_event_interruptible(osb->vote_event,
-					 ocfs2_vote_thread_should_wake(osb) ||
-					 kthread_should_stop());
-
-		mlog(0, "vote_thread: awoken\n");
-
-		ocfs2_vote_thread_do_work(osb);
-	}
-
-	osb->vote_task = NULL;
-	return status;
-}
-
-static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
-{
-	struct ocfs2_net_wait_ctxt *w;
-
-	w = kzalloc(sizeof(*w), GFP_NOFS);
-	if (!w) {
-		mlog_errno(-ENOMEM);
-		goto bail;
-	}
-
-	INIT_LIST_HEAD(&w->n_list);
-	init_waitqueue_head(&w->n_event);
-	ocfs2_node_map_init(&w->n_node_map);
-	w->n_response_id = response_id;
-	w->n_callback = NULL;
-bail:
-	return w;
-}
-
-static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
-{
-	unsigned int ret;
-
-	spin_lock(&osb->net_response_lock);
-	ret = ++osb->net_response_ids;
-	spin_unlock(&osb->net_response_lock);
-
-	return ret;
-}
-
-static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
-					struct ocfs2_net_wait_ctxt *w)
-{
-	spin_lock(&osb->net_response_lock);
-	list_del(&w->n_list);
-	spin_unlock(&osb->net_response_lock);
-}
-
-static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
-				      struct ocfs2_net_wait_ctxt *w)
-{
-	spin_lock(&osb->net_response_lock);
-	list_add_tail(&w->n_list,
-		      &osb->net_response_list);
-	spin_unlock(&osb->net_response_lock);
-}
-
-static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
-					struct ocfs2_net_wait_ctxt *w,
-					int node_num)
-{
-	assert_spin_locked(&osb->net_response_lock);
-
-	ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
-	if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
-		wake_up(&w->n_event);
-}
-
-/* Intended to be called from the node down callback, we fake remove
- * the node from all our response contexts */
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
-					int node_num)
-{
-	struct list_head *p;
-	struct ocfs2_net_wait_ctxt *w = NULL;
-
-	spin_lock(&osb->net_response_lock);
-
-	list_for_each(p, &osb->net_response_list) {
-		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-
-		__ocfs2_mark_node_responded(osb, w, node_num);
-	}
-
-	spin_unlock(&osb->net_response_lock);
-}
-
-static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
-				struct ocfs2_vote_msg *request,
-				unsigned int response_id,
-				int *response,
-				struct ocfs2_net_response_cb *callback)
-{
-	int status, i, remote_err;
-	struct ocfs2_net_wait_ctxt *w = NULL;
-	int dequeued = 0;
-
-	mlog_entry_void();
-
-	w = ocfs2_new_net_wait_ctxt(response_id);
-	if (!w) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
-	w->n_callback = callback;
-
-	/* we're pretty much ready to go at this point, and this fills
-	 * in n_response which we need anyway... */
-	ocfs2_queue_net_wait_ctxt(osb, w);
-
-	i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
-
-	while (i != O2NM_INVALID_NODE_NUM) {
-		if (i != osb->node_num) {
-			mlog(0, "trying to send request to node %i\n", i);
-			ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
-
-			remote_err = 0;
-			status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
-						    osb->net_key,
-						    request,
-						    sizeof(*request),
-						    i,
-						    &remote_err);
-			if (status == -ETIMEDOUT) {
-				mlog(0, "remote node %d timed out!\n", i);
-				status = -EAGAIN;
-				goto bail;
-			}
-			if (remote_err < 0) {
-				status = remote_err;
-				mlog(0, "remote error %d on node %d!\n",
-				     remote_err, i);
-				mlog_errno(status);
-				goto bail;
-			}
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-		}
-		i++;
-		i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
-		mlog(0, "next is %d, i am %d\n", i, osb->node_num);
-	}
-	mlog(0, "done sending, now waiting on responses...\n");
-
-	wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
-
-	ocfs2_dequeue_net_wait_ctxt(osb, w);
-	dequeued = 1;
-
-	*response = w->n_response;
-	status = 0;
-bail:
-	if (w) {
-		if (!dequeued)
-			ocfs2_dequeue_net_wait_ctxt(osb, w);
-		kfree(w);
-	}
-
-	mlog_exit(status);
-	return status;
-}
-
-static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
-						      u64 blkno,
-						      unsigned int generation,
-						      enum ocfs2_vote_request type)
-{
-	struct ocfs2_vote_msg *request;
-	struct ocfs2_msg_hdr *hdr;
-
-	BUG_ON(!ocfs2_is_valid_vote_request(type));
-
-	request = kzalloc(sizeof(*request), GFP_NOFS);
-	if (!request) {
-		mlog_errno(-ENOMEM);
-	} else {
-		hdr = &request->v_hdr;
-		hdr->h_node_num = cpu_to_be32(osb->node_num);
-		hdr->h_request = cpu_to_be32(type);
-		hdr->h_blkno = cpu_to_be64(blkno);
-		hdr->h_generation = cpu_to_be32(generation);
-	}
-
-	return request;
-}
-
-/* Complete the buildup of a new vote request and process the
- * broadcast return value. */
-static int ocfs2_do_request_vote(struct ocfs2_super *osb,
-				 struct ocfs2_vote_msg *request,
-				 struct ocfs2_net_response_cb *callback)
-{
-	int status, response = -EBUSY;
-	unsigned int response_id;
-	struct ocfs2_msg_hdr *hdr;
-
-	response_id = ocfs2_new_response_id(osb);
-
-	hdr = &request->v_hdr;
-	hdr->h_response_id = cpu_to_be32(response_id);
-
-	status = ocfs2_broadcast_vote(osb, request, response_id, &response,
-				      callback);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = response;
-bail:
-
-	return status;
-}
-
-int ocfs2_request_mount_vote(struct ocfs2_super *osb)
-{
-	int status;
-	struct ocfs2_vote_msg *request = NULL;
-
-	request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
-	if (!request) {
-		status = -ENOMEM;
-		goto bail;
-	}
-
-	status = -EAGAIN;
-	while (status == -EAGAIN) {
-		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
-		    signal_pending(current)) {
-			status = -ERESTARTSYS;
-			goto bail;
-		}
-
-		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-					   osb->node_num)) {
-			status = 0;
-			goto bail;
-		}
-
-		status = ocfs2_do_request_vote(osb, request, NULL);
-	}
-
-bail:
-	kfree(request);
-	return status;
-}
-
-int ocfs2_request_umount_vote(struct ocfs2_super *osb)
-{
-	int status;
-	struct ocfs2_vote_msg *request = NULL;
-
-	request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
-	if (!request) {
-		status = -ENOMEM;
-		goto bail;
-	}
-
-	status = -EAGAIN;
-	while (status == -EAGAIN) {
-		/* Do not check signals on this vote... We really want
-		 * this one to go all the way through. */
-
-		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
-					   osb->node_num)) {
-			status = 0;
-			goto bail;
-		}
-
-		status = ocfs2_do_request_vote(osb, request, NULL);
-	}
-
-bail:
-	kfree(request);
-	return status;
-}
-
-/* TODO: This should eventually be a hash table! */
-static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
-							       u32 response_id)
-{
-	struct list_head *p;
-	struct ocfs2_net_wait_ctxt *w = NULL;
-
-	list_for_each(p, &osb->net_response_list) {
-		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-		if (response_id == w->n_response_id)
-			break;
-		w = NULL;
-	}
-
-	return w;
-}
-
-/* Translate response codes into local node errno values */
-static inline int ocfs2_translate_response(int response)
-{
-	int ret;
-
-	switch (response) {
-	case OCFS2_RESPONSE_OK:
-		ret = 0;
-		break;
-
-	case OCFS2_RESPONSE_BUSY:
-		ret = -EBUSY;
-		break;
-
-	default:
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-
-static int ocfs2_handle_response_message(struct o2net_msg *msg,
-					 u32 len,
-					 void *data, void **ret_data)
-{
-	unsigned int response_id, node_num;
-	int response_status;
-	struct ocfs2_super *osb = data;
-	struct ocfs2_response_msg *resp;
-	struct ocfs2_net_wait_ctxt * w;
-	struct ocfs2_net_response_cb *resp_cb;
-
-	resp = (struct ocfs2_response_msg *) msg->buf;
-
-	response_id = be32_to_cpu(resp->r_hdr.h_response_id);
-	node_num = be32_to_cpu(resp->r_hdr.h_node_num);
-	response_status = 
-		ocfs2_translate_response(be32_to_cpu(resp->r_response));
-
-	mlog(0, "received response message:\n");
-	mlog(0, "h_response_id = %u\n", response_id);
-	mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
-	mlog(0, "h_blkno = %llu\n",
-	     (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
-	mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
-	mlog(0, "h_node_num = %u\n", node_num);
-	mlog(0, "r_response = %d\n", response_status);
-
-	spin_lock(&osb->net_response_lock);
-	w = __ocfs2_find_net_wait_ctxt(osb, response_id);
-	if (!w) {
-		mlog(0, "request not found!\n");
-		goto bail;
-	}
-	resp_cb = w->n_callback;
-
-	if (response_status && (!w->n_response)) {
-		/* we only really need one negative response so don't
-		 * set it twice. */
-		w->n_response = response_status;
-	}
-
-	if (resp_cb) {
-		spin_unlock(&osb->net_response_lock);
-
-		resp_cb->rc_cb(resp_cb->rc_priv, resp);
-
-		spin_lock(&osb->net_response_lock);
-	}
-
-	__ocfs2_mark_node_responded(osb, w, node_num);
-bail:
-	spin_unlock(&osb->net_response_lock);
-
-	return 0;
-}
-
-static int ocfs2_handle_vote_message(struct o2net_msg *msg,
-				     u32 len,
-				     void *data, void **ret_data)
-{
-	int status;
-	struct ocfs2_super *osb = data;
-	struct ocfs2_vote_work *work;
-
-	work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
-	if (!work) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	INIT_LIST_HEAD(&work->w_list);
-	memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
-
-	mlog(0, "scheduling vote request:\n");
-	mlog(0, "h_response_id = %u\n",
-	     be32_to_cpu(work->w_msg.v_hdr.h_response_id));
-	mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
-	mlog(0, "h_blkno = %llu\n",
-	     (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
-	mlog(0, "h_generation = %u\n",
-	     be32_to_cpu(work->w_msg.v_hdr.h_generation));
-	mlog(0, "h_node_num = %u\n",
-	     be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-
-	spin_lock(&osb->vote_task_lock);
-	list_add_tail(&work->w_list, &osb->vote_list);
-	osb->vote_count++;
-	spin_unlock(&osb->vote_task_lock);
-
-	ocfs2_kick_vote_thread(osb);
-
-	status = 0;
-bail:
-	return status;
-}
-
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
-{
-	if (!osb->net_key)
-		return;
-
-	o2net_unregister_handler_list(&osb->osb_net_handlers);
-
-	if (!list_empty(&osb->net_response_list))
-		mlog(ML_ERROR, "net response list not empty!\n");
-
-	osb->net_key = 0;
-}
-
-int ocfs2_register_net_handlers(struct ocfs2_super *osb)
-{
-	int status = 0;
-
-	if (ocfs2_mount_local(osb))
-		return 0;
-
-	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
-					osb->net_key,
-					sizeof(struct ocfs2_response_msg),
-					ocfs2_handle_response_message,
-					osb, NULL, &osb->osb_net_handlers);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
-					osb->net_key,
-					sizeof(struct ocfs2_vote_msg),
-					ocfs2_handle_vote_message,
-					osb, NULL, &osb->osb_net_handlers);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
-bail:
-	if (status < 0)
-		ocfs2_unregister_net_handlers(osb);
-
-	return status;
-}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
deleted file mode 100644
index 9ea46f62de31..000000000000
--- a/fs/ocfs2/vote.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.h
- *
- * description here
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-
-#ifndef VOTE_H
-#define VOTE_H
-
-int ocfs2_vote_thread(void *arg);
-static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
-{
-	spin_lock(&osb->vote_task_lock);
-	/* make sure the voting thread gets a swipe at whatever changes
-	 * the caller may have made to the voting state */
-	osb->vote_wake_sequence++;
-	spin_unlock(&osb->vote_task_lock);
-	wake_up(&osb->vote_event);
-}
-
-int ocfs2_request_mount_vote(struct ocfs2_super *osb);
-int ocfs2_request_umount_vote(struct ocfs2_super *osb);
-int ocfs2_register_net_handlers(struct ocfs2_super *osb);
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
-					int node_num);
-#endif
-- 
cgit v1.2.3


From f1f540688eae66c274ff1c1133b5d9c687b28f58 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 18 Oct 2007 15:13:59 -0700
Subject: ocfs2: Add data downconvert worker to inode lock

In order to extend inode lock coverage to inode data, we use the same data
downconvert worker with only a small modification to only do work for
regular files.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlmglue.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b3068ade3f7b..7e36abea8f40 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -228,6 +228,7 @@ static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
 	.get_osb	= ocfs2_get_inode_osb,
 	.check_downconvert = ocfs2_check_meta_downconvert,
 	.set_lvb	= ocfs2_set_meta_lvb,
+	.downconvert_worker = ocfs2_data_convert_worker,
 	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
 };
 
@@ -2851,6 +2852,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
        	inode = ocfs2_lock_res_inode(lockres);
 	mapping = inode->i_mapping;
 
+	if (!S_ISREG(inode->i_mode))
+		goto out;
+
 	/*
 	 * We need this before the filemap_fdatawrite() so that it can
 	 * transfer the dirty bit from the PTE to the
@@ -2876,6 +2880,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
 		filemap_fdatawait(mapping);
 	}
 
+out:
 	return UNBLOCK_CONTINUE;
 }
 
-- 
cgit v1.2.3


From c934a92d05b549dd2f25db72c5fc3cb9dcf1b611 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 18 Oct 2007 15:23:46 -0700
Subject: ocfs2: Remove data locks

The meta lock now covers both meta data and data, so this just removes the
now-redundant data lock.

Combining locks saves us a round of lock mastery per inode and leaves one
less lock to ping between nodes during read/write.

We don't lose much - since meta locks were always held before a data lock
(and at the same level) ordered writeout mode (the default) ensured that
flushing for the meta data lock also pushed out data anyways.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c                 |  44 +----------------
 fs/ocfs2/cluster/tcp_internal.h |   5 +-
 fs/ocfs2/dlmglue.c              | 104 ----------------------------------------
 fs/ocfs2/dlmglue.h              |  11 +----
 fs/ocfs2/file.c                 |  55 ++++++---------------
 fs/ocfs2/inode.c                |   6 ---
 fs/ocfs2/inode.h                |   1 -
 fs/ocfs2/mmap.c                 |   9 ----
 fs/ocfs2/super.c                |   1 -
 9 files changed, 22 insertions(+), 214 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56f7790cad46..5fc27cfaee50 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -305,21 +305,12 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 		goto out_alloc;
 	}
 
-	ret = ocfs2_data_lock_with_page(inode, 0, page);
-	if (ret != 0) {
-		if (ret == AOP_TRUNCATED_PAGE)
-			unlock = 0;
-		mlog_errno(ret);
-		goto out_alloc;
-	}
-
 	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		ret = ocfs2_readpage_inline(inode, page);
 	else
 		ret = block_read_full_page(page, ocfs2_get_block);
 	unlock = 0;
 
-	ocfs2_data_unlock(inode, 0);
 out_alloc:
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 out_meta_unlock:
@@ -638,34 +629,12 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-		/*
-		 * We get PR data locks even for O_DIRECT.  This
-		 * allows concurrent O_DIRECT I/O but doesn't let
-		 * O_DIRECT with extending and buffered zeroing writes
-		 * race.  If they did race then the buffered zeroing
-		 * could be written back after the O_DIRECT I/O.  It's
-		 * one thing to tell people not to mix buffered and
-		 * O_DIRECT writes, but expecting them to understand
-		 * that file extension is also an implicit buffered
-		 * write is too much.  By getting the PR we force
-		 * writeback of the buffered zeroing before
-		 * proceeding.
-		 */
-		ret = ocfs2_data_lock(inode, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-		ocfs2_data_unlock(inode, 0);
-	}
-
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
 					    nr_segs, 
 					    ocfs2_direct_IO_get_blocks,
 					    ocfs2_dio_end_io);
-out:
+
 	mlog_exit(ret);
 	return ret;
 }
@@ -1769,25 +1738,17 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_data_lock(inode, 1);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_fail;
-	}
-
 	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
 				       fsdata, di_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_fail_data;
+		goto out_fail;
 	}
 
 	brelse(di_bh);
 
 	return 0;
 
-out_fail_data:
-	ocfs2_data_unlock(inode, 1);
 out_fail:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
@@ -1908,7 +1869,6 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
 
-	ocfs2_data_unlock(inode, 1);
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 1);
 
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 79bd6665b3ca..b2e832aca567 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
  * locking semantics of the file system using the protocol.  It should 
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 10:
+ * 	- Meta/data locks combined
+ *
  * New in version 9:
  * 	- All votes removed
  *
@@ -63,7 +66,7 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 9ULL
+#define O2NET_PROTOCOL_VERSION 10ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7e36abea8f40..ecf58c6e2fa3 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -232,12 +232,6 @@ static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
 	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
 };
 
-static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
-	.get_osb	= ocfs2_get_inode_osb,
-	.downconvert_worker = ocfs2_data_convert_worker,
-	.flags		= 0,
-};
-
 static struct ocfs2_lock_res_ops ocfs2_super_lops = {
 	.flags		= LOCK_TYPE_REQUIRES_REFRESH,
 };
@@ -261,7 +255,6 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
-		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
 		lockres->l_type == OCFS2_LOCK_TYPE_RW ||
 		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
@@ -405,9 +398,6 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
 		case OCFS2_LOCK_TYPE_META:
 			ops = &ocfs2_inode_meta_lops;
 			break;
-		case OCFS2_LOCK_TYPE_DATA:
-			ops = &ocfs2_inode_data_lops;
-			break;
 		case OCFS2_LOCK_TYPE_OPEN:
 			ops = &ocfs2_inode_open_lops;
 			break;
@@ -1154,12 +1144,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 		goto bail;
 	}
 
-	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
-	if (ret) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
 	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
 	if (ret) {
 		mlog_errno(ret);
@@ -1312,67 +1296,6 @@ out:
 	mlog_exit_void();
 }
 
-int ocfs2_data_lock_full(struct inode *inode,
-			 int write,
-			 int arg_flags)
-{
-	int status = 0, level;
-	struct ocfs2_lock_res *lockres;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	BUG_ON(!inode);
-
-	mlog_entry_void();
-
-	mlog(0, "inode %llu take %s DATA lock\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     write ? "EXMODE" : "PRMODE");
-
-	/* We'll allow faking a readonly data lock for
-	 * rodevices. */
-	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
-		if (write) {
-			status = -EROFS;
-			mlog_errno(status);
-		}
-		goto out;
-	}
-
-	if (ocfs2_mount_local(osb))
-		goto out;
-
-	lockres = &OCFS2_I(inode)->ip_data_lockres;
-
-	level = write ? LKM_EXMODE : LKM_PRMODE;
-
-	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
-				    0, arg_flags);
-	if (status < 0 && status != -EAGAIN)
-		mlog_errno(status);
-
-out:
-	mlog_exit(status);
-	return status;
-}
-
-/* see ocfs2_meta_lock_with_page() */
-int ocfs2_data_lock_with_page(struct inode *inode,
-			      int write,
-			      struct page *page)
-{
-	int ret;
-
-	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
-	if (ret == -EAGAIN) {
-		unlock_page(page);
-		if (ocfs2_data_lock(inode, write) == 0)
-			ocfs2_data_unlock(inode, write);
-		ret = AOP_TRUNCATED_PAGE;
-	}
-
-	return ret;
-}
-
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres)
 {
@@ -1404,26 +1327,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 	mlog_exit_void();
 }
 
-void ocfs2_data_unlock(struct inode *inode,
-		       int write)
-{
-	int level = write ? LKM_EXMODE : LKM_PRMODE;
-	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	mlog_entry_void();
-
-	mlog(0, "inode %llu drop %s DATA lock\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     write ? "EXMODE" : "PRMODE");
-
-	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
-	    !ocfs2_mount_local(osb))
-		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
-	mlog_exit_void();
-}
-
 #define OCFS2_SEC_BITS   34
 #define OCFS2_SEC_SHIFT  (64 - 34)
 #define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
@@ -2591,13 +2494,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 
 	status = err;
 
-	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-			      &OCFS2_I(inode)->ip_data_lockres);
-	if (err < 0)
-		mlog_errno(err);
-	if (err < 0 && !status)
-		status = err;
-
 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
 			      &OCFS2_I(inode)->ip_meta_lockres);
 	if (err < 0)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 931f6ee55146..3fd7729daeef 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,7 +49,7 @@ struct ocfs2_meta_lvb {
 	__be32       lvb_reserved2;
 };
 
-/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* ocfs2_meta_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
 /* Instruct the dlm not to queue ourselves on the other node. */
@@ -69,15 +69,6 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
-int ocfs2_data_lock_full(struct inode *inode,
-			 int write,
-			 int arg_flags);
-#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
-int ocfs2_data_lock_with_page(struct inode *inode,
-			      int write,
-			      struct page *page);
-void ocfs2_data_unlock(struct inode *inode,
-		       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_open_lock(struct inode *inode);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b75b2e1f0e42..c5c183ac41fe 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -382,18 +382,13 @@ static int ocfs2_truncate_file(struct inode *inode,
 
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	/* This forces other nodes to sync and drop their pages. Do
-	 * this even if we have a truncate without allocation change -
-	 * ocfs2 cluster sizes can be much greater than page size, so
-	 * we have to truncate them anyway.  */
-	status = ocfs2_data_lock(inode, 1);
-	if (status < 0) {
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-
-		mlog_errno(status);
-		goto bail;
-	}
-
+	/*
+	 * The inode lock forced other nodes to sync and drop their
+	 * pages, which (correctly) happens even if we have a truncate
+	 * without allocation change - ocfs2 cluster sizes can be much
+	 * greater than page size, so we have to truncate them
+	 * anyway.
+	 */
 	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 	truncate_inode_pages(inode->i_mapping, new_i_size);
 
@@ -403,7 +398,7 @@ static int ocfs2_truncate_file(struct inode *inode,
 		if (status)
 			mlog_errno(status);
 
-		goto bail_unlock_data;
+		goto bail_unlock_sem;
 	}
 
 	/* alright, we're going to need to do a full blown alloc size
@@ -413,25 +408,23 @@ static int ocfs2_truncate_file(struct inode *inode,
 	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail_unlock_data;
+		goto bail_unlock_sem;
 	}
 
 	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail_unlock_data;
+		goto bail_unlock_sem;
 	}
 
 	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail_unlock_data;
+		goto bail_unlock_sem;
 	}
 
 	/* TODO: orphan dir cleanup here. */
-bail_unlock_data:
-	ocfs2_data_unlock(inode, 1);
-
+bail_unlock_sem:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 bail:
@@ -917,7 +910,7 @@ static int ocfs2_extend_file(struct inode *inode,
 			     struct buffer_head *di_bh,
 			     u64 new_i_size)
 {
-	int ret = 0, data_locked = 0;
+	int ret = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	BUG_ON(!di_bh);
@@ -943,20 +936,6 @@ static int ocfs2_extend_file(struct inode *inode,
 	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
 		goto out_update_size;
 
-	/* 
-	 * protect the pages that ocfs2_zero_extend is going to be
-	 * pulling into the page cache.. we do this before the
-	 * metadata extend so that we don't get into the situation
-	 * where we've extended the metadata but can't get the data
-	 * lock to zero.
-	 */
-	ret = ocfs2_data_lock(inode, 1);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-	data_locked = 1;
-
 	/*
 	 * The alloc sem blocks people in read/write from reading our
 	 * allocation until we're done changing it. We depend on
@@ -980,7 +959,7 @@ static int ocfs2_extend_file(struct inode *inode,
 			up_write(&oi->ip_alloc_sem);
 
 			mlog_errno(ret);
-			goto out_unlock;
+			goto out;
 		}
 	}
 
@@ -991,7 +970,7 @@ static int ocfs2_extend_file(struct inode *inode,
 
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto out_unlock;
+		goto out;
 	}
 
 out_update_size:
@@ -999,10 +978,6 @@ out_update_size:
 	if (ret < 0)
 		mlog_errno(ret);
 
-out_unlock:
-	if (data_locked)
-		ocfs2_data_unlock(inode, 1);
-
 out:
 	return ret;
 }
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 86cf073996b5..8ff201d3705e 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -332,10 +332,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 				  OCFS2_LOCK_TYPE_RW, inode->i_generation,
 				  inode);
 
-	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
-				  OCFS2_LOCK_TYPE_DATA, inode->i_generation,
-				  inode);
-
 	ocfs2_set_inode_flags(inode);
 
 	status = 0;
@@ -1014,7 +1010,6 @@ void ocfs2_clear_inode(struct inode *inode)
 	 * the downconvert thread while waiting to destroy the locks. */
 	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
 	/* We very well may get a clear_inode before all an inodes
@@ -1038,7 +1033,6 @@ void ocfs2_clear_inode(struct inode *inode)
 
 	ocfs2_lock_res_free(&oi->ip_rw_lockres);
 	ocfs2_lock_res_free(&oi->ip_meta_lockres);
-	ocfs2_lock_res_free(&oi->ip_data_lockres);
 	ocfs2_lock_res_free(&oi->ip_open_lockres);
 
 	ocfs2_metadata_cache_purge(inode);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c55536..d1c54da687c9 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -35,7 +35,6 @@ struct ocfs2_inode_info
 
 	struct ocfs2_lock_res		ip_rw_lockres;
 	struct ocfs2_lock_res		ip_meta_lockres;
-	struct ocfs2_lock_res		ip_data_lockres;
 	struct ocfs2_lock_res		ip_open_lockres;
 
 	/* protects allocation changes on this inode. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d298..a7f0ccc6fdd8 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -181,17 +181,8 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_data_lock(inode, 1);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_meta_unlock;
-	}
-
 	ret = __ocfs2_page_mkwrite(inode, di_bh, page);
 
-	ocfs2_data_unlock(inode, 1);
-
-out_meta_unlock:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 	brelse(di_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1996820488cc..064eba074f1e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1020,7 +1020,6 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
 
 	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
-	ocfs2_lock_res_init_once(&oi->ip_data_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
 	ocfs2_metadata_cache_init(&oi->vfs_inode);
-- 
cgit v1.2.3


From e63aecb651ba73dffc62f9608ee1b7ae2a0ffd4b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 18 Oct 2007 15:30:42 -0700
Subject: ocfs2: Rename ocfs2_meta_[un]lock

Call this the "inode_lock" now, since it covers both data and metadata.
This patch makes no functional changes.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c      |  8 ++++----
 fs/ocfs2/aops.c       | 18 ++++++++---------
 fs/ocfs2/dir.c        |  8 ++++----
 fs/ocfs2/dlmglue.c    | 50 +++++++++++++++++++++++------------------------
 fs/ocfs2/dlmglue.h    | 12 ++++++------
 fs/ocfs2/export.c     |  4 ++--
 fs/ocfs2/file.c       | 42 +++++++++++++++++++--------------------
 fs/ocfs2/inode.c      | 30 ++++++++++++++--------------
 fs/ocfs2/inode.h      |  2 +-
 fs/ocfs2/ioctl.c      |  8 ++++----
 fs/ocfs2/journal.c    | 26 ++++++++++++-------------
 fs/ocfs2/localalloc.c |  8 ++++----
 fs/ocfs2/mmap.c       |  8 ++++----
 fs/ocfs2/namei.c      | 54 +++++++++++++++++++++++++--------------------------
 fs/ocfs2/suballoc.c   |  4 ++--
 fs/ocfs2/super.c      |  6 +++---
 16 files changed, 144 insertions(+), 144 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 23c8cda43f19..e6df06ac6405 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 
 	mutex_lock(&data_alloc_inode->i_mutex);
 
-	status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
+	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_mutex;
@@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 
 out_unlock:
 	brelse(data_alloc_bh);
-	ocfs2_meta_unlock(data_alloc_inode, 1);
+	ocfs2_inode_unlock(data_alloc_inode, 1);
 
 out_mutex:
 	mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_mutex;
@@ -5118,7 +5118,7 @@ out_journal:
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
 out_mutex:
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5fc27cfaee50..ac8c39055717 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -275,7 +275,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 
 	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
 
-	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
+	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
 	if (ret != 0) {
 		if (ret == AOP_TRUNCATED_PAGE)
 			unlock = 0;
@@ -285,7 +285,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 
 	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
 		ret = AOP_TRUNCATED_PAGE;
-		goto out_meta_unlock;
+		goto out_inode_unlock;
 	}
 
 	/*
@@ -313,8 +313,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 
 out_alloc:
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-out_meta_unlock:
-	ocfs2_meta_unlock(inode, 0);
+out_inode_unlock:
+	ocfs2_inode_unlock(inode, 0);
 out:
 	if (unlock)
 		unlock_page(page);
@@ -443,7 +443,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 	 * accessed concurrently from multiple nodes.
 	 */
 	if (!INODE_JOURNAL(inode)) {
-		err = ocfs2_meta_lock(inode, NULL, 0);
+		err = ocfs2_inode_lock(inode, NULL, 0);
 		if (err) {
 			if (err != -ENOENT)
 				mlog_errno(err);
@@ -458,7 +458,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
-		ocfs2_meta_unlock(inode, 0);
+		ocfs2_inode_unlock(inode, 0);
 	}
 
 	if (err) {
@@ -1723,7 +1723,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	struct buffer_head *di_bh = NULL;
 	struct inode *inode = mapping->host;
 
-	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1753,7 +1753,7 @@ out_fail:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 	brelse(di_bh);
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 
 	return ret;
 }
@@ -1870,7 +1870,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
 
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 
 	return ret;
 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 63b28fdceb4a..6b0107f21344 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	mlog_entry("dirino=%llu\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+	error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
 	if (lock_level && error >= 0) {
 		/* We release EX lock which used to update atime
 		 * and get PR lock again to reduce contention
 		 * on commonly accessed directories. */
-		ocfs2_meta_unlock(inode, 1);
+		ocfs2_inode_unlock(inode, 1);
 		lock_level = 0;
-		error = ocfs2_meta_lock(inode, NULL, 0);
+		error = ocfs2_inode_lock(inode, NULL, 0);
 	}
 	if (error < 0) {
 		if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
 				      dirent, filldir, NULL);
 
-	ocfs2_meta_unlock(inode, lock_level);
+	ocfs2_inode_unlock(inode, lock_level);
 
 bail_nolock:
 	mlog_exit(error);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ecf58c6e2fa3..fa5e3bdc295d 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -224,7 +224,7 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
 	.flags		= 0,
 };
 
-static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
 	.get_osb	= ocfs2_get_inode_osb,
 	.check_downconvert = ocfs2_check_meta_downconvert,
 	.set_lvb	= ocfs2_set_meta_lvb,
@@ -306,7 +306,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres);
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_inode_lock_update(struct inode *inode,
 				  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
@@ -396,7 +396,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
 			ops = &ocfs2_inode_rw_lops;
 			break;
 		case OCFS2_LOCK_TYPE_META:
-			ops = &ocfs2_inode_meta_lops;
+			ops = &ocfs2_inode_inode_lops;
 			break;
 		case OCFS2_LOCK_TYPE_OPEN:
 			ops = &ocfs2_inode_open_lops;
@@ -1138,7 +1138,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 	 * We don't want to use LKM_LOCAL on a meta data lock as they
 	 * don't use a generation in their lock names.
 	 */
-	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
+	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
 	if (ret) {
 		mlog_errno(ret);
 		goto bail;
@@ -1346,11 +1346,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
 
 /* Call this with the lockres locked. I am reasonably sure we don't
  * need ip_lock in this function as anyone who would be changing those
- * values is supposed to be blocked in ocfs2_meta_lock right now. */
+ * values is supposed to be blocked in ocfs2_inode_lock right now. */
 static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
 
 	mlog_entry_void();
@@ -1400,7 +1400,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
 static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
 
 	mlog_entry_void();
@@ -1508,12 +1508,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
 }
 
 /* may or may not return a bh if it went to disk. */
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_inode_lock_update(struct inode *inode,
 				  struct buffer_head **bh)
 {
 	int status = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -1625,7 +1625,7 @@ static int ocfs2_assign_bh(struct inode *inode,
  * returns < 0 error if the callback will never be called, otherwise
  * the result of the lock will be communicated via the callback.
  */
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
 			 struct buffer_head **ret_bh,
 			 int ex,
 			 int arg_flags)
@@ -1660,7 +1660,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
 
-	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	level = ex ? LKM_EXMODE : LKM_PRMODE;
 	dlm_flags = 0;
 	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1699,11 +1699,11 @@ local:
 	}
 
 	/* This is fun. The caller may want a bh back, or it may
-	 * not. ocfs2_meta_lock_update definitely wants one in, but
+	 * not. ocfs2_inode_lock_update definitely wants one in, but
 	 * may or may not read one, depending on what's in the
 	 * LVB. The result of all of this is that we've *only* gone to
 	 * disk if we have to, so the complexity is worthwhile. */
-	status = ocfs2_meta_lock_update(inode, &local_bh);
+	status = ocfs2_inode_lock_update(inode, &local_bh);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -1725,7 +1725,7 @@ bail:
 			*ret_bh = NULL;
 		}
 		if (acquired)
-			ocfs2_meta_unlock(inode, ex);
+			ocfs2_inode_unlock(inode, ex);
 	}
 
 	if (local_bh)
@@ -1757,32 +1757,32 @@ bail:
  * ping locks back and forth, but that's a risk we're willing to take to avoid
  * the lock inversion simply.
  */
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
 			      struct buffer_head **ret_bh,
 			      int ex,
 			      struct page *page)
 {
 	int ret;
 
-	ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
+	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
 	if (ret == -EAGAIN) {
 		unlock_page(page);
-		if (ocfs2_meta_lock(inode, ret_bh, ex) == 0)
-			ocfs2_meta_unlock(inode, ex);
+		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+			ocfs2_inode_unlock(inode, ex);
 		ret = AOP_TRUNCATED_PAGE;
 	}
 
 	return ret;
 }
 
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
 			  int *level)
 {
 	int ret;
 
 	mlog_entry_void();
-	ret = ocfs2_meta_lock(inode, NULL, 0);
+	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1795,8 +1795,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
 	if (ocfs2_should_update_atime(inode, vfsmnt)) {
 		struct buffer_head *bh = NULL;
 
-		ocfs2_meta_unlock(inode, 0);
-		ret = ocfs2_meta_lock(inode, &bh, 1);
+		ocfs2_inode_unlock(inode, 0);
+		ret = ocfs2_inode_lock(inode, &bh, 1);
 		if (ret < 0) {
 			mlog_errno(ret);
 			return ret;
@@ -1813,11 +1813,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
 	return ret;
 }
 
-void ocfs2_meta_unlock(struct inode *inode,
+void ocfs2_inode_unlock(struct inode *inode,
 		       int ex)
 {
 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
-	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
@@ -2495,7 +2495,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 	status = err;
 
 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-			      &OCFS2_I(inode)->ip_meta_lockres);
+			      &OCFS2_I(inode)->ip_inode_lockres);
 	if (err < 0)
 		mlog_errno(err);
 	if (err < 0 && !status)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 3fd7729daeef..6dcbc944e8ce 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,7 +49,7 @@ struct ocfs2_meta_lvb {
 	__be32       lvb_reserved2;
 };
 
-/* ocfs2_meta_lock_full() 'arg_flags' flags */
+/* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
 /* Instruct the dlm not to queue ourselves on the other node. */
@@ -74,21 +74,21 @@ void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_open_lock(struct inode *inode);
 int ocfs2_try_open_lock(struct inode *inode, int write);
 void ocfs2_open_unlock(struct inode *inode);
-int ocfs2_meta_lock_atime(struct inode *inode,
+int ocfs2_inode_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
 			  int *level);
-int ocfs2_meta_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full(struct inode *inode,
 			 struct buffer_head **ret_bh,
 			 int ex,
 			 int arg_flags);
-int ocfs2_meta_lock_with_page(struct inode *inode,
+int ocfs2_inode_lock_with_page(struct inode *inode,
 			      struct buffer_head **ret_bh,
 			      int ex,
 			      struct page *page);
 /* 99% of the time we don't want to supply any additional flags --
  * those are for very specific cases only. */
-#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0)
-void ocfs2_meta_unlock(struct inode *inode,
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
+void ocfs2_inode_unlock(struct inode *inode,
 		       int ex);
 int ocfs2_super_lock(struct ocfs2_super *osb,
 		     int ex);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a4..1f9e353cac45 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	mlog(0, "find parent of directory %llu\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	status = ocfs2_meta_lock(dir, NULL, 0);
+	status = ocfs2_inode_lock(dir, NULL, 0);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	parent->d_op = &ocfs2_dentry_ops;
 
 bail_unlock:
-	ocfs2_meta_unlock(dir, 0);
+	ocfs2_inode_unlock(dir, 0);
 
 bail:
 	mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c5c183ac41fe..432e5f3c4784 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1025,7 +1025,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	status = ocfs2_meta_lock(inode, &bh, 1);
+	status = ocfs2_inode_lock(inode, &bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -1077,7 +1077,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
 	if (size_change)
 		ocfs2_rw_unlock(inode, 1);
@@ -1124,7 +1124,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 
 	mlog_entry_void();
 
-	ret = ocfs2_meta_lock(inode, NULL, 0);
+	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret) {
 		if (ret != -ENOENT)
 			mlog_errno(ret);
@@ -1133,7 +1133,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 
 	ret = generic_permission(inode, mask, NULL);
 
-	ocfs2_meta_unlock(inode, 0);
+	ocfs2_inode_unlock(inode, 0);
 out:
 	mlog_exit(ret);
 	return ret;
@@ -1605,7 +1605,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_rw_unlock;
@@ -1613,7 +1613,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 
 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
 		ret = -EPERM;
-		goto out_meta_unlock;
+		goto out_inode_unlock;
 	}
 
 	switch (sr->l_whence) {
@@ -1627,7 +1627,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 		break;
 	default:
 		ret = -EINVAL;
-		goto out_meta_unlock;
+		goto out_inode_unlock;
 	}
 	sr->l_whence = 0;
 
@@ -1638,14 +1638,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	    || (sr->l_start + llen) < 0
 	    || (sr->l_start + llen) > max_off) {
 		ret = -EINVAL;
-		goto out_meta_unlock;
+		goto out_inode_unlock;
 	}
 	size = sr->l_start + sr->l_len;
 
 	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
 		if (sr->l_len <= 0) {
 			ret = -EINVAL;
-			goto out_meta_unlock;
+			goto out_inode_unlock;
 		}
 	}
 
@@ -1653,7 +1653,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 		ret = __ocfs2_write_remove_suid(inode, di_bh);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_meta_unlock;
+			goto out_inode_unlock;
 		}
 	}
 
@@ -1679,7 +1679,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_meta_unlock;
+		goto out_inode_unlock;
 	}
 
 	/*
@@ -1689,7 +1689,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_meta_unlock;
+		goto out_inode_unlock;
 	}
 
 	if (change_size && i_size_read(inode) < size)
@@ -1702,9 +1702,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 
 	ocfs2_commit_trans(osb, handle);
 
-out_meta_unlock:
+out_inode_unlock:
 	brelse(di_bh);
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 out_rw_unlock:
 	ocfs2_rw_unlock(inode, 1);
 
@@ -1774,7 +1774,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 	 * if we need to make modifications here.
 	 */
 	for(;;) {
-		ret = ocfs2_meta_lock(inode, NULL, meta_level);
+		ret = ocfs2_inode_lock(inode, NULL, meta_level);
 		if (ret < 0) {
 			meta_level = -1;
 			mlog_errno(ret);
@@ -1792,7 +1792,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		 * set inode->i_size at the end of a write. */
 		if (should_remove_suid(dentry)) {
 			if (meta_level == 0) {
-				ocfs2_meta_unlock(inode, meta_level);
+				ocfs2_inode_unlock(inode, meta_level);
 				meta_level = 1;
 				continue;
 			}
@@ -1861,7 +1861,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		*ppos = saved_pos;
 
 out_unlock:
-	ocfs2_meta_unlock(inode, meta_level);
+	ocfs2_inode_unlock(inode, meta_level);
 
 out:
 	return ret;
@@ -2074,12 +2074,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 	/*
 	 * See the comment in ocfs2_file_aio_read()
 	 */
-	ret = ocfs2_meta_lock(inode, NULL, 0);
+	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
 	}
-	ocfs2_meta_unlock(inode, 0);
+	ocfs2_inode_unlock(inode, 0);
 
 	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
 
@@ -2135,12 +2135,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	 * like i_size. This allows the checks down below
 	 * generic_file_aio_read() a chance of actually working. 
 	 */
-	ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
 	}
-	ocfs2_meta_unlock(inode, lock_level);
+	ocfs2_inode_unlock(inode, lock_level);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
 	if (ret == -EINVAL)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8ff201d3705e..00cd5b7f3e52 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -321,7 +321,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		 */
 		BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
 
-		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
 					  OCFS2_LOCK_TYPE_META, 0, inode);
 
 		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -409,7 +409,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
 		generation = osb->fs_generation;
 
-	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
 				  OCFS2_LOCK_TYPE_META,
 				  generation, inode);
 
@@ -424,7 +424,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 			mlog_errno(status);
 			return status;
 		}
-		status = ocfs2_meta_lock(inode, NULL, 0);
+		status = ocfs2_inode_lock(inode, NULL, 0);
 		if (status) {
 			make_bad_inode(inode);
 			mlog_errno(status);
@@ -479,7 +479,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 bail:
 	if (can_lock)
-		ocfs2_meta_unlock(inode, 0);
+		ocfs2_inode_unlock(inode, 0);
 
 	if (status < 0)
 		make_bad_inode(inode);
@@ -581,7 +581,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	mutex_lock(&inode_alloc_inode->i_mutex);
-	status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1);
+	status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
 	if (status < 0) {
 		mutex_unlock(&inode_alloc_inode->i_mutex);
 
@@ -630,7 +630,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
-	ocfs2_meta_unlock(inode_alloc_inode, 1);
+	ocfs2_inode_unlock(inode_alloc_inode, 1);
 	mutex_unlock(&inode_alloc_inode->i_mutex);
 	brelse(inode_alloc_bh);
 bail:
@@ -704,7 +704,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	 * delete_inode operation. We do this now to avoid races with
 	 * recovery completion on other nodes. */
 	mutex_lock(&orphan_dir_inode->i_mutex);
-	status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
 	if (status < 0) {
 		mutex_unlock(&orphan_dir_inode->i_mutex);
 
@@ -728,7 +728,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		mlog_errno(status);
 
 bail_unlock_dir:
-	ocfs2_meta_unlock(orphan_dir_inode, 1);
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
 	mutex_unlock(&orphan_dir_inode->i_mutex);
 	brelse(orphan_dir_bh);
 bail:
@@ -929,7 +929,7 @@ void ocfs2_delete_inode(struct inode *inode)
 	 * allocation lock here as it won't be needed - nobody will
 	 * have the file open.
 	 */
-	status = ocfs2_meta_lock(inode, &di_bh, 1);
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -975,7 +975,7 @@ void ocfs2_delete_inode(struct inode *inode)
 	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
 
 bail_unlock_inode:
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
 bail_unblock:
 	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1009,7 +1009,7 @@ void ocfs2_clear_inode(struct inode *inode)
 	/* Do these before all the other work so that we don't bounce
 	 * the downconvert thread while waiting to destroy the locks. */
 	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
 	/* We very well may get a clear_inode before all an inodes
@@ -1032,7 +1032,7 @@ void ocfs2_clear_inode(struct inode *inode)
 		mlog_errno(status);
 
 	ocfs2_lock_res_free(&oi->ip_rw_lockres);
-	ocfs2_lock_res_free(&oi->ip_meta_lockres);
+	ocfs2_lock_res_free(&oi->ip_inode_lockres);
 	ocfs2_lock_res_free(&oi->ip_open_lockres);
 
 	ocfs2_metadata_cache_purge(inode);
@@ -1176,15 +1176,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
 	}
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
-	/* Let ocfs2_meta_lock do the work of updating our struct
+	/* Let ocfs2_inode_lock do the work of updating our struct
 	 * inode for us. */
-	status = ocfs2_meta_lock(inode, NULL, 0);
+	status = ocfs2_inode_lock(inode, NULL, 0);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		goto bail;
 	}
-	ocfs2_meta_unlock(inode, 0);
+	ocfs2_inode_unlock(inode, 0);
 bail:
 	mlog_exit(status);
 
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index d1c54da687c9..a61c044eb7da 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,7 +34,7 @@ struct ocfs2_inode_info
 	u64			ip_blkno;
 
 	struct ocfs2_lock_res		ip_rw_lockres;
-	struct ocfs2_lock_res		ip_meta_lockres;
+	struct ocfs2_lock_res		ip_inode_lockres;
 	struct ocfs2_lock_res		ip_open_lockres;
 
 	/* protects allocation changes on this inode. */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b5..67c2fb4bae91 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -27,14 +27,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
 	int status;
 
-	status = ocfs2_meta_lock(inode, NULL, 0);
+	status = ocfs2_inode_lock(inode, NULL, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		return status;
 	}
 	ocfs2_get_inode_flags(OCFS2_I(inode));
 	*flags = OCFS2_I(inode)->ip_attr;
-	ocfs2_meta_unlock(inode, 0);
+	ocfs2_inode_unlock(inode, 0);
 
 	mlog_exit(status);
 	return status;
@@ -52,7 +52,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_meta_lock(inode, &bh, 1);
+	status = ocfs2_inode_lock(inode, &bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -100,7 +100,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 bail:
 	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f2ebe2eb3c21..4f440a88bf53 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -336,7 +336,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	struct ocfs2_dinode *di = NULL;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_super *osb;
-	int meta_lock = 0;
+	int inode_lock = 0;
 
 	mlog_entry_void();
 
@@ -366,14 +366,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	/* Skip recovery waits here - journal inode metadata never
 	 * changes in a live cluster so it can be considered an
 	 * exception to the rule. */
-	status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
 		if (status != -ERESTARTSYS)
 			mlog(ML_ERROR, "Could not get lock on journal!\n");
 		goto done;
 	}
 
-	meta_lock = 1;
+	inode_lock = 1;
 	di = (struct ocfs2_dinode *)bh->b_data;
 
 	if (inode->i_size <  OCFS2_MIN_JOURNAL_SIZE) {
@@ -413,8 +413,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	status = 0;
 done:
 	if (status < 0) {
-		if (meta_lock)
-			ocfs2_meta_unlock(inode, 1);
+		if (inode_lock)
+			ocfs2_inode_unlock(inode, 1);
 		if (bh != NULL)
 			brelse(bh);
 		if (inode) {
@@ -543,7 +543,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	OCFS2_I(inode)->ip_open_count--;
 
 	/* unlock our journal */
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 
 	brelse(journal->j_bh);
 	journal->j_bh = NULL;
@@ -972,9 +972,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
+	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
-		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+		mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
 		if (status != -ERESTARTSYS)
 			mlog(ML_ERROR, "Could not lock journal!\n");
 		goto done;
@@ -1046,7 +1046,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 done:
 	/* drop the lock on this nodes journal */
 	if (got_lock)
-		ocfs2_meta_unlock(inode, 1);
+		ocfs2_inode_unlock(inode, 1);
 
 	if (inode)
 		iput(inode);
@@ -1161,14 +1161,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 	SET_INODE_JOURNAL(inode);
 
 	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
-	status = ocfs2_meta_lock_full(inode, NULL, 1, flags);
+	status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
 	if (status < 0) {
 		if (status != -EAGAIN)
 			mlog_errno(status);
 		goto bail;
 	}
 
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 bail:
 	if (inode)
 		iput(inode);
@@ -1276,7 +1276,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 	}	
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
-	status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0);
+	status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -1292,7 +1292,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 	*head = priv.head;
 
 out_cluster:
-	ocfs2_meta_unlock(orphan_dir_inode, 0);
+	ocfs2_inode_unlock(orphan_dir_inode, 0);
 out:
 	mutex_unlock(&orphan_dir_inode->i_mutex);
 	iput(orphan_dir_inode);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 58ea88b5af36..0de0792fce7f 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -231,7 +231,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	mutex_lock(&main_bm_inode->i_mutex);
 
-	status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_mutex;
@@ -286,7 +286,7 @@ out_unlock:
 	if (main_bm_bh)
 		brelse(main_bm_bh);
 
-	ocfs2_meta_unlock(main_bm_inode, 1);
+	ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
 	mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +399,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&main_bm_inode->i_mutex);
 
-	status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1);
+	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_mutex;
@@ -424,7 +424,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
-	ocfs2_meta_unlock(main_bm_inode, 1);
+	ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
 	mutex_unlock(&main_bm_inode->i_mutex);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index a7f0ccc6fdd8..3dc18d67557c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	 * node. Taking the data lock will also ensure that we don't
 	 * attempt page truncation as part of a downconvert.
 	 */
-	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -186,7 +186,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 	brelse(di_bh);
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 
 out:
 	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -205,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int ret = 0, lock_level = 0;
 
-	ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
+	ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
 				    file->f_vfsmnt, &lock_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
-	ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
+	ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
 out:
 	vma->vm_ops = &ocfs2_file_vm_ops;
 	vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6295fd6ae469..74018caf8053 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -115,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
 	     dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-	status = ocfs2_meta_lock(dir, NULL, 0);
+	status = ocfs2_inode_lock(dir, NULL, 0);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -176,7 +176,7 @@ bail_unlock:
 	 * unlink on another node will message us to remove that
 	 * dentry under this lock so otherwise we can race this with
 	 * the downconvert thread and have a stale dentry. */
-	ocfs2_meta_unlock(dir, 0);
+	ocfs2_inode_unlock(dir, 0);
 
 bail:
 
@@ -208,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
 	/* get our super block */
 	osb = OCFS2_SB(dir->i_sb);
 
-	status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+	status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -322,7 +322,7 @@ leave:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
-	ocfs2_meta_unlock(dir, 1);
+	ocfs2_inode_unlock(dir, 1);
 
 	if (status == -ENOSPC)
 		mlog(0, "Disk is full\n");
@@ -552,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 
-	err = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+	err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
 	if (err < 0) {
 		if (err != -ENOENT)
 			mlog_errno(err);
@@ -577,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out;
 	}
 
-	err = ocfs2_meta_lock(inode, &fe_bh, 1);
+	err = ocfs2_inode_lock(inode, &fe_bh, 1);
 	if (err < 0) {
 		if (err != -ENOENT)
 			mlog_errno(err);
@@ -642,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out_unlock_inode:
-	ocfs2_meta_unlock(inode, 1);
+	ocfs2_inode_unlock(inode, 1);
 
 out:
-	ocfs2_meta_unlock(dir, 1);
+	ocfs2_inode_unlock(dir, 1);
 
 	if (de_bh)
 		brelse(de_bh);
@@ -719,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
 		return -EPERM;
 	}
 
-	status = ocfs2_meta_lock(dir, &parent_node_bh, 1);
+	status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -744,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	status = ocfs2_meta_lock(inode, &fe_bh, 1);
+	status = ocfs2_inode_lock(inode, &fe_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -840,13 +840,13 @@ leave:
 		ocfs2_commit_trans(osb, handle);
 
 	if (child_locked)
-		ocfs2_meta_unlock(inode, 1);
+		ocfs2_inode_unlock(inode, 1);
 
-	ocfs2_meta_unlock(dir, 1);
+	ocfs2_inode_unlock(dir, 1);
 
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
-		ocfs2_meta_unlock(orphan_dir, 1);
+		ocfs2_inode_unlock(orphan_dir, 1);
 		mutex_unlock(&orphan_dir->i_mutex);
 		iput(orphan_dir);
 	}
@@ -907,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 			inode1 = tmpinode;
 		}
 		/* lock id2 */
-		status = ocfs2_meta_lock(inode2, bh2, 1);
+		status = ocfs2_inode_lock(inode2, bh2, 1);
 		if (status < 0) {
 			if (status != -ENOENT)
 				mlog_errno(status);
@@ -916,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	}
 
 	/* lock id1 */
-	status = ocfs2_meta_lock(inode1, bh1, 1);
+	status = ocfs2_inode_lock(inode1, bh1, 1);
 	if (status < 0) {
 		/*
 		 * An error return must mean that no cluster locks
 		 * were held on function exit.
 		 */
 		if (oi1->ip_blkno != oi2->ip_blkno)
-			ocfs2_meta_unlock(inode2, 1);
+			ocfs2_inode_unlock(inode2, 1);
 
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -936,10 +936,10 @@ bail:
 
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 {
-	ocfs2_meta_unlock(inode1, 1);
+	ocfs2_inode_unlock(inode1, 1);
 
 	if (inode1 != inode2)
-		ocfs2_meta_unlock(inode2, 1);
+		ocfs2_inode_unlock(inode2, 1);
 }
 
 static int ocfs2_rename(struct inode *old_dir,
@@ -1034,7 +1034,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	 * won't have to concurrently downconvert the inode and the
 	 * dentry locks.
 	 */
-	status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1);
+	status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
 			goto bail;
 		}
 
-		status = ocfs2_meta_lock(new_inode, &newfe_bh, 1);
+		status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
 		if (status < 0) {
 			if (status != -ENOENT)
 				mlog_errno(status);
@@ -1355,14 +1355,14 @@ bail:
 		ocfs2_double_unlock(old_dir, new_dir);
 
 	if (old_child_locked)
-		ocfs2_meta_unlock(old_inode, 1);
+		ocfs2_inode_unlock(old_inode, 1);
 
 	if (new_child_locked)
-		ocfs2_meta_unlock(new_inode, 1);
+		ocfs2_inode_unlock(new_inode, 1);
 
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
-		ocfs2_meta_unlock(orphan_dir, 1);
+		ocfs2_inode_unlock(orphan_dir, 1);
 		mutex_unlock(&orphan_dir->i_mutex);
 		iput(orphan_dir);
 	}
@@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
 	credits = ocfs2_calc_symlink_credits(sb);
 
 	/* lock the parent directory */
-	status = ocfs2_meta_lock(dir, &parent_fe_bh, 1);
+	status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -1657,7 +1657,7 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
-	ocfs2_meta_unlock(dir, 1);
+	ocfs2_inode_unlock(dir, 1);
 
 	if (new_fe_bh)
 		brelse(new_fe_bh);
@@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
 
-	status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 					      orphan_dir_bh, name,
 					      OCFS2_ORPHAN_NAMELEN, de_bh);
 	if (status < 0) {
-		ocfs2_meta_unlock(orphan_dir_inode, 1);
+		ocfs2_inode_unlock(orphan_dir_inode, 1);
 
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3a..6df4dbf67d18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -114,7 +114,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
 
 	if (inode) {
 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
-			ocfs2_meta_unlock(inode, 1);
+			ocfs2_inode_unlock(inode, 1);
 
 		mutex_unlock(&inode->i_mutex);
 
@@ -412,7 +412,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 
 	mutex_lock(&alloc_inode->i_mutex);
 
-	status = ocfs2_meta_lock(alloc_inode, &bh, 1);
+	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
 	if (status < 0) {
 		mutex_unlock(&alloc_inode->i_mutex);
 		iput(alloc_inode);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 064eba074f1e..7708df36e223 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -964,7 +964,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 		goto bail;
 	}
 
-	status = ocfs2_meta_lock(inode, &bh, 0);
+	status = ocfs2_inode_lock(inode, &bh, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -988,7 +988,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 	brelse(bh);
 
-	ocfs2_meta_unlock(inode, 0);
+	ocfs2_inode_unlock(inode, 0);
 	status = 0;
 bail:
 	if (inode)
@@ -1019,7 +1019,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
 	oi->ip_clusters = 0;
 
 	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
-	ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+	ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
 	ocfs2_metadata_cache_init(&oi->vfs_inode);
-- 
cgit v1.2.3


From 628a24f5bdf31b795d596eaed71670579b96a9aa Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 30 Oct 2007 12:08:32 -0700
Subject: ocfs2: Readpages support

Add ->readpages support to Ocfs2. This is rather trivial - all it required
is a small update to ocfs2_get_block (for mapping full extents via b_size)
and an ocfs2_readpages() function which partially mirrors ocfs2_readpage().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ac8c39055717..286af3a11383 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
 #include <asm/byteorder.h>
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/mpage.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 {
 	int err = 0;
 	unsigned int ext_flags;
-	u64 p_blkno, past_eof;
+	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
+	u64 p_blkno, count, past_eof;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
 					  &ext_flags);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
+	if (max_blocks < count)
+		count = max_blocks;
+
 	/*
 	 * ocfs2 never allocates in this function - the only time we
 	 * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
 
+	bh_result->b_size = count << inode->i_blkbits;
+
 	if (!ocfs2_sparse_alloc(osb)) {
 		if (p_blkno == 0) {
 			err = -EIO;
@@ -322,6 +329,62 @@ out:
 	return ret;
 }
 
+/*
+ * Read-ahead only, so failures are safe to ignore. Returns -EIO if we bail
+ * out early, else the result of mpage_readpages().
+ *
+ * Right now, we don't bother with BH_Boundary - in-inode extent lists
+ * are quite large (243 extents on 4k blocks), so most inodes don't
+ * grow out to a tree. If need be, detecting boundary extents could
+ * trivially be added in a future version of ocfs2_get_block().
+ */
+static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	int ret, err = -EIO;	/* err is what we return */
+	struct inode *inode = mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	loff_t start;
+	struct page *last;
+
+	/*
+	 * Use the nonblocking flag for the dlm code to avoid page lock
+	 * inversion, but don't retry; the actual lock error is dropped.
+	 */
+	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
+	if (ret)
+		return err;
+
+	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+		ocfs2_inode_unlock(inode, 0);
+		return err;
+	}
+
+	/*
+	 * Don't bother with inline-data. There isn't anything
+	 * to read-ahead in that case anyway...
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		goto out_unlock;
+
+	/*
+	 * If the tail entry of @pages starts at or past i_size (e.g. a
+	 * remote node truncated the file), give up - not worth handling.
+	 */
+	last = list_entry(pages->prev, struct page, lru);
+	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+	if (start >= i_size_read(inode))
+		goto out_unlock;
+
+	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
+
+out_unlock:
+	up_read(&oi->ip_alloc_sem);
+	ocfs2_inode_unlock(inode, 0);
+
+	return err;
+}
+
 /* Note: Because we don't support holes, our allocation has
  * already happened (allocation writes zeros to the file data)
  * so we don't have to worry about ordered writes in
@@ -1877,6 +1940,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 
 const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
+	.readpages	= ocfs2_readpages,
 	.writepage	= ocfs2_writepage,
 	.write_begin	= ocfs2_write_begin,
 	.write_end	= ocfs2_write_end,
-- 
cgit v1.2.3


From 1252c434e39dc60ca9e8ed682f3e04930e2c08de Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 30 Oct 2007 12:09:03 -0700
Subject: ocfs2: Documentation update

Remove 'readpages' from the list in ocfs2.txt. Instead of having two
identical lists, I just removed the list in the OCFS2 section of fs/Kconfig
and added a pointer to Documentation/filesystems/ocfs2.txt.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 781b47d2f9f2..16598a417423 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -440,14 +440,8 @@ config OCFS2_FS
 	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
 	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
 
-	  Note: Features which OCFS2 does not support yet:
-	          - extended attributes
-	          - quotas
-	          - cluster aware flock
-	          - Directory change notification (F_NOTIFY)
-	          - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
-	          - POSIX ACLs
-	          - readpages / writepages (not user visible)
+	  For more information on OCFS2, see the file
+	  <file:Documentation/filesystems/ocfs2.txt>.
 
 config OCFS2_DEBUG_MASKLOG
 	bool "OCFS2 logging support"
-- 
cgit v1.2.3


From e9d578a8f279d5e7d1e903f436aefc76ba330b43 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 18 Dec 2007 15:46:10 +0800
Subject: ocfs2: Initialize bitmap_cpg of ocfs2_super to be the maximum.

This value is initialized from global_bitmap->id2.i_chain.cl_cpg. If there
is only 1 group, it will be equal to the total clusters in the volume. So
as for online resize, it should change for all the nodes in the cluster.
It isn't easy and there is no corresponding lock for it.

bitmap_cpg is only used in 2 areas:
1. Checking whether the suballoc is too large for us to allocate from the
   global bitmap. This is little used: now that the suballoc size is 2048, we
   rarely hit this situation and the check is almost useless.
2. Calculating which group a cluster belongs to. We use it during truncate to
   figure out which cluster group an extent belongs to. We should be OK if we
   increase it, though, as the cluster group calculated shouldn't change and
   we only ever have a small bitmap_cpg on file systems with a single cluster
   group.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7708df36e223..479ac50c86d9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1280,7 +1280,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	int i, cbits, bbits;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 	struct inode *inode = NULL;
-	struct buffer_head *bitmap_bh = NULL;
 	struct ocfs2_journal *journal;
 	__le32 uuid_net_key;
 	struct ocfs2_super *osb;
@@ -1497,25 +1496,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
-
-	/* We don't have a cluster lock on the bitmap here because
-	 * we're only interested in static information and the extra
-	 * complexity at mount time isn't worht it. Don't pass the
-	 * inode in to the read function though as we don't want it to
-	 * be put in the cache. */
-	status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
-				  NULL);
 	iput(inode);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
 
-	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
-	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
-	brelse(bitmap_bh);
-	mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
-	     (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
+	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
 
 	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
-- 
cgit v1.2.3


From d659072f736837e56b6433d58e5315ad1d4d5ccf Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 18 Dec 2007 15:47:03 +0800
Subject: [PATCH 1/2] ocfs2: Add group extend for online resize

This patch adds the ability for a userspace program to request an extend of
last cluster group on an Ocfs2 file system. The request is made via ioctl,
OCFS2_IOC_GROUP_EXTEND. This is derived from EXT3_IOC_GROUP_EXTEND, but is
obviously Ocfs2 specific.

tunefs.ocfs2 would call this for an online-resize operation if the last
cluster group isn't full.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/Makefile         |   1 +
 fs/ocfs2/buffer_head_io.c |  61 +++++++
 fs/ocfs2/buffer_head_io.h |   2 +
 fs/ocfs2/ioctl.c          |   8 +
 fs/ocfs2/journal.h        |   3 +
 fs/ocfs2/ocfs2_fs.h       |   2 +
 fs/ocfs2/resize.c         | 398 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/resize.h         |  31 ++++
 fs/ocfs2/suballoc.c       |  11 +-
 fs/ocfs2/suballoc.h       |   8 +
 10 files changed, 518 insertions(+), 7 deletions(-)
 create mode 100644 fs/ocfs2/resize.c
 create mode 100644 fs/ocfs2/resize.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d2057e7fbda7..3591890b32c6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -21,6 +21,7 @@ ocfs2-objs := \
 	localalloc.o 		\
 	mmap.o 			\
 	namei.o 		\
+	resize.o		\
 	slot_map.o 		\
 	suballoc.o 		\
 	super.o 		\
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f6..31aa61dc777b 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -280,3 +280,64 @@ bail:
 	mlog_exit(status);
 	return status;
 }
+
+/* BUG() unless blkno is the super block or one of its backup locations. */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+					sector_t blkno)
+{
+	int i;
+	u64 backup_blkno;
+
+	if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+		return;	/* primary super block */
+
+	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+		backup_blkno = ocfs2_backup_super_blkno(sb, i);
+		if (backup_blkno == blkno)
+			return;	/* matches backup slot i */
+	}
+
+	BUG();	/* neither the primary nor any backup location */
+}
+
+/*
+ * Synchronously write a super block (or backup) bh. This bypasses the
+ * journal, so ip_io_mutex isn't taken and no inode is passed in. Returns 0,
+ * -EROFS if the fs is read-only, or -EIO; the caller keeps its bh reference.
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+				struct buffer_head *bh)
+{
+	int ret = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(buffer_jbd(bh));
+	ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (!buffer_uptodate(bh)) {
+		ret = -EIO;
+		mlog_errno(ret); /* don't brelse: the ref is the caller's */
+	}
+
+out:
+	mlog_exit(ret);
+	return ret;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac3..c2e78614c3e5 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super          *osb,
 		      int                  flags,
 		      struct inode        *inode);
 
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+				struct buffer_head *bh);
 
 #define OCFS2_BH_CACHED            1
 #define OCFS2_BH_READAHEAD         8
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 67c2fb4bae91..b74b24ecf0e4 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
 
 #include "ocfs2_fs.h"
 #include "ioctl.h"
+#include "resize.h"
 
 #include <linux/ext2_fs.h>
 
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 	unsigned int cmd, unsigned long arg)
 {
 	unsigned int flags;
+	int new_clusters;
 	int status;
 	struct ocfs2_space_resv sr;
 
@@ -140,6 +142,11 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 			return -EFAULT;
 
 		return ocfs2_change_file_space(filp, cmd, &sr);
+	case OCFS2_IOC_GROUP_EXTEND:
+		if (get_user(new_clusters, (int __user *)arg))
+			return -EFAULT;
+
+		return ocfs2_group_extend(inode, new_clusters);
 	default:
 		return -ENOTTY;
 	}
@@ -162,6 +169,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_RESVSP64:
 	case OCFS2_IOC_UNRESVSP:
 	case OCFS2_IOC_UNRESVSP64:
+	case OCFS2_IOC_GROUP_EXTEND:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e0961568..0ba3a421ccf2 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,9 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* simple file updates like chmod, etc. */
 #define OCFS2_INODE_UPDATE_CREDITS 1
 
+/* group extend. inode update and last group update. */
+#define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
+
 /* get one bit out of a suballocator: dinode + group descriptor +
  * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a73..19ac421b613b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,8 @@ struct ocfs2_space_resv {
 #define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
 #define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
 
+#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
+
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
  */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 000000000000..848f7293f4fc
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,398 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.c
+ *
+ * volume resize.
+ * Inspired by ext3/resize.c.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "resize.h"
+
+/*
+ * Check whether any new backup superblocks exist
+ * in the last group. If there are some, mark them or clear
+ * them in the bitmap.
+ *
+ * Return how many backups we find in the last group.
+ */
+static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+				       struct ocfs2_group_desc *gd,
+				       int new_clusters,
+				       u32 first_new_cluster,
+				       u16 cl_cpg,
+				       int set)
+{
+	int i;
+	u16 backups = 0;
+	u32 cluster;
+	u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+
+	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+		blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+		gd_blkno = ocfs2_which_cluster_group(inode, cluster);
+		if (gd_blkno < lgd_blkno)
+			continue;
+		else if (gd_blkno > lgd_blkno)
+			break;
+
+		if (set)
+			ocfs2_set_bit(cluster % cl_cpg,
+				      (unsigned long *)gd->bg_bitmap);
+		else
+			ocfs2_clear_bit(cluster % cl_cpg,
+					(unsigned long *)gd->bg_bitmap);
+		backups++;
+	}
+
+	mlog_exit_void();
+	return backups;
+}
+
+static int ocfs2_update_last_group_and_inode(handle_t *handle,
+					     struct inode *bm_inode,
+					     struct buffer_head *bm_bh,
+					     struct buffer_head *group_bh,
+					     u32 first_new_cluster,
+					     int new_clusters)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct ocfs2_chain_rec *cr;
+	struct ocfs2_group_desc *group;
+	u16 chain, num_bits, backups = 0;
+	u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+	u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+
+	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
+		   new_clusters, first_new_cluster);
+
+	ret = ocfs2_journal_access(handle, bm_inode, group_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+	/* update the group first. */
+	num_bits = new_clusters * cl_bpc;
+	le16_add_cpu(&group->bg_bits, num_bits);
+	le16_add_cpu(&group->bg_free_bits_count, num_bits);
+
+	/*
+	 * check whether any new backup superblocks exist in
+	 * this group and update the group bitmap accordingly.
+	 */
+	if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+				     OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+		backups = ocfs2_calc_new_backup_super(bm_inode,
+						     group,
+						     new_clusters,
+						     first_new_cluster,
+						     cl_cpg, 1);
+		le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+	}
+
+	ret = ocfs2_journal_dirty(handle, group_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_rollback;
+	}
+
+	/* update the inode accordingly. */
+	ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_rollback;
+	}
+
+	chain = le16_to_cpu(group->bg_chain);
+	cr = (&cl->cl_recs[chain]);
+	le32_add_cpu(&cr->c_total, num_bits);
+	le32_add_cpu(&cr->c_free, num_bits);
+	le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
+	le32_add_cpu(&fe->i_clusters, new_clusters);
+
+	if (backups) {
+		le32_add_cpu(&cr->c_free, -1 * backups);
+		le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
+	}
+
+	spin_lock(&OCFS2_I(bm_inode)->ip_lock);
+	OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
+	spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
+	i_size_write(bm_inode, le64_to_cpu(fe->i_size));
+
+	ocfs2_journal_dirty(handle, bm_bh);
+
+out_rollback:
+	if (ret < 0) {
+		ocfs2_calc_new_backup_super(bm_inode,
+					    group,
+					    new_clusters,
+					    first_new_cluster,
+					    cl_cpg, 0);
+		le16_add_cpu(&group->bg_free_bits_count, backups);
+		le16_add_cpu(&group->bg_bits, -1 * num_bits);
+		le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+	}
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int update_backups(struct inode * inode, u32 clusters, char *data)
+{
+	int i, ret = 0;
+	u32 cluster;
+	u64 blkno;
+	struct buffer_head *backup = NULL;
+	struct ocfs2_dinode *backup_di = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* calculate the real backups we need to update. */
+	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+		blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+		if (cluster > clusters)
+			break;
+
+		ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
+
+		backup_di = (struct ocfs2_dinode *)backup->b_data;
+		backup_di->i_blkno = cpu_to_le64(blkno);
+
+		ret = ocfs2_write_super_or_backup(osb, backup);
+		brelse(backup);
+		backup = NULL;
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void ocfs2_update_super_and_backups(struct inode *inode,
+					   int new_clusters)
+{
+	int ret;
+	u32 clusters = 0;
+	struct buffer_head *super_bh = NULL;
+	struct ocfs2_dinode *super_di = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * update the superblock last.
+	 * It doesn't matter if the write failed.
+	 */
+	ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
+			       &super_bh, 0, NULL);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	super_di = (struct ocfs2_dinode *)super_bh->b_data;
+	le32_add_cpu(&super_di->i_clusters, new_clusters);
+	clusters = le32_to_cpu(super_di->i_clusters);
+
+	ret = ocfs2_write_super_or_backup(osb, super_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
+		ret = update_backups(inode, clusters, super_bh->b_data);
+
+out:
+	if (super_bh)
+		brelse(super_bh);
+	if (ret)
+		printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
+			" during fs resize. This condition is not fatal,"
+			" but fsck.ocfs2 should be run to fix it\n",
+			osb->dev_str);
+	return;
+}
+
+/*
+ * Extend the filesystem by the number of new clusters specified.  This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.
+ */
+int ocfs2_group_extend(struct inode * inode, int new_clusters)
+{
+	int ret;
+	handle_t *handle;
+	struct buffer_head *main_bm_bh = NULL;
+	struct buffer_head *group_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_group_desc *group = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 cl_bpc;
+	u32 first_new_cluster;
+	u64 lgd_blkno;
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	if (new_clusters < 0)
+		return -EINVAL;
+	else if (new_clusters == 0)
+		return 0;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+				 ocfs2_group_bitmap_size(osb->sb) * 8) {
+		mlog(ML_ERROR, "The disk is too old and small. "
+		     "Force to do offline resize.");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
+		ret = -EIO;
+		goto out_unlock;
+	}
+
+	first_new_cluster = le32_to_cpu(fe->i_clusters);
+	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
+					      first_new_cluster - 1);
+
+	ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
+			       main_bm_inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+	ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
+		le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	mlog(0, "extend the last group at %llu, new clusters = %d\n",
+	     le64_to_cpu(group->bg_blkno), new_clusters);
+
+	handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* update the last group descriptor and inode. */
+	ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
+						main_bm_bh, group_bh,
+						first_new_cluster,
+						new_clusters);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	if (group_bh)
+		brelse(group_bh);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+
+out:
+	mlog_exit_void();
+	return ret;
+}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 000000000000..3acb79af451b
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_RESIZE_H
+#define OCFS2_RESIZE_H
+
+int ocfs2_group_extend(struct inode * inode, int new_clusters);
+
+#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6df4dbf67d18..4391744e80f8 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 						   u64 bg_blkno,
 						   u16 bg_bit_off);
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
-					    u32 cluster);
 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u64 data_blkno,
 						u64 *bg_blkno,
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 }
 
 /* somewhat more expensive than our other checks, so use sparingly. */
-static int ocfs2_check_group_descriptor(struct super_block *sb,
-					struct ocfs2_dinode *di,
-					struct ocfs2_group_desc *gd)
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct ocfs2_group_desc *gd)
 {
 	unsigned int max_bits;
 
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 
 /* given a cluster offset, calculate which block group it belongs to
  * and return that block offset. */
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
-					    u32 cluster)
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u32 group_no;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe93703095..8799033bb459 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 				      struct ocfs2_alloc_context *ac);
 
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
+
+/* somewhat more expensive than our other checks, so use sparingly. */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct ocfs2_group_desc *gd);
 #endif /* _CHAINALLOC_H_ */
-- 
cgit v1.2.3


From 7909f2bf835376a20d6dbf853eb459a27566eba2 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 18 Dec 2007 15:47:25 +0800
Subject: [PATCH 2/2] ocfs2: Implement group add for online resize

This patch adds the ability for a userspace program to request that a
properly formatted cluster group be added to the main allocation bitmap for
an Ocfs2 file system. The request is made via an ioctl, OCFS2_IOC_GROUP_ADD.
On a high level, this is similar to ext3, but we use a different ioctl as
the structure which has to be passed through is different.

During an online resize, tunefs.ocfs2 will format any new cluster groups
which must be added to complete the resize, and call OCFS2_IOC_GROUP_ADD on
each one. The kernel verifies that the core cluster group information is valid
and then does the work of linking it into the global allocation bitmap.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/ioctl.c    |   9 ++
 fs/ocfs2/journal.h  |   3 +
 fs/ocfs2/ocfs2_fs.h |  12 +++
 fs/ocfs2/resize.c   | 245 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/resize.h   |   1 +
 5 files changed, 269 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index b74b24ecf0e4..7003d5820d79 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -119,6 +119,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 	int new_clusters;
 	int status;
 	struct ocfs2_space_resv sr;
+	struct ocfs2_new_group_input input;
 
 	switch (cmd) {
 	case OCFS2_IOC_GETFLAGS:
@@ -147,6 +148,12 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 			return -EFAULT;
 
 		return ocfs2_group_extend(inode, new_clusters);
+	case OCFS2_IOC_GROUP_ADD:
+	case OCFS2_IOC_GROUP_ADD64:
+		if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
+			return -EFAULT;
+
+		return ocfs2_group_add(inode, &input);
 	default:
 		return -ENOTTY;
 	}
@@ -170,6 +177,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_UNRESVSP:
 	case OCFS2_IOC_UNRESVSP64:
 	case OCFS2_IOC_GROUP_EXTEND:
+	case OCFS2_IOC_GROUP_ADD:
+	case OCFS2_IOC_GROUP_ADD64:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0ba3a421ccf2..220f3e818e78 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -281,6 +281,9 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
+/* group add. inode update and the new group update. */
+#define OCFS2_GROUP_ADD_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
+
 /* get one bit out of a suballocator: dinode + group descriptor +
  * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 19ac421b613b..425551737f1f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,7 +231,19 @@ struct ocfs2_space_resv {
 #define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
 #define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
 
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+	__u64 group;		/* Group descriptor's blkno. */
+	__u32 clusters;		/* Total number of clusters in this group */
+	__u32 frees;		/* Total free clusters in this group */
+	__u16 chain;		/* Chain for this group */
+	__u16 reserved1;
+	__u32 reserved2;
+};
+
 #define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
 
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 848f7293f4fc..7791309bb258 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -356,7 +356,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	}
 
 	mlog(0, "extend the last group at %llu, new clusters = %d\n",
-	     le64_to_cpu(group->bg_blkno), new_clusters);
+	     (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
 
 	handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
 	if (IS_ERR(handle)) {
@@ -396,3 +396,246 @@ out:
 	mlog_exit_void();
 	return ret;
 }
+
+static int ocfs2_check_new_group(struct inode *inode,
+				 struct ocfs2_dinode *di,
+				 struct ocfs2_new_group_input *input,
+				 struct buffer_head *group_bh)
+{
+	int ret;
+	struct ocfs2_group_desc *gd;
+	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
+	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
+				le16_to_cpu(di->id2.i_chain.cl_bpc);
+
+
+	gd = (struct ocfs2_group_desc *)group_bh->b_data;
+
+	ret = -EIO;
+	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
+		mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno));
+	else if (di->i_blkno != gd->bg_parent_dinode)
+		mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
+		     "pointer (%llu, expected %llu)\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+		     (unsigned long long)le64_to_cpu(di->i_blkno));
+	else if (le16_to_cpu(gd->bg_bits) > max_bits)
+		mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits));
+	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
+		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+		     "claims that %u are free\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits),
+		     le16_to_cpu(gd->bg_free_bits_count));
+	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
+		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+		     "max bitmap bits of %u\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits),
+		     8 * le16_to_cpu(gd->bg_size));
+	else if (le16_to_cpu(gd->bg_chain) != input->chain)
+		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
+		     "while input has %u set.\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_chain), input->chain);
+	else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
+		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+		     "input has %u clusters set\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits), input->clusters);
+	else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
+		mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
+		     "but it should have %u set\n",
+		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		     le16_to_cpu(gd->bg_bits),
+		     input->frees * cl_bpc);
+	else
+		ret = 0;
+
+	return ret;
+}
+
+static int ocfs2_verify_group_and_input(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_new_group_input *input,
+					struct buffer_head *group_bh)
+{
+	u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
+	u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+	u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
+	u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
+	u32 total_clusters = le32_to_cpu(di->i_clusters);
+	int ret = -EINVAL;
+
+	if (cluster < total_clusters)
+		mlog(ML_ERROR, "add a group which is in the current volume.\n");
+	else if (input->chain >= cl_count)
+		mlog(ML_ERROR, "input chain exceeds the limit.\n");
+	else if (next_free != cl_count && next_free != input->chain)
+		mlog(ML_ERROR,
+		     "the add group should be in chain %u\n", next_free);
+	else if (total_clusters + input->clusters < total_clusters)
+		mlog(ML_ERROR, "add group's clusters overflow.\n");
+	else if (input->clusters > cl_cpg)
+		mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
+	else if (input->frees > input->clusters)
+		mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
+	else if (total_clusters % cl_cpg != 0)
+		mlog(ML_ERROR,
+		     "the last group isn't full. Use group extend first.\n");
+	else if (input->group != ocfs2_which_cluster_group(inode, cluster))
+		mlog(ML_ERROR, "group blkno is invalid\n");
+	else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
+		mlog(ML_ERROR, "group descriptor check failed.\n");
+	else
+		ret = 0;
+
+	return ret;
+}
+
+/* Add a new group descriptor to global_bitmap. */
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
+{
+	int ret;
+	handle_t *handle;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group = NULL;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_chain_rec *cr;
+	u16 cl_bpc;
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&main_bm_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+				 ocfs2_group_bitmap_size(osb->sb) * 8) {
+		mlog(ML_ERROR, "The disk is too old and small."
+		     " Force to do offline resize.");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+	if (ret < 0) {
+		mlog(ML_ERROR, "Can't read the group descriptor # %llu "
+		     "from the device.", (unsigned long long)input->group);
+		goto out_unlock;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, group_bh);
+
+	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	mlog(0, "Add a new group  %llu in chain = %u, length = %u\n",
+	     (unsigned long long)input->group, input->chain, input->clusters);
+
+	handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+	cl = &fe->id2.i_chain;
+	cr = &cl->cl_recs[input->chain];
+
+	ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	group = (struct ocfs2_group_desc *)group_bh->b_data;
+	group->bg_next_group = cr->c_blkno;
+
+	ret = ocfs2_journal_dirty(handle, group_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+		memset(cr, 0, sizeof(struct ocfs2_chain_rec));
+	}
+
+	cr->c_blkno = le64_to_cpu(input->group);
+	le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
+	le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
+	le32_add_cpu(&fe->id1.bitmap1.i_used,
+		     (input->clusters - input->frees) * cl_bpc);
+	le32_add_cpu(&fe->i_clusters, input->clusters);
+
+	ocfs2_journal_dirty(handle, main_bm_bh);
+
+	spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
+	OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
+	spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
+	i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
+
+	ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	if (group_bh)
+		brelse(group_bh);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+	mutex_unlock(&main_bm_inode->i_mutex);
+	iput(main_bm_inode);
+
+out:
+	mlog_exit_void();
+	return ret;
+}
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
index 3acb79af451b..f38841abf10b 100644
--- a/fs/ocfs2/resize.h
+++ b/fs/ocfs2/resize.h
@@ -27,5 +27,6 @@
 #define OCFS2_RESIZE_H
 
 int ocfs2_group_extend(struct inode * inode, int new_clusters);
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
 
 #endif /* OCFS2_RESIZE_H */
-- 
cgit v1.2.3


From 0957f00796157564281ea6ff2cea7ef4f897775a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 18 Dec 2007 18:58:18 -0800
Subject: ocfs2: Add missing permission checks

Check that an online resize is being driven by a user with permission to
change system resource limits.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7003d5820d79..5177fba5162b 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -144,12 +144,18 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 
 		return ocfs2_change_file_space(filp, cmd, &sr);
 	case OCFS2_IOC_GROUP_EXTEND:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
 		if (get_user(new_clusters, (int __user *)arg))
 			return -EFAULT;
 
 		return ocfs2_group_extend(inode, new_clusters);
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
 		if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
 			return -EFAULT;
 
-- 
cgit v1.2.3


From d147b3d630edef1d34de6ea819787a1ac1b8603b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 7 Nov 2007 14:40:36 -0800
Subject: ocfs2: Support commit= mount option

Mostly taken from ext3. This allows the user to set the jbd commit interval,
in seconds. The default of 5 seconds stays the same, but now users can
easily increase the commit interval. Typically, this would be increased in
order to benefit performance at the expense of data-safety.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c |  8 ++++++--
 fs/ocfs2/ocfs2.h   |  1 +
 fs/ocfs2/super.c   | 23 +++++++++++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f440a88bf53..8b9ce2a729ab 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -313,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
 	return err;
 }
 
-#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * 5)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 {
 	journal_t *journal = osb->journal->j_journal;
+	unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+
+	if (osb->osb_commit_interval)
+		commit_interval = osb->osb_commit_interval;
 
 	spin_lock(&journal->j_state_lock);
-	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+	journal->j_commit_interval = commit_interval;
 	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
 		journal->j_flags |= JFS_BARRIER;
 	else
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f8f866144c6a..82802f5672a1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -229,6 +229,7 @@ struct ocfs2_super
 	wait_queue_head_t checkpoint_event;
 	atomic_t needs_checkpoint;
 	struct ocfs2_journal *journal;
+	unsigned long osb_commit_interval;
 
 	enum ocfs2_local_alloc_state local_alloc_state;
 	struct buffer_head *local_alloc_bh;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 479ac50c86d9..8044ed97d362 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -83,6 +83,7 @@ MODULE_LICENSE("GPL");
 
 struct mount_options
 {
+	unsigned long	commit_interval;
 	unsigned long	mount_opt;
 	unsigned int	atime_quantum;
 	signed short	slot;
@@ -149,6 +150,7 @@ enum {
 	Opt_data_writeback,
 	Opt_atime_quantum,
 	Opt_slot,
+	Opt_commit,
 	Opt_err,
 };
 
@@ -164,6 +166,7 @@ static match_table_t tokens = {
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_atime_quantum, "atime_quantum=%u"},
 	{Opt_slot, "preferred_slot=%u"},
+	{Opt_commit, "commit=%u"},
 	{Opt_err, NULL}
 };
 
@@ -442,6 +445,8 @@ unlock_osb:
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
+		if (parsed_options.commit_interval)
+			osb->osb_commit_interval = parsed_options.commit_interval;
 
 		if (!ocfs2_is_hard_readonly(osb))
 			ocfs2_set_journal_params(osb);
@@ -596,6 +601,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_mount_opt = parsed_options.mount_opt;
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
+	osb->osb_commit_interval = parsed_options.commit_interval;
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
@@ -746,6 +752,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
 		   options ? options : "(none)");
 
+	mopt->commit_interval = 0;
 	mopt->mount_opt = 0;
 	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 	mopt->slot = OCFS2_INVALID_SLOT;
@@ -815,6 +822,18 @@ static int ocfs2_parse_options(struct super_block *sb,
 			if (option)
 				mopt->slot = (s16)option;
 			break;
+		case Opt_commit:
+			option = 0;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = JBD_DEFAULT_MAX_COMMIT_AGE;
+			mopt->commit_interval = HZ * option;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -863,6 +882,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
 		seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
 
+	if (osb->osb_commit_interval)
+		seq_printf(s, ",commit=%u",
+			   (unsigned) (osb->osb_commit_interval / HZ));
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2fbe8d1ebe004425b4f7b8bba345623d2280be82 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 20 Dec 2007 14:58:11 -0800
Subject: ocfs2: Local alloc window size changeable via mount option

Local alloc is a performance optimization in ocfs2 in which a node
takes a window of bits from the global bitmap and then uses that for
all small local allocations. This window size is fixed to 8MB currently.
This patch allows users to specify the window size in MB, including
disabling it by passing in 0. If the number specified is too large,
the fs will use the default value of 8MB.

mount -o localalloc=X /dev/sdX /mntpoint

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/localalloc.c | 42 ++++++++++++++++++++++++++++++------------
 fs/ocfs2/ocfs2.h      |  1 +
 fs/ocfs2/ocfs2_fs.h   |  8 ++++++++
 fs/ocfs2/suballoc.c   |  5 +++--
 fs/ocfs2/super.c      | 17 +++++++++++++++++
 5 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0de0792fce7f..add1ffdc5c6c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-/*
- * Determine how large our local alloc window should be, in bits.
- *
- * These values (and the behavior in ocfs2_alloc_should_use_local) have
- * been chosen so that most allocations, including new block groups go
- * through local alloc.
- */
 static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
 {
-	BUG_ON(osb->s_clustersize_bits < 12);
+	BUG_ON(osb->s_clustersize_bits > 20);
 
-	return 2048 >> (osb->s_clustersize_bits - 12);
+	/* Size local alloc windows by the megabyte */
+	return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
 }
 
 /*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
 	int la_bits = ocfs2_local_alloc_window_bits(osb);
+	int ret = 0;
 
 	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
-		return 0;
+		goto bail;
 
 	/* la_bits should be at least twice the size (in clusters) of
 	 * a new block group. We want to be sure block group
 	 * allocations go through the local alloc, so allow an
 	 * allocation to take up to half the bitmap. */
 	if (bits > (la_bits / 2))
-		return 0;
+		goto bail;
 
-	return 1;
+	ret = 1;
+bail:
+	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
+	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+	return ret;
 }
 
 int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	if (osb->local_alloc_size == 0)
+		goto bail;
+
+	if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+		mlog(ML_NOTICE, "Requested local alloc window %d is larger "
+		     "than max possible %u. Using defaults.\n",
+		     ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
+		osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+	}
+
 	/* read the alloc off disk */
 	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
 					    osb->slot_num);
@@ -181,6 +193,9 @@ bail:
 	if (inode)
 		iput(inode);
 
+	mlog(0, "Local alloc window bits = %d\n",
+	     ocfs2_local_alloc_window_bits(osb));
+
 	mlog_exit(status);
 	return status;
 }
@@ -521,6 +536,9 @@ bail:
 		iput(local_alloc_inode);
 	}
 
+	mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
+	     status);
+
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 82802f5672a1..d12bd7036da7 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -231,6 +231,7 @@ struct ocfs2_super
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
+	int local_alloc_size;
 	enum ocfs2_local_alloc_state local_alloc_state;
 	struct buffer_head *local_alloc_bh;
 	u64 la_last_gd;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 425551737f1f..3633edd3982f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -270,6 +270,14 @@ struct ocfs2_new_group_input {
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
 
+/*
+ * Default local alloc size (in megabytes)
+ *
+ * The value chosen should be such that most allocations, including new
+ * block groups, use local alloc.
+ */
+#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE	8
+
 struct ocfs2_system_inode_info {
 	char	*si_name;
 	int	si_iflags;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4391744e80f8..7e397e2c25dd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1516,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
 		if (min_clusters > (osb->bitmap_cpg - 1)) {
 			/* The only paths asking for contiguousness
 			 * should know about this already. */
-			mlog(ML_ERROR, "minimum allocation requested exceeds "
-				       "group bitmap size!");
+			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
+			     "group bitmap size %u!\n", min_clusters,
+			     osb->bitmap_cpg);
 			status = -ENOSPC;
 			goto bail;
 		}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8044ed97d362..1104f14c3183 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -87,6 +87,7 @@ struct mount_options
 	unsigned long	mount_opt;
 	unsigned int	atime_quantum;
 	signed short	slot;
+	unsigned int	localalloc_opt;
 };
 
 static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -151,6 +152,7 @@ enum {
 	Opt_atime_quantum,
 	Opt_slot,
 	Opt_commit,
+	Opt_localalloc,
 	Opt_err,
 };
 
@@ -167,6 +169,7 @@ static match_table_t tokens = {
 	{Opt_atime_quantum, "atime_quantum=%u"},
 	{Opt_slot, "preferred_slot=%u"},
 	{Opt_commit, "commit=%u"},
+	{Opt_localalloc, "localalloc=%d"},
 	{Opt_err, NULL}
 };
 
@@ -602,6 +605,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
+	osb->local_alloc_size = parsed_options.localalloc_opt;
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
@@ -756,6 +760,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 	mopt->mount_opt = 0;
 	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 	mopt->slot = OCFS2_INVALID_SLOT;
+	mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
 
 	if (!options) {
 		status = 1;
@@ -834,6 +839,15 @@ static int ocfs2_parse_options(struct super_block *sb,
 				option = JBD_DEFAULT_MAX_COMMIT_AGE;
 			mopt->commit_interval = HZ * option;
 			break;
+		case Opt_localalloc:
+			option = 0;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+				mopt->localalloc_opt = option;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -886,6 +900,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",commit=%u",
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
+	if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+		seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From cf8e06f1a860d8680d6bb4ac8ec7d7724988e46f Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 20 Dec 2007 16:43:10 -0800
Subject: [PATCH 1/2] ocfs2: add flock lock type

This adds a new dlmglue lock type which is intended to back flock()
requests.

Since these locks are driven from userspace, usage rules are much more
liberal than the typical Ocfs2 internal cluster lock. As a result, we can't
make use of most dlmglue features - lock caching and lock level
optimizations in particular. Additionally, userspace is free to deadlock
itself, so we have to deal with that in the same way as the rest of the
kernel - by allowing a signal to abort a lock request.

In order to keep ocfs2_cluster_lock() complexity down, ocfs2_file_lock()
does its own dlm coordination. We still use the same helper functions
though, so duplicated code is kept to a minimum.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlmglue.c      | 267 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmglue.h      |   5 +
 fs/ocfs2/file.h         |   6 ++
 fs/ocfs2/ocfs2.h        |   1 +
 fs/ocfs2/ocfs2_lockid.h |   5 +
 5 files changed, 284 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index fa5e3bdc295d..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -68,6 +68,7 @@ struct ocfs2_mask_waiter {
 
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
 
 /*
  * Return value from ->downconvert_worker functions.
@@ -252,6 +253,11 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+	.get_osb	= ocfs2_get_file_osb,
+	.flags		= 0,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -310,6 +316,17 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 				  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
 
 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 				  u64 blkno,
@@ -419,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
 	return OCFS2_SB(inode->i_sb);
 }
 
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_file_private *fp = lockres->l_priv;
+
+	return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
+}
+
 static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
 {
 	__be64 inode_blkno_be;
@@ -499,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
 				   &ocfs2_rename_lops, osb);
 }
 
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+			      struct ocfs2_file_private *fp)
+{
+	struct inode *inode = fp->fp_file->f_mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+			      inode->i_generation, lockres->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+				   OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+				   fp);
+	lockres->l_flags |= OCFS2_LOCK_NOCACHE;
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -715,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 	     lockres->l_name, level, lockres->l_level,
 	     ocfs2_lock_type_string(lockres->l_type));
 
+	/*
+	 * We can skip the bast for locks which don't enable caching -
+	 * they'll be dropped at the earliest possible time anyway.
+	 */
+	if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+		return;
+
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
 	if (needs_downconvert)
@@ -926,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
 
 }
 
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+					     struct ocfs2_lock_res *lockres)
+{
+	int ret;
+
+	ret = wait_for_completion_interruptible(&mw->mw_complete);
+	if (ret)
+		lockres_remove_mask_waiter(lockres, mw);
+	else
+		ret = mw->mw_status;
+	/* Re-arm the completion in case we want to wait on it again */
+	INIT_COMPLETION(mw->mw_complete);
+	return ret;
+}
+
 static int ocfs2_cluster_lock(struct ocfs2_super *osb,
 			      struct ocfs2_lock_res *lockres,
 			      int level,
@@ -1296,6 +1357,212 @@ out:
 	mlog_exit_void();
 }
 
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int ret;
+	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+	unsigned long flags;
+	struct ocfs2_mask_waiter mw;
+
+	ocfs2_init_mask_waiter(&mw);
+
+retry_cancel:
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		if (ret) {
+			spin_unlock_irqrestore(&lockres->l_lock, flags);
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+			goto retry_cancel;
+		}
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ocfs2_wait_for_mask(&mw);
+		goto retry_cancel;
+	}
+
+	ret = -ERESTARTSYS;
+	/*
+	 * We may still have gotten the lock, in which case there's no
+	 * point to restarting the syscall.
+	 */
+	if (lockres->l_level == level)
+		ret = 0;
+
+	mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+	     lockres->l_flags, lockres->l_level, lockres->l_action);
+
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take at exactly
+ *   what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ *   no-lock at unlock time. This also means flock locks never go on
+ *   the blocking list.
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ *   sure to allow cancellation of a misbehaving application's flock()
+ *   request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ *   can simplify the code by requiring the caller to guarantee
+ *   serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
+{
+	int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
+	unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+	unsigned long flags;
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_lock_res *lockres = &fp->fp_flock;
+	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+	struct ocfs2_mask_waiter mw;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+	    (lockres->l_level > LKM_NLMODE)) {
+		mlog(ML_ERROR,
+		     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+		     "level: %u\n", lockres->l_name, lockres->l_flags,
+		     lockres->l_level);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/*
+		 * Get the lock at NLMODE to start - that way we
+		 * can cancel the upconvert request if need be.
+		 */
+		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	lockres->l_action = OCFS2_AST_CONVERT;
+	lkm_flags |= LKM_CONVERT;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+
+	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+		      lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+		      ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+	if (ret != DLM_NORMAL) {
+		if (trylock && ret == DLM_NOTQUEUED)
+			ret = -EAGAIN;
+		else {
+			ocfs2_log_dlm_error("dlmlock", ret, lockres);
+			ret = -EINVAL;
+		}
+
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		lockres_remove_mask_waiter(lockres, &mw);
+		goto out;
+	}
+
+	ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+	if (ret == -ERESTARTSYS) {
+		/*
+		 * Userspace can deadlock itself with
+		 * flock(). Current behavior locally is to allow the
+		 * deadlock, but abort the system call if a signal is
+		 * received. We follow this example, otherwise a
+		 * poorly written program could sit in kernel until
+		 * reboot.
+		 *
+		 * Handling this is a bit more complicated for Ocfs2
+		 * though. We can't exit this function with an
+		 * outstanding lock request, so a cancel convert is
+		 * required. We intentionally overwrite 'ret' - if the
+		 * cancel fails and the lock was granted, it's easier
+		 * to just bubble success back up to the user.
+		 */
+		ret = ocfs2_flock_handle_signal(lockres, level);
+	}
+
+out:
+
+	mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+	     lockres->l_name, ex, trylock, ret);
+	return ret;
+}
+
+void ocfs2_file_unlock(struct file *file)
+{
+	int ret;
+	unsigned long flags;
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_lock_res *lockres = &fp->fp_flock;
+	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+	struct ocfs2_mask_waiter mw;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
+		return;
+
+	if (lockres->l_level == LKM_NLMODE)
+		return;
+
+	mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+	     lockres->l_name, lockres->l_flags, lockres->l_level,
+	     lockres->l_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/*
+	 * Fake a blocking ast for the downconvert code.
+	 */
+	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+	lockres->l_blocking = LKM_EXMODE;
+
+	ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+	if (ret) {
+		mlog_errno(ret);
+		return;
+	}
+
+	ret = ocfs2_wait_for_mask(&mw);
+	if (ret)
+		mlog_errno(ret);
+}
+
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres)
 {
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 6dcbc944e8ce..5f17243ba501 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -66,6 +66,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
 			       struct inode *inode);
 void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 				u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+			      struct ocfs2_file_private *fp);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -98,6 +101,8 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a8..048ddcaf5c80 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
 
+struct ocfs2_file_private {
+	struct file		*fp_file;
+	struct mutex		fp_mutex;
+	struct ocfs2_lock_res	fp_flock;
+};
+
 enum ocfs2_alloc_restarted {
 	RESTART_NONE = 0,
 	RESTART_TRANS,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d12bd7036da7..63c131e1cc77 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
 					       * about to be
 					       * dropped. */
 #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
 
 struct ocfs2_lock_res_ops;
 
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38ac..86f3e3799c2b 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_RW,
 	OCFS2_LOCK_TYPE_DENTRY,
 	OCFS2_LOCK_TYPE_OPEN,
+	OCFS2_LOCK_TYPE_FLOCK,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_OPEN:
 			c = 'O';
 			break;
+		case OCFS2_LOCK_TYPE_FLOCK:
+			c = 'F';
+			break;
 		default:
 			c = '\0';
 	}
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
+	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
-- 
cgit v1.2.3


From 53fc622b9e829c8e632e45ef8c14f054388759c1 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 20 Dec 2007 16:49:04 -0800
Subject: [PATCH 2/2] ocfs2: cluster aware flock()

Hook up ocfs2_flock(), using the new flock lock type in dlmglue.c. A new
mount option, "localflocks" is added so that users can revert to old
functionality as need be.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/Makefile |   1 +
 fs/ocfs2/file.c   |  60 +++++++++++++++++++++++++-
 fs/ocfs2/locks.c  | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/locks.h  |  31 ++++++++++++++
 fs/ocfs2/ocfs2.h  |   1 +
 fs/ocfs2/super.c  |  19 +++++++++
 6 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 fs/ocfs2/locks.c
 create mode 100644 fs/ocfs2/locks.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 3591890b32c6..4d4ce48bb42c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,6 +19,7 @@ ocfs2-objs := \
 	ioctl.o 		\
 	journal.o 		\
 	localalloc.o 		\
+	locks.o			\
 	mmap.o 			\
 	namei.o 		\
 	resize.o		\
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 432e5f3c4784..caefd571782e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
 #include "inode.h"
 #include "ioctl.h"
 #include "journal.h"
+#include "locks.h"
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
 	return sync_mapping_buffers(inode->i_mapping);
 }
 
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
+{
+	struct ocfs2_file_private *fp;
+
+	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+	if (!fp)
+		return -ENOMEM;
+
+	fp->fp_file = file;
+	mutex_init(&fp->fp_mutex);
+	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+	file->private_data = fp;
+
+	return 0;
+}
+
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (fp) {
+		ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+		ocfs2_lock_res_free(&fp->fp_flock);
+		kfree(fp);
+		file->private_data = NULL;
+	}
+}
+
 static int ocfs2_file_open(struct inode *inode, struct file *file)
 {
 	int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 
 	oi->ip_open_count++;
 	spin_unlock(&oi->ip_lock);
-	status = 0;
+
+	status = ocfs2_init_file_private(inode, file);
+	if (status) {
+		/*
+		 * We want to set open count back if we're failing the
+		 * open.
+		 */
+		spin_lock(&oi->ip_lock);
+		oi->ip_open_count--;
+		spin_unlock(&oi->ip_lock);
+	}
+
 leave:
 	mlog_exit(status);
 	return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 	spin_unlock(&oi->ip_lock);
 
+	ocfs2_free_file_private(inode, file);
+
 	mlog_exit(0);
 
 	return 0;
 }
 
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+	return ocfs2_init_file_private(inode, file);
+}
+
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+	ocfs2_free_file_private(inode, file);
+	return 0;
+}
+
 static int ocfs2_sync_file(struct file *file,
 			   struct dentry *dentry,
 			   int datasync)
@@ -2191,6 +2245,7 @@ const struct file_operations ocfs2_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.flock		= ocfs2_flock,
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
 };
@@ -2199,8 +2254,11 @@ const struct file_operations ocfs2_dops = {
 	.read		= generic_read_dir,
 	.readdir	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_dir_release,
+	.open		= ocfs2_dir_open,
 	.ioctl		= ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.flock		= ocfs2_flock,
 };
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 000000000000..203f87143877
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.c
+ *
+ * Userspace file locking support
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "locks.h"
+
+static int ocfs2_do_flock(struct file *file, struct inode *inode,
+			  int cmd, struct file_lock *fl)
+{
+	int ret = 0, level = 0, trylock = 0;
+	struct ocfs2_file_private *fp = file->private_data;
+	struct ocfs2_lock_res *lockres = &fp->fp_flock;
+
+	if (fl->fl_type == F_WRLCK)
+		level = 1;
+	if (!IS_SETLKW(cmd))
+		trylock = 1;
+
+	mutex_lock(&fp->fp_mutex);
+
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level > LKM_NLMODE) {
+		int old_level = 0;
+
+		if (lockres->l_level == LKM_EXMODE)
+			old_level = 1;
+
+		if (level == old_level)
+			goto out;
+
+		/*
+		 * Converting an existing lock is not guaranteed to be
+		 * atomic, so we can get away with simply unlocking
+		 * here and allowing the lock code to try at the new
+		 * level.
+		 */
+
+		flock_lock_file_wait(file,
+				     &(struct file_lock){.fl_type = F_UNLCK});
+
+		ocfs2_file_unlock(file);
+	}
+
+	ret = ocfs2_file_lock(file, level, trylock);
+	if (ret) {
+		if (ret == -EAGAIN && trylock)
+			ret = -EWOULDBLOCK;
+		else
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = flock_lock_file_wait(file, fl);
+
+out:
+	mutex_unlock(&fp->fp_mutex);
+
+	return ret;
+}
+
+static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
+{
+	int ret;
+	struct ocfs2_file_private *fp = file->private_data;
+
+	mutex_lock(&fp->fp_mutex);
+	ocfs2_file_unlock(file);
+	ret = flock_lock_file_wait(file, fl);
+	mutex_unlock(&fp->fp_mutex);
+
+	return ret;
+}
+
+/*
+ * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
+ */
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+	if (__mandatory_lock(inode))
+		return -ENOLCK;
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+	    ocfs2_mount_local(osb))
+		return flock_lock_file_wait(file, fl);
+
+	if (fl->fl_type == F_UNLCK)
+		return ocfs2_do_funlock(file, cmd, fl);
+	else
+		return ocfs2_do_flock(file, inode, cmd, fl);
+}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
new file mode 100644
index 000000000000..9743ef2324ec
--- /dev/null
+++ b/fs/ocfs2/locks.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.h
+ *
+ * Function prototypes for Userspace file locking support
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKS_H
+#define OCFS2_LOCKS_H
+
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+
+#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 63c131e1cc77..22e334d125d0 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -171,6 +171,7 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 1104f14c3183..4a091f586646 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -153,6 +153,7 @@ enum {
 	Opt_slot,
 	Opt_commit,
 	Opt_localalloc,
+	Opt_localflocks,
 	Opt_err,
 };
 
@@ -170,6 +171,7 @@ static match_table_t tokens = {
 	{Opt_slot, "preferred_slot=%u"},
 	{Opt_commit, "commit=%u"},
 	{Opt_localalloc, "localalloc=%d"},
+	{Opt_localflocks, "localflocks"},
 	{Opt_err, NULL}
 };
 
@@ -848,6 +850,20 @@ static int ocfs2_parse_options(struct super_block *sb,
 			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
 				mopt->localalloc_opt = option;
 			break;
+		case Opt_localflocks:
+			/*
+			 * Changing this during remount could race
+			 * flock() requests, or "unbalance" existing
+			 * ones (e.g., a lock is taken in one mode but
+			 * dropped in the other). If users care enough
+			 * to flip locking modes during remount, we
+			 * could add a "local" flag to individual
+			 * flock structures for proper tracking of
+			 * state.
+			 */
+			if (!is_remount)
+				mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -903,6 +919,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
 		seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
 
+	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+		seq_printf(s, ",localflocks,");
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5fa0613ea58a80f69852b242337121bd39dc798e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 11 Jan 2008 00:11:45 +0100
Subject: ocfs2: Silence false lockdep warnings

Create separate lockdep lock classes for system file's i_mutexes. They are
used to guard allocations and similar things and thus rank differently
than i_mutex of a regular file or directory.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/export.c  |  4 ++--
 fs/ocfs2/inode.c   | 10 +++++++++-
 fs/ocfs2/inode.h   |  7 ++++---
 fs/ocfs2/journal.c |  2 +-
 fs/ocfs2/namei.c   |  2 +-
 fs/ocfs2/super.c   |  4 ++--
 fs/ocfs2/sysfile.c |  2 +-
 7 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 1f9e353cac45..67527cebf214 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
 
 	if (IS_ERR(inode))
 		return (void *)inode;
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 		goto bail_unlock;
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
 	if (IS_ERR(inode)) {
 		mlog(ML_ERROR, "Unable to create inode %llu\n",
 		     (unsigned long long)blkno);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 00cd5b7f3e52..5e19c119183d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -57,8 +57,11 @@ struct ocfs2_find_inode_args
 	u64		fi_blkno;
 	unsigned long	fi_ino;
 	unsigned int	fi_flags;
+	unsigned int	fi_sysfile_type;
 };
 
+static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
+
 static int ocfs2_read_locked_inode(struct inode *inode,
 				   struct ocfs2_find_inode_args *args);
 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -106,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
 		oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
 
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
+			 int sysfile_type)
 {
 	struct inode *inode = NULL;
 	struct super_block *sb = osb->sb;
@@ -126,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 	args.fi_blkno = blkno;
 	args.fi_flags = flags;
 	args.fi_ino = ino_from_blkno(sb, blkno);
+	args.fi_sysfile_type = sysfile_type;
 
 	inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
 			     ocfs2_init_locked_inode, &args);
@@ -200,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+	if (args->fi_sysfile_type != 0)
+		lockdep_set_class(&inode->i_mutex,
+			&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
 
 	mlog_exit(0);
 	return 0;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a61c044eb7da..390a85596aa0 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -120,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
 
 /* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_SYSFILE		0x4
-#define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x8
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
+#define OCFS2_FI_FLAG_SYSFILE		0x1
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
+			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8b9ce2a729ab..f31c7e8c19c3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1244,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
 
 	/* Skip bad inodes so that recovery can continue */
 	iter = ocfs2_iget(p->osb, ino,
-			  OCFS2_FI_FLAG_ORPHAN_RECOVERY);
+			  OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
 	if (IS_ERR(iter))
 		return 0;
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 74018caf8053..ae9ad9587516 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -128,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	if (status < 0)
 		goto bail_add;
 
-	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
 	if (IS_ERR(inode)) {
 		ret = ERR_PTR(-EACCES);
 		goto bail_unlock;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4a091f586646..01fe40ee5ea9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -220,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
+	new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
 	if (IS_ERR(new)) {
 		status = PTR_ERR(new);
 		mlog_errno(status);
@@ -228,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 	}
 	osb->root_inode = new;
 
-	new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
+	new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
 	if (IS_ERR(new)) {
 		status = PTR_ERR(new);
 		mlog_errno(status);
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6f..ab713ebdd546 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
+	inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
 	if (IS_ERR(inode)) {
 		mlog_errno(PTR_ERR(inode));
 		inode = NULL;
-- 
cgit v1.2.3


From d2849fb294d92d6eee0a811c688f1ecb39d26800 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 19 Dec 2007 15:24:09 +0100
Subject: ocfs2: Safer read_inline_data()

In ocfs2_read_inline_data() we should store file size in loff_t. Although
the file size should fit in 32 bits we cannot be sure in case filesystem is
corrupted.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 286af3a11383..bc7b4cbbe8ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -217,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 			   struct buffer_head *di_bh)
 {
 	void *kaddr;
-	unsigned int size;
+	loff_t size;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -231,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 	if (size > PAGE_CACHE_SIZE ||
 	    size > ocfs2_max_inline_data(inode->i_sb)) {
 		ocfs2_error(inode->i_sb,
-			    "Inode %llu has with inline data has bad size: %u",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
+			    "Inode %llu has with inline data has bad size: %Lu",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)size);
 		return -EROFS;
 	}
 
-- 
cgit v1.2.3


From 32c3c0e2e515197ad240f5104116254975e6bbce Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 19 Dec 2007 15:24:52 +0100
Subject: ocfs2: Use generic_file_llseek

We should use generic_file_llseek() and not default_llseek() so that
s_maxbytes gets properly checked when seeking.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index caefd571782e..6ebc9f9ec52c 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2233,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
 };
 
 const struct file_operations ocfs2_fops = {
+	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.mmap		= ocfs2_mmap,
@@ -2251,6 +2252,7 @@ const struct file_operations ocfs2_fops = {
 };
 
 const struct file_operations ocfs2_dops = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
-- 
cgit v1.2.3


From 634bf74d1e8a8d06727505ea4eb73e780d7aa246 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 19 Dec 2007 15:25:42 +0100
Subject: ocfs2: printf fixes

Explicitly convert loff_t to long long in printf, just to be sure.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6ebc9f9ec52c..ed5d5232e85d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -626,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
 	     "clusters_to_add = %u, extents_to_split = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
 	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -807,7 +807,7 @@ restarted_transaction:
 	     le32_to_cpu(fe->i_clusters),
 	     (unsigned long long)le64_to_cpu(fe->i_size));
 	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 
 leave:
 	if (handle) {
-- 
cgit v1.2.3


From 17104683d262fc6ab58488c4a3f0415012acc636 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 6 Nov 2007 16:10:23 -0800
Subject: ocfs2: Update default cluster timeouts

Lots of people are having trouble with the default timeouts, which are too
low. These new values are derived from an informal survey taken on
ocfs2-users, as well as data from bug reports. This should reduce the amount
of cluster disconnects and subsequent fencing seen during normal workloads.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.h | 2 +-
 fs/ocfs2/cluster/tcp.h       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecdb..e511339886b3 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
 #define O2HB_LIVE_THRESHOLD	   2
 /* number of equal samples to be seen as dead */
 extern unsigned int o2hb_dead_threshold;
-#define O2HB_DEFAULT_DEAD_THRESHOLD	   7
+#define O2HB_DEFAULT_DEAD_THRESHOLD	   31
 /* Otherwise MAX_WRITE_TIMEOUT will be zero... */
 #define O2HB_MIN_DEAD_THRESHOLD	  2
 #define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f0..f36f66aab3dd 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
 /* same as hb delay, we're waiting for another node to recognize our hb */
 #define O2NET_RECONNECT_DELAY_MS_DEFAULT	2000
 
-#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	5000
-#define O2NET_IDLE_TIMEOUT_MS_DEFAULT		10000
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	2000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT		30000
 
 
 /* TODO: figure this out.... */
-- 
cgit v1.2.3


From 4092d49f705aa19750c39758fa1be767e162c48d Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Tue, 25 Dec 2007 15:52:59 +0100
Subject: ocfs2: convert byte order of constant instead of variable

Convert the byte order of the constant instead of the variable, so that the
conversion is done at compile time rather than at run time. Remove the now
unused le32_and_cpu().

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/endian.h | 5 -----
 fs/ocfs2/inode.c  | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af16..1942e09f6ee5 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
 	*var = cpu_to_le64(le64_to_cpu(*var) + val);
 }
 
-static inline void le32_and_cpu(__le32 *var, u32 val)
-{
-	*var = cpu_to_le32(le32_to_cpu(*var) & val);
-}
-
 static inline void be32_add_cpu(__be32 *var, u32 val)
 {
 	*var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 5e19c119183d..7e9e4c79aec7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -620,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-	le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+	di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
 
 	status = ocfs2_journal_dirty(handle, di_bh);
 	if (status < 0) {
-- 
cgit v1.2.3


From 2d4b1cbb44f5557727c35895a83f82d023573fa9 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 10 Jan 2008 15:20:55 +0800
Subject: ocfs2/dlm: Clear joining_node on heartbeat node down

Currently the dlm join process consists of 2 steps: query join and assert join.
After query join, the joined node sets its joining_node. So if the joining
node happens to panic before the 2nd step, the joined node will fail to clear
its joining_node flag because that node isn't in the domain map. This causes
at least 2 problems:
1. All new join requests will fail, so no new node can mount the volume.
2. The joined node can't umount the volume, since during the umount process it
   has to wait for joining_node to become unknown. So the umount will hang.

The solution is to clear the joining_node before we check the domain map.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b10f3e313fbf..91f747b8a538 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 		}
 	}
 
+	/* Clean up join state on node death. */
+	if (dlm->joining_node == idx) {
+		mlog(0, "Clearing join state for node %u\n", idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	}
+
 	/* check to see if the node is already considered dead */
 	if (!test_bit(idx, dlm->live_nodes_map)) {
 		mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 
 	clear_bit(idx, dlm->live_nodes_map);
 
-	/* Clean up join state on node death. */
-	if (dlm->joining_node == idx) {
-		mlog(0, "Clearing join state for node %u\n", idx);
-		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
-	}
-
 	/* make sure local cleanup occurs before the heartbeat events */
 	if (!test_bit(idx, dlm->recovery_map))
 		dlm_do_local_recovery_cleanup(dlm, idx);
-- 
cgit v1.2.3


From 0e5ae032030387bf0926aa980f2105646ead2b47 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 6 Nov 2007 15:52:58 -0800
Subject: ocfs2: bump version number

Bump the printed version to 1.5.0. This helps us quickly identify which
version of Ocfs2 a bug filer is running.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/ver.c  | 2 +-
 fs/ocfs2/dlm/dlmfsver.c | 2 +-
 fs/ocfs2/dlm/dlmver.c   | 2 +-
 fs/ocfs2/ver.c          | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30d..a56eee6abad3 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
 
 #include "ver.h"
 
-#define CLUSTER_BUILD_VERSION "1.3.3"
+#define CLUSTER_BUILD_VERSION "1.5.0"
 
 #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
 
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f9..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
 
 #include "dlmfsver.h"
 
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
 
 #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
 
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f41..dfc0da4d158d 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
 
 #include "dlmver.h"
 
-#define DLM_BUILD_VERSION "1.3.3"
+#define DLM_BUILD_VERSION "1.5.0"
 
 #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
 
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c99..e2488f4128a2 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
 
 #include "ver.h"
 
-#define OCFS2_BUILD_VERSION "1.3.3"
+#define OCFS2_BUILD_VERSION "1.5.0"
 
 #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
 
-- 
cgit v1.2.3


From 02ac0499c0e3c62f2e2bf61a13870b36ea103564 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 31 Dec 2007 13:56:47 -0800
Subject: configfs: Remove EXPERIMENTAL

configfs has been alive and kicking for a while now.  It underpins some
non-EXPERIMENTAL subsystems, such as OCFS2's cluster stack.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 16598a417423..ad63dfd6d76d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1022,8 +1022,8 @@ config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
 config CONFIGFS_FS
-	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
-	depends on SYSFS && EXPERIMENTAL
+	tristate "Userspace-driven configuration filesystem"
+	depends on SYSFS
 	help
 	  configfs is a ram-based filesystem that provides the converse
 	  of sysfs's functionality. Where sysfs is a filesystem-based
-- 
cgit v1.2.3


From ba611edfe406be745be95c332990c8e908c026c3 Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwpark81@gmail.com>
Date: Wed, 26 Dec 2007 12:09:57 +0900
Subject: configfs: dir.c fix possible recursive locking

configfs_register_subsystem() with default_groups triggers recursive locking.
It seems that mutex_lock_nested() is needed.

=============================================
[ INFO: possible recursive locking detected ]
2.6.24-rc6 #141
---------------------------------------------
swapper/1 is trying to acquire lock:
 (&sb->s_type->i_mutex_key#3){--..}, at: [<c40ca76f>] configfs_attach_group+0x4f/0x190

but task is already holding lock:
 (&sb->s_type->i_mutex_key#3){--..}, at: [<c40ca9d5>] configfs_register_subsystem+0x55/0x130

other info that might help us debug this:
1 lock held by swapper/1:
 #0:  (&sb->s_type->i_mutex_key#3){--..}, at: [<c40ca9d5>] configfs_register_subsystem+0x55/0x130

stack backtrace:
Pid: 1, comm: swapper Not tainted 2.6.24-rc6 #141
 [<c40053ba>] show_trace_log_lvl+0x1a/0x30
 [<c4005e82>] show_trace+0x12/0x20
 [<c400687e>] dump_stack+0x6e/0x80
 [<c404ec72>] __lock_acquire+0xe62/0x1120
 [<c404efb2>] lock_acquire+0x82/0xa0
 [<c43fdad8>] mutex_lock_nested+0x98/0x2e0
 [<c40ca76f>] configfs_attach_group+0x4f/0x190
 [<c40caa46>] configfs_register_subsystem+0xc6/0x130
 [<c45c8186>] init_netconsole+0x2b6/0x300
 [<c45a75f2>] kernel_init+0x142/0x320
 [<c4004fb3>] kernel_thread_helper+0x7/0x14
 =======================

Signed-off-by: Joonwoo Park <joonwpark81@gmail.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/dir.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 50ed691098bc..a48dc7dd8765 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
 		 * That said, taking our i_mutex is closer to mkdir
 		 * emulation, and shouldn't hurt.
 		 */
-		mutex_lock(&dentry->d_inode->i_mutex);
+		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
 
 		for (i = 0; group->default_groups[i]; i++) {
 			new_group = group->default_groups[i];
@@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 	sd = configfs_sb->s_root->d_fsdata;
 	link_group(to_config_group(sd->s_element), group);
 
-	mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+	mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
+			I_MUTEX_PARENT);
 
 	name.name = group->cg_item.ci_name;
 	name.len = strlen(name.name);
-- 
cgit v1.2.3


From 116ba5d5ea1a5789a8c14b1087014007cada363b Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwpark81@gmail.com>
Date: Wed, 26 Dec 2007 12:09:57 +0900
Subject: configfs: file.c fix possible recursive locking

configfs_register_subsystem() with default_groups triggers recursive locking.
It seems that mutex_lock_nested() is needed.

=============================================
[ INFO: possible recursive locking detected ]
2.6.24-rc6 #145
---------------------------------------------
swapper/1 is trying to acquire lock:
 (&sb->s_type->i_mutex_key#3){--..}, at: [<c40c9a9e>] configfs_add_file+0x2e/0x70

but task is already holding lock:
 (&sb->s_type->i_mutex_key#3){--..}, at: [<c40ca985>] configfs_register_subsystem+0x55/0x130

other info that might help us debug this:
1 lock held by swapper/1:
 #0:  (&sb->s_type->i_mutex_key#3){--..}, at: [<c40ca985>] configfs_register_subsystem+0x55/0x130

stack backtrace:
Pid: 1, comm: swapper Not tainted 2.6.24-rc6 #145
 [<c40053ba>] show_trace_log_lvl+0x1a/0x30
 [<c4005e82>] show_trace+0x12/0x20
 [<c400687e>] dump_stack+0x6e/0x80
 [<c404ec72>] __lock_acquire+0xe62/0x1120
 [<c404efb2>] lock_acquire+0x82/0xa0
 [<c43fda88>] mutex_lock_nested+0x98/0x2e0
 [<c40c9a9e>] configfs_add_file+0x2e/0x70
 [<c40c9b0c>] configfs_create_file+0x2c/0x40
 [<c40ca639>] configfs_attach_item+0x139/0x220
 [<c40ca734>] configfs_attach_group+0x14/0x140
 [<c40ca7e9>] configfs_attach_group+0xc9/0x140
 [<c40ca9f6>] configfs_register_subsystem+0xc6/0x130
 [<c45c8186>] init_netconsole+0x2b6/0x300
 [<c45a75f2>] kernel_init+0x142/0x320
 [<c4004fb3>] kernel_thread_helper+0x7/0x14
 =======================

Signed-off-by: Joonwoo Park <joonwpark81@gmail.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index a3658f9a082c..397cb503a180 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
 	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
 	int error = 0;
 
-	mutex_lock(&dir->d_inode->i_mutex);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
 	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
-- 
cgit v1.2.3


From 7ec373cf33533af6c50828a62f6b305c2d7fa931 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 23 Jan 2008 16:54:48 -0800
Subject: ocfs2: document access rules for blocked_lock_list

ocfs2_super->blocked_lock_list and ocfs2_super->blocked_lock_count have some
usage restrictions which aren't immediately obvious to anyone reading the
code. It's a good idea to document this so that we avoid making costly
mistakes in the future.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/ocfs2.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 22e334d125d0..d08480580470 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -262,6 +262,12 @@ struct ocfs2_super
 	unsigned long dc_wake_sequence;
 	unsigned long dc_work_sequence;
 
+	/*
+	 * Any thread can add locks to the list, but the downconvert
+	 * thread is the only one allowed to remove locks. Any change
+	 * to this rule requires updating
+	 * ocfs2_downconvert_thread_do_work().
+	 */
 	struct list_head blocked_lock_list;
 	unsigned long blocked_lock_count;
 
-- 
cgit v1.2.3


From 2fe5c1d7eb88830b09c863a4b5b3279dc120f3af Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 23 Jan 2008 18:35:31 -0800
Subject: ocfs2: clean up bh null checks

If we know a buffer_head is non-null, then brelse() is unnecessary and
put_bh() can be used instead. Also, an explicit check for NULL is
unnecessary when using brelse(). This patch only covers buffer_head_io.c and
resize.c, which have recently added code which exhibits this problem.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/buffer_head_io.c |  6 +++---
 fs/ocfs2/resize.c         | 17 +++++------------
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 31aa61dc777b..f136639f5b41 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		 * information for this bh as it's not marked locally
 		 * uptodate. */
 		ret = -EIO;
-		brelse(bh);
+		put_bh(bh);
 	}
 
 	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 				 * for this bh as it's not marked locally
 				 * uptodate. */
 				status = -EIO;
-				brelse(bh);
+				put_bh(bh);
 				bhs[i] = NULL;
 				continue;
 			}
@@ -334,7 +334,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 
 	if (!buffer_uptodate(bh)) {
 		ret = -EIO;
-		brelse(bh);
+		put_bh(bh);
 	}
 
 out:
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 7791309bb258..37835ffcb039 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -257,8 +257,7 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
 		ret = update_backups(inode, clusters, super_bh->b_data);
 
 out:
-	if (super_bh)
-		brelse(super_bh);
+	brelse(super_bh);
 	if (ret)
 		printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
 			" during fs resize. This condition is not fatal,"
@@ -380,11 +379,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out_unlock:
-	if (group_bh)
-		brelse(group_bh);
-
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(group_bh);
+	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
@@ -623,11 +619,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out_unlock:
-	if (group_bh)
-		brelse(group_bh);
-
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(group_bh);
+	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
-- 
cgit v1.2.3


From 5d84070ee0a433620c57e85dac7f82faaec5fbb3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 25 Jan 2008 12:44:44 +0100
Subject: __bio_clone: don't calculate hw/phys segment counts

If the user sets a new ->bi_bdev on the bio after __bio_clone() has
returned it, the "segment counts valid" flag still remains set even though
the counts may be different with the new target. So don't calculate segment
counts in __bio_clone().

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index d59ddbf79626..242e409dab4b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,11 +248,13 @@ inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
  */
 void __bio_clone(struct bio *bio, struct bio *bio_src)
 {
-	struct request_queue *q = bdev_get_queue(bio_src->bi_bdev);
-
 	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
 		bio_src->bi_max_vecs * sizeof(struct bio_vec));
 
+	/*
+	 * most users will be overriding ->bi_bdev with a new target,
+	 * so we don't set nor calculate new physical/hw segment counts here
+	 */
 	bio->bi_sector = bio_src->bi_sector;
 	bio->bi_bdev = bio_src->bi_bdev;
 	bio->bi_flags |= 1 << BIO_CLONED;
@@ -260,8 +262,6 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 	bio->bi_vcnt = bio_src->bi_vcnt;
 	bio->bi_size = bio_src->bi_size;
 	bio->bi_idx = bio_src->bi_idx;
-	bio_phys_segments(q, bio);
-	bio_hw_segments(q, bio);
 }
 
 /**
-- 
cgit v1.2.3


From fd0928df98b9578be8a786ac0cb78a47a5e17a20 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Jan 2008 08:52:45 +0100
Subject: ioprio: move io priority from task_struct to io_context

This is where it belongs and then it doesn't take up space for a
process that doesn't do IO.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/ioprio.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index e4e01bc7f338..a7600401ecf7 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -41,18 +41,29 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 		return err;
 
 	task_lock(task);
+	do {
+		ioc = task->io_context;
+		/* see wmb() in current_io_context() */
+		smp_read_barrier_depends();
+		if (ioc)
+			break;
 
-	task->ioprio = ioprio;
-
-	ioc = task->io_context;
-	/* see wmb() in current_io_context() */
-	smp_read_barrier_depends();
+		ioc = alloc_io_context(GFP_ATOMIC, -1);
+		if (!ioc) {
+			err = -ENOMEM;
+			break;
+		}
+		task->io_context = ioc;
+		ioc->task = task;
+	} while (1);
 
-	if (ioc)
+	if (!err) {
+		ioc->ioprio = ioprio;
 		ioc->ioprio_changed = 1;
+	}
 
 	task_unlock(task);
-	return 0;
+	return err;
 }
 
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
@@ -148,7 +159,9 @@ static int get_task_ioprio(struct task_struct *p)
 	ret = security_task_getioprio(p);
 	if (ret)
 		goto out;
-	ret = p->ioprio;
+	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+	if (p->io_context)
+		ret = p->io_context->ioprio;
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From d38ecf935fcb10264a6bc190855d9595165e6eeb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Jan 2008 08:53:35 +0100
Subject: io context sharing: preliminary support

Detach task state from ioc, instead keep track of how many processes
are accessing the ioc.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/ioprio.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index a7600401ecf7..06b5d97c5fdd 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -54,7 +54,6 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 			break;
 		}
 		task->io_context = ioc;
-		ioc->task = task;
 	} while (1);
 
 	if (!err) {
-- 
cgit v1.2.3


From 0871714e08fed7ba66cadad11b2e4f85a9dc9b96 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 28 Jan 2008 11:38:15 +0100
Subject: cfq-iosched: relax IOPRIO_CLASS_IDLE restrictions

Currently you must be root to set idle io prio class on a process. This
is due to the fact that the idle class is implemented as a true idle
class, meaning that it will not make progress if someone else is
requesting disk access. Unfortunately this means that it opens DOS
opportunities by locking down file system resources, hence it is root
only at the moment.

This patch relaxes the idle class a little, by removing the truly idle
part (which entails a grace period with associated timer). The
modifications make the idle class as close to zero impact as can be done
while still guaranteeing progress. This means we can relax the root only
criteria as well.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/ioprio.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 06b5d97c5fdd..c4a1c3c65aac 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -85,8 +85,6 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 
 			break;
 		case IOPRIO_CLASS_IDLE:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
 			break;
 		case IOPRIO_CLASS_NONE:
 			if (data)
-- 
cgit v1.2.3


From 7491a76b23f5100823098b9d5d74ef18a2ca0dc1 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Wed, 2 Jan 2008 13:55:33 +0800
Subject: FS: Remove dead code

Remove dead code in smbfs makefile.

Cc: Al Viro <viro@www.linux.org.uk>
Cc: Tim Shimmin <xfs-masters@oss.sgi.com>
Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 fs/smbfs/Makefile | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'fs')

diff --git a/fs/smbfs/Makefile b/fs/smbfs/Makefile
index 6673ee82cb4c..4faf8c4722c3 100644
--- a/fs/smbfs/Makefile
+++ b/fs/smbfs/Makefile
@@ -16,23 +16,3 @@ EXTRA_CFLAGS += -DSMBFS_PARANOIA
 #EXTRA_CFLAGS += -DDEBUG_SMB_TIMESTAMP
 #EXTRA_CFLAGS += -Werror
 
-#
-# Maintainer rules
-#
-
-# getopt.c not included. It is intentionally separate
-SRC = proc.c dir.c cache.c sock.c inode.c file.c ioctl.c smbiod.c request.c \
-	symlink.c
-
-proto:
-	-rm -f proto.h
-	@echo >  proto2.h "/*"
-	@echo >> proto2.h " *  Autogenerated with cproto on: " `date`
-	@echo >> proto2.h " */"
-	@echo >> proto2.h ""
-	@echo >> proto2.h "struct smb_request;"
-	@echo >> proto2.h "struct sock;"
-	@echo >> proto2.h "struct statfs;"
-	@echo >> proto2.h ""
-	cproto -E "gcc -E" -e -v -I $(TOPDIR)/include -DMAKING_PROTO -D__KERNEL__ $(SRC) >> proto2.h
-	mv proto2.h proto.h
-- 
cgit v1.2.3


From 3ff6eecca4e5c49a5d1dd8b58ea0e20102ce08f0 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Thu, 24 Jan 2008 22:16:20 +0100
Subject: remove __attribute_used__

Remove the deprecated __attribute_used__.

[Introduce __section in a few places to silence checkpatch /sam]

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index da8cb3b3592c..ffdc022cae64 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1376,7 +1376,7 @@ static int do_atm_ioctl(unsigned int fd, unsigned int cmd32, unsigned long arg)
         return -EINVAL;
 }
 
-static __attribute_used__ int 
+static __used int
 ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From bbdfc2f70610bebb841d0874dc901c648308e43a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 6 Nov 2007 23:29:47 -0800
Subject: [SPLICE]: Don't assume regular pages in splice_to_pipe()

Allow caller to pass in a release function, there might be
other resources that need releasing as well. Needed for
network receive.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/splice.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 56b802bfbfa4..0a0b79b01d05 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -254,11 +254,16 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 	}
 
 	while (page_nr < spd_pages)
-		page_cache_release(spd->pages[page_nr++]);
+		spd->spd_release(spd, page_nr++);
 
 	return ret;
 }
 
+static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
+{
+	page_cache_release(spd->pages[i]);
+}
+
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
@@ -277,6 +282,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.partial = partial,
 		.flags = flags,
 		.ops = &page_cache_pipe_buf_ops,
+		.spd_release = spd_release_page,
 	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
@@ -1432,6 +1438,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.partial = partial,
 		.flags = flags,
 		.ops = &user_page_pipe_buf_ops,
+		.spd_release = spd_release_page,
 	};
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
-- 
cgit v1.2.3


From e372c41401993b45c721c4d92730e7e0a79f7c1b Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Mon, 19 Nov 2007 22:31:54 -0800
Subject: [NET]: Consolidate net namespace related proc files creation.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 0afe21ee0607..cfc4f6c072f1 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -22,10 +22,48 @@
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
+#include <linux/seq_file.h>
 
 #include "internal.h"
 
 
+int seq_open_net(struct inode *ino, struct file *f,
+		 const struct seq_operations *ops, int size)
+{
+	struct net *net;
+	struct seq_net_private *p;
+
+	BUG_ON(size < sizeof(*p));
+
+	net = get_proc_net(ino);
+	if (net == NULL)
+		return -ENXIO;
+
+	p = __seq_open_private(f, ops, size);
+	if (p == NULL) {
+		put_net(net);
+		return -ENOMEM;
+	}
+	p->net = net;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seq_open_net);
+
+int seq_release_net(struct inode *ino, struct file *f)
+{
+	struct seq_file *seq;
+	struct seq_net_private *p;
+
+	seq = f->private_data;
+	p = seq->private;
+
+	put_net(p->net);
+	seq_release_private(ino, f);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seq_release_net);
+
+
 struct proc_dir_entry *proc_net_fops_create(struct net *net,
 	const char *name, mode_t mode, const struct file_operations *fops)
 {
-- 
cgit v1.2.3


From e5d69b9f4a6ce17f0d09595da45e37b870fee5ae Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 10 Jan 2008 03:51:41 -0800
Subject: [ATM]: Oops reading net/atm/arp

cat /proc/net/atm/arp causes a NULL pointer dereference in
get_proc_net+0xc/0x3a. This happens because get_proc_net believes that the
parent proc dir entry contains struct net.

Fix this assumption for "net/atm" case.

The problem is introduced by the commit c0097b07abf5f92ab135d024dd41bd2aada1512f
from Eric W. Biederman/Daniel Lezcano.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index cfc4f6c072f1..4823c9677fac 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -96,6 +96,17 @@ static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
 	return task->nsproxy->net_ns->proc_net;
 }
 
+struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
+		struct proc_dir_entry *parent)
+{
+	struct proc_dir_entry *pde;
+	pde = proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
+	if (pde != NULL)
+		pde->data = net;
+	return pde;
+}
+EXPORT_SYMBOL_GPL(proc_net_mkdir);
+
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *root, *netd, *net_statd;
@@ -107,18 +118,16 @@ static __net_init int proc_net_ns_init(struct net *net)
 		goto out;
 
 	err = -EEXIST;
-	netd = proc_mkdir("net", root);
+	netd = proc_net_mkdir(net, "net", root);
 	if (!netd)
 		goto free_root;
 
 	err = -EEXIST;
-	net_statd = proc_mkdir("stat", netd);
+	net_statd = proc_net_mkdir(net, "stat", netd);
 	if (!net_statd)
 		goto free_net;
 
 	root->data = net;
-	netd->data = net;
-	net_statd->data = net;
 
 	net->proc_net_root = root;
 	net->proc_net = netd;
-- 
cgit v1.2.3


From b7c6ba6eb1234e35a74fb8ba8123232a7b1ba9e4 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Mon, 28 Jan 2008 14:41:19 -0800
Subject: [NETNS]: Consolidate kernel netlink socket destruction.

Create a specific helper for netlink kernel socket disposal. This just
makes the code look better and provides a basis for proper disposal
inside a namespace.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Tested-by: Alexey Dobriyan <adobriyan@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/ecryptfs/netlink.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index 9aa345121e09..f638a698dc52 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -237,7 +237,6 @@ out:
  */
 void ecryptfs_release_netlink(void)
 {
-	if (ecryptfs_nl_sock && ecryptfs_nl_sock->sk_socket)
-		sock_release(ecryptfs_nl_sock->sk_socket);
+	netlink_kernel_release(ecryptfs_nl_sock);
 	ecryptfs_nl_sock = NULL;
 }
-- 
cgit v1.2.3


From 6b11d8179d1c6e560edc02c40a53b65fde83bf3f Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Mon, 28 Jan 2008 18:52:04 -0800
Subject: ocfs2: Fix userspace ABI breakage in sysfs

The userspace ABI of ocfs2's internal cluster stack (o2cb) was broken by
commit c60b71787982cefcf9fa09aa281fa8c4c685d557 "kset: convert ocfs2 to
use kset_create".  Specifically, the '/sys/o2cb' kset was moved to
'/sys/fs/o2cb'.  This breaks all ocfs2 tools and renders the
filesystem unmountable.

This fix moves '/sys/o2cb' back where it belongs.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e1..0c095ce7723d 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -64,7 +64,7 @@ int o2cb_sys_init(void)
 {
 	int ret;
 
-	o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
+	o2cb_kset = kset_create_and_add("o2cb", NULL, NULL);
 	if (!o2cb_kset)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From afc7cbca5bfd556c3e12d3acefbee5ab0cbd4670 Mon Sep 17 00:00:00 2001
From: Takashi Sato <sho@tnes.nec.co.jp>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4:  Support large blocksize up to PAGESIZE

This patch set supports large block size(>4k, <=64k) in ext4,
just enlarging the block size limit. But it is NOT possible to have 64kB
blocksize on ext4 without some changes to the directory handling
code.  The reason is that an empty 64kB directory block would have a
rec_len == (__u16)2^16 == 0, and this would cause an error to be hit in
the filesystem.  The proposed solution is to represent a 64k rec_len
with an otherwise impossible value, rec_len = 0xffff, to handle this.

The Patch-set consists of the following 2 patches.
  [1/2]  ext4: enlarge blocksize
         - Allow blocksize up to pagesize

  [2/2]  ext4: fix rec_len overflow
         - prevent rec_len from overflow with 64KB blocksize

Now, on a ppc64 box with 64k pages running this patch set, we can create a
64k block size ext4dev filesystem that is able to handle an empty directory
block.

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1ca0f546c466..ab7010dde1b5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1624,6 +1624,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		goto out_fail;
 	}
 
+	if (!sb_set_blocksize(sb, blocksize)) {
+		printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
+		goto out_fail;
+	}
+
 	/*
 	 * The ext4 superblock will not be buffer aligned for other than 1kB
 	 * block sizes.  We need to calculate the offset from buffer start.
-- 
cgit v1.2.3


From a72d7f834e1afa08421938d7eb06bd8e56b0e58c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Avoid rec_len overflow with 64KB block size

With 64KB blocksize, a directory entry can have size 64KB which does not fit
into the 16 bits we have for entry length. So we store 0xffff instead and
convert the value when it is read from / written to disk. The patch also
converts some places
to use ext4_next_entry() when we are changing them anyway.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/dir.c   | 12 ++++-----
 fs/ext4/namei.c | 77 +++++++++++++++++++++++++++------------------------------
 2 files changed, 43 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f612bef98315..145a9c0c972d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
 			  unsigned long offset)
 {
 	const char * error_msg = NULL;
-	const int rlen = le16_to_cpu(de->rec_len);
+	const int rlen = ext4_rec_len_from_disk(de->rec_len);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -172,10 +172,10 @@ revalidate:
 				 * least that it is non-zero.  A
 				 * failure will be detected in the
 				 * dirent test below. */
-				if (le16_to_cpu(de->rec_len) <
-						EXT4_DIR_REC_LEN(1))
+				if (ext4_rec_len_from_disk(de->rec_len)
+						< EXT4_DIR_REC_LEN(1))
 					break;
-				i += le16_to_cpu(de->rec_len);
+				i += ext4_rec_len_from_disk(de->rec_len);
 			}
 			offset = i;
 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -197,7 +197,7 @@ revalidate:
 				ret = stored;
 				goto out;
 			}
-			offset += le16_to_cpu(de->rec_len);
+			offset += ext4_rec_len_from_disk(de->rec_len);
 			if (le32_to_cpu(de->inode)) {
 				/* We might block in the next section
 				 * if the data destination is
@@ -219,7 +219,7 @@ revalidate:
 					goto revalidate;
 				stored ++;
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
 		brelse (bh);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 94ee6f315dc1..d9a3a2fc5b0d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -280,7 +280,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
 			space += EXT4_DIR_REC_LEN(de->name_len);
 			names++;
 		}
-		de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+		de = ext4_next_entry(de);
 	}
 	printk("(%i)\n", names);
 	return (struct stats) { names, space, 1 };
@@ -551,7 +551,8 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
  */
 static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
 {
-	return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
+	return (struct ext4_dir_entry_2 *)((char *)p +
+		ext4_rec_len_from_disk(p->rec_len));
 }
 
 /*
@@ -720,7 +721,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
 			cond_resched();
 		}
 		/* XXX: do we need to check rec_len == 0 case? -Chris */
-		de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+		de = ext4_next_entry(de);
 	}
 	return count;
 }
@@ -820,7 +821,7 @@ static inline int search_dirblock(struct buffer_head * bh,
 			return 1;
 		}
 		/* prevent looping on a bad block */
-		de_len = le16_to_cpu(de->rec_len);
+		de_len = ext4_rec_len_from_disk(de->rec_len);
 		if (de_len <= 0)
 			return -1;
 		offset += de_len;
@@ -1128,7 +1129,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
-				cpu_to_le16(rec_len);
+				ext4_rec_len_to_disk(rec_len);
 		de->inode = 0;
 		map++;
 		to += rec_len;
@@ -1147,13 +1148,12 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
 
 	prev = to = de;
 	while ((char*)de < base + size) {
-		next = (struct ext4_dir_entry_2 *) ((char *) de +
-						    le16_to_cpu(de->rec_len));
+		next = ext4_next_entry(de);
 		if (de->inode && de->name_len) {
 			rec_len = EXT4_DIR_REC_LEN(de->name_len);
 			if (de > to)
 				memmove(to, de, rec_len);
-			to->rec_len = cpu_to_le16(rec_len);
+			to->rec_len = ext4_rec_len_to_disk(rec_len);
 			prev = to;
 			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
 		}
@@ -1227,8 +1227,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	/* Fancy dance to stay within two buffers */
 	de2 = dx_move_dirents(data1, data2, map + split, count - split);
 	de = dx_pack_dirents(data1,blocksize);
-	de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
-	de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1297,7 +1297,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 				return -EEXIST;
 			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
-			rlen = le16_to_cpu(de->rec_len);
+			rlen = ext4_rec_len_from_disk(de->rec_len);
 			if ((de->inode? rlen - nlen: rlen) >= reclen)
 				break;
 			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1316,11 +1316,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
 	/* By now the buffer is marked for journaling */
 	nlen = EXT4_DIR_REC_LEN(de->name_len);
-	rlen = le16_to_cpu(de->rec_len);
+	rlen = ext4_rec_len_from_disk(de->rec_len);
 	if (de->inode) {
 		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-		de1->rec_len = cpu_to_le16(rlen - nlen);
-		de->rec_len = cpu_to_le16(nlen);
+		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
+		de->rec_len = ext4_rec_len_to_disk(nlen);
 		de = de1;
 	}
 	de->file_type = EXT4_FT_UNKNOWN;
@@ -1397,17 +1397,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
 	/* The 0th block becomes the root, move the dirents out */
 	fde = &root->dotdot;
-	de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+	de = (struct ext4_dir_entry_2 *)((char *)fde +
+		ext4_rec_len_from_disk(fde->rec_len));
 	len = ((char *) root) + blocksize - (char *) de;
 	memcpy (data1, de, len);
 	de = (struct ext4_dir_entry_2 *) data1;
 	top = data1 + len;
-	while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+	while ((char *)(de2 = ext4_next_entry(de)) < top)
 		de = de2;
-	de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
 	/* Initialize the root; the dot dirents already exist */
 	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-	de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
 	memset (&root->info, 0, sizeof(root->info));
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1487,7 +1488,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(blocksize);
+	de->rec_len = ext4_rec_len_to_disk(blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
@@ -1550,7 +1551,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
-		node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
 		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1648,9 +1649,9 @@ static int ext4_delete_entry (handle_t *handle,
 			BUFFER_TRACE(bh, "get_write_access");
 			ext4_journal_get_write_access(handle, bh);
 			if (pde)
-				pde->rec_len =
-					cpu_to_le16(le16_to_cpu(pde->rec_len) +
-						    le16_to_cpu(de->rec_len));
+				pde->rec_len = ext4_rec_len_to_disk(
+					ext4_rec_len_from_disk(pde->rec_len) +
+					ext4_rec_len_from_disk(de->rec_len));
 			else
 				de->inode = 0;
 			dir->i_version++;
@@ -1658,10 +1659,9 @@ static int ext4_delete_entry (handle_t *handle,
 			ext4_journal_dirty_metadata(handle, bh);
 			return 0;
 		}
-		i += le16_to_cpu(de->rec_len);
+		i += ext4_rec_len_from_disk(de->rec_len);
 		pde = de;
-		de = (struct ext4_dir_entry_2 *)
-			((char *) de + le16_to_cpu(de->rec_len));
+		de = ext4_next_entry(de);
 	}
 	return -ENOENT;
 }
@@ -1824,13 +1824,13 @@ retry:
 	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
-	de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
+	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
 	strcpy (de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-	de = (struct ext4_dir_entry_2 *)
-			((char *) de + le16_to_cpu(de->rec_len));
+	de = ext4_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
-	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
+	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
+						EXT4_DIR_REC_LEN(1));
 	de->name_len = 2;
 	strcpy (de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1882,8 +1882,7 @@ static int empty_dir (struct inode * inode)
 		return 1;
 	}
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
-	de1 = (struct ext4_dir_entry_2 *)
-			((char *) de + le16_to_cpu(de->rec_len));
+	de1 = ext4_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
 			strcmp (".", de->name) ||
@@ -1894,9 +1893,9 @@ static int empty_dir (struct inode * inode)
 		brelse (bh);
 		return 1;
 	}
-	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
-	de = (struct ext4_dir_entry_2 *)
-			((char *) de1 + le16_to_cpu(de1->rec_len));
+	offset = ext4_rec_len_from_disk(de->rec_len) +
+		 ext4_rec_len_from_disk(de1->rec_len);
+	de = ext4_next_entry(de1);
 	while (offset < inode->i_size ) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1925,9 +1924,8 @@ static int empty_dir (struct inode * inode)
 			brelse (bh);
 			return 0;
 		}
-		offset += le16_to_cpu(de->rec_len);
-		de = (struct ext4_dir_entry_2 *)
-				((char *) de + le16_to_cpu(de->rec_len));
+		offset += ext4_rec_len_from_disk(de->rec_len);
+		de = ext4_next_entry(de);
 	}
 	brelse (bh);
 	return 1;
@@ -2282,8 +2280,7 @@ retry:
 }
 
 #define PARENT_INO(buffer) \
-	((struct ext4_dir_entry_2 *) ((char *) buffer + \
-	le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
+	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
 
 /*
  * Anybody can rename anything with this: the permission checks are left to the
-- 
cgit v1.2.3


From 725d26d3f09ccb5bac4b4293096b985a312a0d67 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Introduce ext4_lblk_t

This patch adds a new data type ext4_lblk_t to represent
the logical file blocks.

This is the preparatory patch to support large files in ext4.
The follow-up patch will convert the ext4_inode i_blocks to
represent the number of blocks in file system block size. This
change makes it possible to have a block number 2**32 - 1, which
will result in overflow if the block number is represented by
signed long. This patch converts all the block numbers to type
ext4_lblk_t, which is a typedef of __u32.

Also remove dead code ext4_ext_walk_space

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/dir.c     |   2 +-
 fs/ext4/extents.c | 218 ++++++++++++++++--------------------------------------
 fs/ext4/inode.c   |  34 +++++----
 fs/ext4/namei.c   |  54 ++++++++------
 fs/ext4/super.c   |   4 +-
 5 files changed, 116 insertions(+), 196 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 145a9c0c972d..33888bb58144 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -124,7 +124,7 @@ static int ext4_readdir(struct file * filp,
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
-		unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
 		struct buffer_head map_bh;
 		struct buffer_head *bh = NULL;
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 85287742f2ae..19d8059b58aa 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -144,7 +144,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
 
 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 			      struct ext4_ext_path *path,
-			      ext4_fsblk_t block)
+			      ext4_lblk_t block)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	ext4_fsblk_t bg_start;
@@ -367,13 +367,14 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
  * the header must be checked before calling this
  */
 static void
-ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
+ext4_ext_binsearch_idx(struct inode *inode,
+			struct ext4_ext_path *path, ext4_lblk_t block)
 {
 	struct ext4_extent_header *eh = path->p_hdr;
 	struct ext4_extent_idx *r, *l, *m;
 
 
-	ext_debug("binsearch for %d(idx):  ", block);
+	ext_debug("binsearch for %lu(idx):  ", (unsigned long)block);
 
 	l = EXT_FIRST_INDEX(eh) + 1;
 	r = EXT_LAST_INDEX(eh);
@@ -425,7 +426,8 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int bloc
  * the header must be checked before calling this
  */
 static void
-ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
+ext4_ext_binsearch(struct inode *inode,
+		struct ext4_ext_path *path, ext4_lblk_t block)
 {
 	struct ext4_extent_header *eh = path->p_hdr;
 	struct ext4_extent *r, *l, *m;
@@ -438,7 +440,7 @@ ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
 		return;
 	}
 
-	ext_debug("binsearch for %d:  ", block);
+	ext_debug("binsearch for %lu:  ", (unsigned long)block);
 
 	l = EXT_FIRST_EXTENT(eh) + 1;
 	r = EXT_LAST_EXTENT(eh);
@@ -494,7 +496,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 }
 
 struct ext4_ext_path *
-ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
+ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
+					struct ext4_ext_path *path)
 {
 	struct ext4_extent_header *eh;
 	struct buffer_head *bh;
@@ -979,8 +982,8 @@ repeat:
 		/* refill path */
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
-					    le32_to_cpu(newext->ee_block),
-					    path);
+				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
+				    path);
 		if (IS_ERR(path))
 			err = PTR_ERR(path);
 	} else {
@@ -992,8 +995,8 @@ repeat:
 		/* refill path */
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode,
-					    le32_to_cpu(newext->ee_block),
-					    path);
+				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
+				    path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -1021,7 +1024,7 @@ out:
  * allocated block. Thus, index entries have to be consistent
  * with leaves.
  */
-static unsigned long
+static ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
 	int depth;
@@ -1054,7 +1057,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
  * ext4_ext_next_leaf_block:
  * returns first allocated block from next leaf or EXT_MAX_BLOCK
  */
-static unsigned ext4_ext_next_leaf_block(struct inode *inode,
+static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
 					struct ext4_ext_path *path)
 {
 	int depth;
@@ -1072,7 +1075,8 @@ static unsigned ext4_ext_next_leaf_block(struct inode *inode,
 	while (depth >= 0) {
 		if (path[depth].p_idx !=
 				EXT_LAST_INDEX(path[depth].p_hdr))
-		  return le32_to_cpu(path[depth].p_idx[1].ei_block);
+			return (ext4_lblk_t)
+				le32_to_cpu(path[depth].p_idx[1].ei_block);
 		depth--;
 	}
 
@@ -1239,7 +1243,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
 				    struct ext4_extent *newext,
 				    struct ext4_ext_path *path)
 {
-	unsigned long b1, b2;
+	ext4_lblk_t b1, b2;
 	unsigned int depth, len1;
 	unsigned int ret = 0;
 
@@ -1260,7 +1264,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
 			goto out;
 	}
 
-	/* check for wrap through zero */
+	/* check for wrap through zero on extent logical start block*/
 	if (b1 + len1 < b1) {
 		len1 = EXT_MAX_BLOCK - b1;
 		newext->ee_len = cpu_to_le16(len1);
@@ -1290,7 +1294,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	struct ext4_extent *ex, *fex;
 	struct ext4_extent *nearex; /* nearest extent */
 	struct ext4_ext_path *npath = NULL;
-	int depth, len, err, next;
+	int depth, len, err;
+	ext4_lblk_t next;
 	unsigned uninitialized = 0;
 
 	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
@@ -1435,114 +1440,8 @@ cleanup:
 	return err;
 }
 
-int ext4_ext_walk_space(struct inode *inode, unsigned long block,
-			unsigned long num, ext_prepare_callback func,
-			void *cbdata)
-{
-	struct ext4_ext_path *path = NULL;
-	struct ext4_ext_cache cbex;
-	struct ext4_extent *ex;
-	unsigned long next, start = 0, end = 0;
-	unsigned long last = block + num;
-	int depth, exists, err = 0;
-
-	BUG_ON(func == NULL);
-	BUG_ON(inode == NULL);
-
-	while (block < last && block != EXT_MAX_BLOCK) {
-		num = last - block;
-		/* find extent for this block */
-		path = ext4_ext_find_extent(inode, block, path);
-		if (IS_ERR(path)) {
-			err = PTR_ERR(path);
-			path = NULL;
-			break;
-		}
-
-		depth = ext_depth(inode);
-		BUG_ON(path[depth].p_hdr == NULL);
-		ex = path[depth].p_ext;
-		next = ext4_ext_next_allocated_block(path);
-
-		exists = 0;
-		if (!ex) {
-			/* there is no extent yet, so try to allocate
-			 * all requested space */
-			start = block;
-			end = block + num;
-		} else if (le32_to_cpu(ex->ee_block) > block) {
-			/* need to allocate space before found extent */
-			start = block;
-			end = le32_to_cpu(ex->ee_block);
-			if (block + num < end)
-				end = block + num;
-		} else if (block >= le32_to_cpu(ex->ee_block)
-					+ ext4_ext_get_actual_len(ex)) {
-			/* need to allocate space after found extent */
-			start = block;
-			end = block + num;
-			if (end >= next)
-				end = next;
-		} else if (block >= le32_to_cpu(ex->ee_block)) {
-			/*
-			 * some part of requested space is covered
-			 * by found extent
-			 */
-			start = block;
-			end = le32_to_cpu(ex->ee_block)
-				+ ext4_ext_get_actual_len(ex);
-			if (block + num < end)
-				end = block + num;
-			exists = 1;
-		} else {
-			BUG();
-		}
-		BUG_ON(end <= start);
-
-		if (!exists) {
-			cbex.ec_block = start;
-			cbex.ec_len = end - start;
-			cbex.ec_start = 0;
-			cbex.ec_type = EXT4_EXT_CACHE_GAP;
-		} else {
-			cbex.ec_block = le32_to_cpu(ex->ee_block);
-			cbex.ec_len = ext4_ext_get_actual_len(ex);
-			cbex.ec_start = ext_pblock(ex);
-			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
-		}
-
-		BUG_ON(cbex.ec_len == 0);
-		err = func(inode, path, &cbex, cbdata);
-		ext4_ext_drop_refs(path);
-
-		if (err < 0)
-			break;
-		if (err == EXT_REPEAT)
-			continue;
-		else if (err == EXT_BREAK) {
-			err = 0;
-			break;
-		}
-
-		if (ext_depth(inode) != depth) {
-			/* depth was changed. we have to realloc path */
-			kfree(path);
-			path = NULL;
-		}
-
-		block = cbex.ec_block + cbex.ec_len;
-	}
-
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
-
-	return err;
-}
-
 static void
-ext4_ext_put_in_cache(struct inode *inode, __u32 block,
+ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
 			__u32 len, ext4_fsblk_t start, int type)
 {
 	struct ext4_ext_cache *cex;
@@ -1561,10 +1460,11 @@ ext4_ext_put_in_cache(struct inode *inode, __u32 block,
  */
 static void
 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
-				unsigned long block)
+				ext4_lblk_t block)
 {
 	int depth = ext_depth(inode);
-	unsigned long lblock, len;
+	unsigned long len;
+	ext4_lblk_t lblock;
 	struct ext4_extent *ex;
 
 	ex = path[depth].p_ext;
@@ -1582,15 +1482,17 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 				(unsigned long) ext4_ext_get_actual_len(ex));
 	} else if (block >= le32_to_cpu(ex->ee_block)
 			+ ext4_ext_get_actual_len(ex)) {
+		ext4_lblk_t next;
 		lblock = le32_to_cpu(ex->ee_block)
 			+ ext4_ext_get_actual_len(ex);
-		len = ext4_ext_next_allocated_block(path);
+
+		next = ext4_ext_next_allocated_block(path);
 		ext_debug("cache gap(after): [%lu:%lu] %lu",
 				(unsigned long) le32_to_cpu(ex->ee_block),
 				(unsigned long) ext4_ext_get_actual_len(ex),
 				(unsigned long) block);
-		BUG_ON(len == lblock);
-		len = len - lblock;
+		BUG_ON(next == lblock);
+		len = next - lblock;
 	} else {
 		lblock = len = 0;
 		BUG();
@@ -1601,7 +1503,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 static int
-ext4_ext_in_cache(struct inode *inode, unsigned long block,
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 			struct ext4_extent *ex)
 {
 	struct ext4_ext_cache *cex;
@@ -1714,7 +1616,7 @@ int ext4_ext_calc_credits_for_insert(struct inode *inode,
 
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 				struct ext4_extent *ex,
-				unsigned long from, unsigned long to)
+				ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct buffer_head *bh;
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
@@ -1738,11 +1640,12 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	if (from >= le32_to_cpu(ex->ee_block)
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
-		unsigned long num;
+		ext4_lblk_t num;
 		ext4_fsblk_t start;
+
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		start = ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %lu blocks starting %llu\n", num, start);
+		ext_debug("free last %u blocks starting %llu\n", num, start);
 		for (i = 0; i < num; i++) {
 			bh = sb_find_get_block(inode->i_sb, start + i);
 			ext4_forget(handle, 0, inode, bh, start + i);
@@ -1750,30 +1653,32 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_free_blocks(handle, inode, start, num);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk("strange request: removal %lu-%lu from %u:%u\n",
+		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
 			from, to, le32_to_cpu(ex->ee_block), ee_len);
 	} else {
-		printk("strange request: removal(2) %lu-%lu from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		printk(KERN_INFO "strange request: removal(2) "
+				"%u-%u from %u:%u\n",
+				from, to, le32_to_cpu(ex->ee_block), ee_len);
 	}
 	return 0;
 }
 
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, unsigned long start)
+		struct ext4_ext_path *path, ext4_lblk_t start)
 {
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
 	struct ext4_extent_header *eh;
-	unsigned a, b, block, num;
-	unsigned long ex_ee_block;
+	ext4_lblk_t a, b, block;
+	unsigned num;
+	ext4_lblk_t ex_ee_block;
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
-	ext_debug("truncate since %lu in leaf\n", start);
+	ext_debug("truncate since %u in leaf\n", start);
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
@@ -1904,7 +1809,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-int ext4_ext_remove_space(struct inode *inode, unsigned long start)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -1912,7 +1817,7 @@ int ext4_ext_remove_space(struct inode *inode, unsigned long start)
 	handle_t *handle;
 	int i = 0, err = 0;
 
-	ext_debug("truncate since %lu\n", start);
+	ext_debug("truncate since %u\n", start);
 
 	/* probably first extent we're gonna free will be last in block */
 	handle = ext4_journal_start(inode, depth + 1);
@@ -2094,17 +1999,19 @@ void ext4_ext_release(struct super_block *sb)
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  */
-int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
-					struct ext4_ext_path *path,
-					ext4_fsblk_t iblock,
-					unsigned long max_blocks)
+static int ext4_ext_convert_to_initialized(handle_t *handle,
+						struct inode *inode,
+						struct ext4_ext_path *path,
+						ext4_lblk_t iblock,
+						unsigned long max_blocks)
 {
 	struct ext4_extent *ex, newex;
 	struct ext4_extent *ex1 = NULL;
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	unsigned int allocated, ee_block, ee_len, depth;
+	ext4_lblk_t ee_block;
+	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
 	int ret = 0;
@@ -2226,7 +2133,7 @@ out:
 }
 
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t iblock,
+			ext4_lblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize)
 {
@@ -2238,8 +2145,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	unsigned long allocated = 0;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
-			max_blocks, (unsigned) inode->i_ino);
+	ext_debug("blocks %lu/%lu requested for inode %u\n",
+			(unsigned long) iblock, max_blocks,
+			(unsigned) inode->i_ino);
 	mutex_lock(&EXT4_I(inode)->truncate_mutex);
 
 	/* check in cache */
@@ -2288,7 +2196,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	ex = path[depth].p_ext;
 	if (ex) {
-		unsigned long ee_block = le32_to_cpu(ex->ee_block);
+		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
 		ext4_fsblk_t ee_start = ext_pblock(ex);
 		unsigned short ee_len;
 
@@ -2423,7 +2331,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
-	unsigned long last_block;
+	ext4_lblk_t last_block;
 	handle_t *handle;
 	int err = 0;
 
@@ -2516,7 +2424,8 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 {
 	handle_t *handle;
-	ext4_fsblk_t block, max_blocks;
+	ext4_lblk_t block;
+	unsigned long max_blocks;
 	ext4_fsblk_t nblocks = 0;
 	int ret = 0;
 	int ret2 = 0;
@@ -2561,8 +2470,9 @@ retry:
 		if (!ret) {
 			ext4_error(inode->i_sb, "ext4_fallocate",
 				   "ext4_ext_get_blocks returned 0! inode#%lu"
-				   ", block=%llu, max_blocks=%llu",
-				   inode->i_ino, block, max_blocks);
+				   ", block=%lu, max_blocks=%lu",
+				   inode->i_ino, (unsigned long)block,
+				   (unsigned long)max_blocks);
 			ret = -EIO;
 			ext4_mark_inode_dirty(handle, inode);
 			ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5489703d9573..488f829a8879 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -105,7 +105,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
  */
 static unsigned long blocks_for_truncate(struct inode *inode)
 {
-	unsigned long needed;
+	ext4_lblk_t needed;
 
 	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 
@@ -282,7 +282,8 @@ static int verify_chain(Indirect *from, Indirect *to)
  */
 
 static int ext4_block_to_path(struct inode *inode,
-			long i_block, int offsets[4], int *boundary)
+			ext4_lblk_t i_block,
+			ext4_lblk_t offsets[4], int *boundary)
 {
 	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -349,7 +350,8 @@ static int ext4_block_to_path(struct inode *inode,
  *	or when it reads all @depth-1 indirect blocks successfully and finds
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  */
-static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+				 ext4_lblk_t  *offsets,
 				 Indirect chain[4], int *err)
 {
 	struct super_block *sb = inode->i_sb;
@@ -445,7 +447,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
  *	stores it in *@goal and returns zero.
  */
 
-static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext4_block_alloc_info *block_i;
@@ -590,7 +592,7 @@ failed_out:
  */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 			int indirect_blks, int *blks, ext4_fsblk_t goal,
-			int *offsets, Indirect *branch)
+			ext4_lblk_t *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int i, n = 0;
@@ -680,7 +682,7 @@ failed:
  * chain to new block and return 0.
  */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-			long block, Indirect *where, int num, int blks)
+			ext4_lblk_t block, Indirect *where, int num, int blks)
 {
 	int i;
 	int err = 0;
@@ -784,12 +786,12 @@ err_out:
  * return < 0, error case.
  */
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-		sector_t iblock, unsigned long maxblocks,
+		ext4_lblk_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
 		int create, int extend_disksize)
 {
 	int err = -EIO;
-	int offsets[4];
+	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	ext4_fsblk_t goal;
@@ -803,7 +805,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
 	J_ASSERT(handle != NULL || create == 0);
-	depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
+	depth = ext4_block_to_path(inode, iblock, offsets,
+					&blocks_to_boundary);
 
 	if (depth == 0)
 		goto out;
@@ -996,7 +999,7 @@ get_block:
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
-				long block, int create, int *errp)
+				ext4_lblk_t block, int create, int *errp)
 {
 	struct buffer_head dummy;
 	int fatal = 0, err;
@@ -1063,7 +1066,7 @@ err:
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
-			       int block, int create, int *err)
+			       ext4_lblk_t block, int create, int *err)
 {
 	struct buffer_head * bh;
 
@@ -1828,7 +1831,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned blocksize, iblock, length, pos;
+	unsigned blocksize, length, pos;
+	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
 	int err = 0;
@@ -1964,7 +1968,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
  *			(no partially truncated stuff there).  */
 
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
-			int offsets[4], Indirect chain[4], __le32 *top)
+			ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
 {
 	Indirect *partial, *p;
 	int k, err;
@@ -2289,12 +2293,12 @@ void ext4_truncate(struct inode *inode)
 	__le32 *i_data = ei->i_data;
 	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
-	int offsets[4];
+	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
 	int n;
-	long last_block;
+	ext4_lblk_t last_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	struct page *page;
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d9a3a2fc5b0d..fb673b14ccd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -51,7 +51,7 @@
 
 static struct buffer_head *ext4_append(handle_t *handle,
 					struct inode *inode,
-					u32 *block, int *err)
+					ext4_lblk_t *block, int *err)
 {
 	struct buffer_head *bh;
 
@@ -144,8 +144,8 @@ struct dx_map_entry
 	u16 size;
 };
 
-static inline unsigned dx_get_block (struct dx_entry *entry);
-static void dx_set_block (struct dx_entry *entry, unsigned value);
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
 static inline unsigned dx_get_hash (struct dx_entry *entry);
 static void dx_set_hash (struct dx_entry *entry, unsigned value);
 static unsigned dx_get_count (struct dx_entry *entries);
@@ -166,7 +166,8 @@ static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
 		struct dx_map_entry *offsets, int count);
 static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+static void dx_insert_block(struct dx_frame *frame,
+					u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
 				 struct dx_frame *frames,
@@ -181,12 +182,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
  * Mask them off for now.
  */
 
-static inline unsigned dx_get_block (struct dx_entry *entry)
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
 {
 	return le32_to_cpu(entry->block) & 0x00ffffff;
 }
 
-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
+static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
 {
 	entry->block = cpu_to_le32(value);
 }
@@ -243,8 +244,8 @@ static void dx_show_index (char * label, struct dx_entry *entries)
 	int i, n = dx_get_count (entries);
 	printk("%s index ", label);
 	for (i = 0; i < n; i++) {
-		printk("%x->%u ", i? dx_get_hash(entries + i) :
-				0, dx_get_block(entries + i));
+		printk("%x->%lu ", i? dx_get_hash(entries + i) :
+				0, (unsigned long)dx_get_block(entries + i));
 	}
 	printk("\n");
 }
@@ -297,7 +298,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 	printk("%i indexed blocks...\n", count);
 	for (i = 0; i < count; i++, entries++)
 	{
-		u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
+		ext4_lblk_t block = dx_get_block(entries);
+		ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
 		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
 		struct stats stats;
 		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
@@ -561,7 +563,7 @@ static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *
  * into the tree.  If there is an error it is returned in err.
  */
 static int htree_dirblock_to_tree(struct file *dir_file,
-				  struct inode *dir, int block,
+				  struct inode *dir, ext4_lblk_t block,
 				  struct dx_hash_info *hinfo,
 				  __u32 start_hash, __u32 start_minor_hash)
 {
@@ -569,7 +571,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 	struct ext4_dir_entry_2 *de, *top;
 	int err, count = 0;
 
-	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+							(unsigned long)block));
 	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
 		return err;
 
@@ -621,9 +624,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	struct ext4_dir_entry_2 *de;
 	struct dx_frame frames[2], *frame;
 	struct inode *dir;
-	int block, err;
+	ext4_lblk_t block;
 	int count = 0;
-	int ret;
+	int ret, err;
 	__u32 hashval;
 
 	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
@@ -753,7 +756,7 @@ static void dx_sort_map (struct dx_map_entry *map, unsigned count)
 	} while(more);
 }
 
-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
+static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
 {
 	struct dx_entry *entries = frame->entries;
 	struct dx_entry *old = frame->at, *new = old + 1;
@@ -848,13 +851,14 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 	struct super_block * sb;
 	struct buffer_head * bh_use[NAMEI_RA_SIZE];
 	struct buffer_head * bh, *ret = NULL;
-	unsigned long start, block, b;
+	ext4_lblk_t start, block, b;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
 	int ra_ptr = 0;		/* Current index into readahead
 				   buffer */
 	int num = 0;
-	int nblocks, i, err;
+	ext4_lblk_t  nblocks;
+	int i, err;
 	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 	const u8 *name;
@@ -915,7 +919,8 @@ restart:
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
 			ext4_error(sb, __FUNCTION__, "reading directory #%lu "
-				   "offset %lu", dir->i_ino, block);
+				   "offset %lu", dir->i_ino,
+				   (unsigned long)block);
 			brelse(bh);
 			goto next;
 		}
@@ -962,7 +967,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 	struct dx_frame frames[2], *frame;
 	struct ext4_dir_entry_2 *de, *top;
 	struct buffer_head *bh;
-	unsigned long block;
+	ext4_lblk_t block;
 	int retval;
 	int namelen = dentry->d_name.len;
 	const u8 *name = dentry->d_name.name;
@@ -1174,7 +1179,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	unsigned blocksize = dir->i_sb->s_blocksize;
 	unsigned count, continued;
 	struct buffer_head *bh2;
-	u32 newblock;
+	ext4_lblk_t newblock;
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
@@ -1221,8 +1226,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	split = count - move;
 	hash2 = map[split].hash;
 	continued = hash2 == map[split - 1].hash;
-	dxtrace(printk("Split block %i at %x, %i/%i\n",
-		dx_get_block(frame->at), hash2, split, count-split));
+	dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
+			(unsigned long)dx_get_block(frame->at),
+					hash2, split, count-split));
 
 	/* Fancy dance to stay within two buffers */
 	de2 = dx_move_dirents(data1, data2, map + split, count - split);
@@ -1374,7 +1380,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	int		retval;
 	unsigned	blocksize;
 	struct dx_hash_info hinfo;
-	u32		block;
+	ext4_lblk_t  block;
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
@@ -1455,7 +1461,7 @@ static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
 	int	retval;
 	int	dx_fallback=0;
 	unsigned blocksize;
-	u32 block, blocks;
+	ext4_lblk_t block, blocks;
 
 	sb = dir->i_sb;
 	blocksize = sb->s_blocksize;
@@ -1532,7 +1538,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
-		u32 newblock;
+		ext4_lblk_t newblock;
 		unsigned icount = dx_get_count(entries);
 		int levels = frame - frames;
 		struct dx_entry *entries2;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ab7010dde1b5..6302b036c121 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2914,7 +2914,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off)
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
-	sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
 	int tocopy;
@@ -2952,7 +2952,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off)
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
-	sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
 	int tocopy;
-- 
cgit v1.2.3


From bba907433b85ba2adae1bb3b6fd29b4e5f35c468 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4 extents: remove unneeded casts

There are many casts in extents.c which are not needed,
as the variables are already the type of the cast, or
are being promoted for no particular reason in printk's.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/extents.c | 49 ++++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 19d8059b58aa..68537229ee1c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -374,7 +374,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
 	struct ext4_extent_idx *r, *l, *m;
 
 
-	ext_debug("binsearch for %lu(idx):  ", (unsigned long)block);
+	ext_debug("binsearch for %u(idx):  ", block);
 
 	l = EXT_FIRST_INDEX(eh) + 1;
 	r = EXT_LAST_INDEX(eh);
@@ -440,7 +440,7 @@ ext4_ext_binsearch(struct inode *inode,
 		return;
 	}
 
-	ext_debug("binsearch for %lu:  ", (unsigned long)block);
+	ext_debug("binsearch for %u:  ", block);
 
 	l = EXT_FIRST_EXTENT(eh) + 1;
 	r = EXT_LAST_EXTENT(eh);
@@ -766,7 +766,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	while (k--) {
 		oldblock = newblock;
 		newblock = ablocks[--a];
-		bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
+		bh = sb_getblk(inode->i_sb, newblock);
 		if (!bh) {
 			err = -EIO;
 			goto cleanup;
@@ -786,9 +786,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		fidx->ei_block = border;
 		ext4_idx_store_pblock(fidx, oldblock);
 
-		ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
-				newblock, (unsigned long) le32_to_cpu(border),
-				oldblock);
+		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
+				i, newblock, le32_to_cpu(border), oldblock);
 		/* copy indexes */
 		m = 0;
 		path[i].p_idx++;
@@ -1476,10 +1475,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 	} else if (block < le32_to_cpu(ex->ee_block)) {
 		lblock = block;
 		len = le32_to_cpu(ex->ee_block) - block;
-		ext_debug("cache gap(before): %lu [%lu:%lu]",
-				(unsigned long) block,
-				(unsigned long) le32_to_cpu(ex->ee_block),
-				(unsigned long) ext4_ext_get_actual_len(ex));
+		ext_debug("cache gap(before): %u [%u:%u]",
+				block,
+				le32_to_cpu(ex->ee_block),
+				 ext4_ext_get_actual_len(ex));
 	} else if (block >= le32_to_cpu(ex->ee_block)
 			+ ext4_ext_get_actual_len(ex)) {
 		ext4_lblk_t next;
@@ -1487,10 +1486,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 			+ ext4_ext_get_actual_len(ex);
 
 		next = ext4_ext_next_allocated_block(path);
-		ext_debug("cache gap(after): [%lu:%lu] %lu",
-				(unsigned long) le32_to_cpu(ex->ee_block),
-				(unsigned long) ext4_ext_get_actual_len(ex),
-				(unsigned long) block);
+		ext_debug("cache gap(after): [%u:%u] %u",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_get_actual_len(ex),
+				block);
 		BUG_ON(next == lblock);
 		len = next - lblock;
 	} else {
@@ -1498,7 +1497,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 		BUG();
 	}
 
-	ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
+	ext_debug(" -> %u:%lu\n", lblock, len);
 	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
 }
 
@@ -1520,11 +1519,9 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 		ex->ee_block = cpu_to_le32(cex->ec_block);
 		ext4_ext_store_pblock(ex, cex->ec_start);
 		ex->ee_len = cpu_to_le16(cex->ec_len);
-		ext_debug("%lu cached by %lu:%lu:%llu\n",
-				(unsigned long) block,
-				(unsigned long) cex->ec_block,
-				(unsigned long) cex->ec_len,
-				cex->ec_start);
+		ext_debug("%u cached by %u:%u:%llu\n",
+				block,
+				cex->ec_block, cex->ec_len, cex->ec_start);
 		return cex->ec_type;
 	}
 
@@ -2145,9 +2142,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	unsigned long allocated = 0;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %lu/%lu requested for inode %u\n",
-			(unsigned long) iblock, max_blocks,
-			(unsigned) inode->i_ino);
+	ext_debug("blocks %u/%lu requested for inode %u\n",
+			iblock, max_blocks, inode->i_ino);
 	mutex_lock(&EXT4_I(inode)->truncate_mutex);
 
 	/* check in cache */
@@ -2210,7 +2206,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			newblock = iblock - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (iblock - ee_block);
-			ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
+			ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
 					ee_block, ee_len, newblock);
 
 			/* Do not put uninitialized extent in the cache */
@@ -2470,9 +2466,8 @@ retry:
 		if (!ret) {
 			ext4_error(inode->i_sb, "ext4_fallocate",
 				   "ext4_ext_get_blocks returned 0! inode#%lu"
-				   ", block=%lu, max_blocks=%lu",
-				   inode->i_ino, (unsigned long)block,
-				   (unsigned long)max_blocks);
+				   ", block=%u, max_blocks=%lu",
+				   inode->i_ino, block, max_blocks);
 			ret = -EIO;
 			ext4_mark_inode_dirty(handle, inode);
 			ret2 = ext4_journal_stop(handle);
-- 
cgit v1.2.3


From fd2d42912f9f09e5250cb3b024ee0625704e9cb7 Mon Sep 17 00:00:00 2001
From: Avantika Mathur <mathur@us.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: add ext4_group_t, and change all group variables to this type.

In many places variables for block group are of type int, which limits the
maximum number of block groups to 2^31.  With a 4K block size, each block
group can have up to 2^15 blocks, so the max filesystem size is limited to
2^31 * (2^15 * 2^12) = 2^58 bytes -- or 256 PB.

This patch introduces a new type ext4_group_t, of type unsigned long, to
represent block group numbers in ext4.
All occurrences of block group variables are converted to type ext4_group_t.

Signed-off-by: Avantika Mathur <mathur@us.ibm.com>
---
 fs/ext4/balloc.c | 69 +++++++++++++++++++++++++++-----------------------------
 fs/ext4/group.h  |  8 ++++---
 fs/ext4/ialloc.c | 46 +++++++++++++++++++------------------
 fs/ext4/inode.c  |  5 ++--
 fs/ext4/resize.c | 12 +++++-----
 fs/ext4/super.c  | 20 ++++++++--------
 6 files changed, 80 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 71ee95e534fd..9568a57c607c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -29,7 +29,7 @@
  * Calculate the block group number and offset, given a block number
  */
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-		unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
+		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 	ext4_grpblk_t offset;
@@ -46,7 +46,7 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 /* Initializes an uninitialized block bitmap if given, and returns the
  * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
-				int block_group, struct ext4_group_desc *gdp)
+		 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
 	unsigned long start;
 	int bit, bit_max;
@@ -60,7 +60,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * essentially implementing a per-group read-only flag. */
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
 			ext4_error(sb, __FUNCTION__,
-				   "Checksum bad for group %u\n", block_group);
+				  "Checksum bad for group %lu\n", block_group);
 			gdp->bg_free_blocks_count = 0;
 			gdp->bg_free_inodes_count = 0;
 			gdp->bg_itable_unused = 0;
@@ -153,7 +153,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  *			group descriptor
  */
 struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
-					     unsigned int block_group,
+					     ext4_group_t block_group,
 					     struct buffer_head ** bh)
 {
 	unsigned long group_desc;
@@ -164,7 +164,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	if (block_group >= sbi->s_groups_count) {
 		ext4_error (sb, "ext4_get_group_desc",
 			    "block_group >= groups_count - "
-			    "block_group = %d, groups_count = %lu",
+			    "block_group = %lu, groups_count = %lu",
 			    block_group, sbi->s_groups_count);
 
 		return NULL;
@@ -176,7 +176,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	if (!sbi->s_group_desc[group_desc]) {
 		ext4_error (sb, "ext4_get_group_desc",
 			    "Group descriptor not loaded - "
-			    "block_group = %d, group_desc = %lu, desc = %lu",
+			    "block_group = %lu, group_desc = %lu, desc = %lu",
 			     block_group, group_desc, offset);
 		return NULL;
 	}
@@ -200,7 +200,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, unsigned int block_group)
+read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc * desc;
 	struct buffer_head * bh = NULL;
@@ -227,7 +227,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
 	if (!bh)
 		ext4_error (sb, __FUNCTION__,
 			    "Cannot read block bitmap - "
-			    "block_group = %d, block_bitmap = %llu",
+			    "block_group = %lu, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 	return bh;
 }
@@ -320,7 +320,7 @@ restart:
  */
 static int
 goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-			unsigned int group, struct super_block * sb)
+			ext4_group_t group, struct super_block *sb)
 {
 	ext4_fsblk_t group_first_block, group_last_block;
 
@@ -540,7 +540,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gd_bh;
-	unsigned long block_group;
+	ext4_group_t block_group;
 	ext4_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
@@ -920,9 +920,10 @@ claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
  * ext4_journal_release_buffer(), else we'll run out of credits.
  */
 static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
-			unsigned long *count, struct ext4_reserve_window *my_rsv)
+ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
+			ext4_group_t group, struct buffer_head *bitmap_bh,
+			ext4_grpblk_t grp_goal, unsigned long *count,
+			struct ext4_reserve_window *my_rsv)
 {
 	ext4_fsblk_t group_first_block;
 	ext4_grpblk_t start, end;
@@ -1156,7 +1157,7 @@ static int find_next_reservable_window(
  */
 static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
 		ext4_grpblk_t grp_goal, struct super_block *sb,
-		unsigned int group, struct buffer_head *bitmap_bh)
+		ext4_group_t group, struct buffer_head *bitmap_bh)
 {
 	struct ext4_reserve_window_node *search_head;
 	ext4_fsblk_t group_first_block, group_end_block, start_block;
@@ -1354,7 +1355,7 @@ static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
  */
 static ext4_grpblk_t
 ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			unsigned int group, struct buffer_head *bitmap_bh,
+			ext4_group_t group, struct buffer_head *bitmap_bh,
 			ext4_grpblk_t grp_goal,
 			struct ext4_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
@@ -1528,12 +1529,12 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
-	unsigned long group_no;
-	int goal_group;
+	ext4_group_t group_no;
+	ext4_group_t goal_group;
 	ext4_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
 	ext4_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
 	ext4_fsblk_t ret_block;		/* filesyetem-wide allocated block */
-	int bgi;			/* blockgroup iteration index */
+	ext4_group_t bgi;			/* blockgroup iteration index */
 	int fatal = 0, err;
 	int performed_allocation = 0;
 	ext4_grpblk_t free_blocks;	/* number of free blocks in a group */
@@ -1544,10 +1545,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_reserve_window_node *my_rsv = NULL;
 	struct ext4_block_alloc_info *block_i;
 	unsigned short windowsz = 0;
-#ifdef EXT4FS_DEBUG
-	static int goal_hits, goal_attempts;
-#endif
-	unsigned long ngroups;
+	ext4_group_t ngroups;
 	unsigned long num = *count;
 
 	*errp = -ENOSPC;
@@ -1743,9 +1741,6 @@ allocated:
 	 * list of some description.  We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
-			ret_block, goal_hits, goal_attempts);
-
 	spin_lock(sb_bgl_lock(sbi, group_no));
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
@@ -1804,8 +1799,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 {
 	ext4_fsblk_t desc_count;
 	struct ext4_group_desc *gdp;
-	int i;
-	unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t i;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
@@ -1829,7 +1824,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 			continue;
 
 		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-		printk("group %d: stored = %d, counted = %lu\n",
+		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
 			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
 		bitmap_count += x;
 	}
@@ -1853,7 +1848,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 #endif
 }
 
-static inline int test_root(int a, int b)
+static inline int test_root(ext4_group_t a, int b)
 {
 	int num = b;
 
@@ -1862,7 +1857,7 @@ static inline int test_root(int a, int b)
 	return num == a;
 }
 
-static int ext4_group_sparse(int group)
+static int ext4_group_sparse(ext4_group_t group)
 {
 	if (group <= 1)
 		return 1;
@@ -1880,7 +1875,7 @@ static int ext4_group_sparse(int group)
  *	Return the number of blocks used by the superblock (primary or backup)
  *	in this group.  Currently this will be only 0 or 1.
  */
-int ext4_bg_has_super(struct super_block *sb, int group)
+int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
 {
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1889,18 +1884,20 @@ int ext4_bg_has_super(struct super_block *sb, int group)
 	return 1;
 }
 
-static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
+static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
+					ext4_group_t group)
 {
 	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
-	unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
-	unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
+	ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
+	ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
 
 	if (group == first || group == first + 1 || group == last)
 		return 1;
 	return 0;
 }
 
-static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
+static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
+					ext4_group_t group)
 {
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
@@ -1918,7 +1915,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
  *	(primary or backup) in this group.  In the future there may be a
  *	different number of descriptor blocks in each group.
  */
-unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
+unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 {
 	unsigned long first_meta_bg =
 			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 1577910bb58b..7eb0604e7eea 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -14,14 +14,16 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
 				       struct ext4_group_desc *gdp);
 struct buffer_head *read_block_bitmap(struct super_block *sb,
-				      unsigned int block_group);
+				      ext4_group_t block_group);
 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-				       struct buffer_head *bh, int group,
+				       struct buffer_head *bh,
+				       ext4_group_t group,
 				       struct ext4_group_desc *desc);
 #define ext4_free_blocks_after_init(sb, group, desc)			\
 		ext4_init_block_bitmap(sb, NULL, group, desc)
 extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh, int group,
+				       struct buffer_head *bh,
+				       ext4_group_t group,
 				       struct ext4_group_desc *desc);
 extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 #endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c61f37fd3f05..64dea8689e1f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,8 +64,8 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
 }
 
 /* Initializes an uninitialized inode bitmap */
-unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				struct buffer_head *bh, int block_group,
+unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
+				ext4_group_t block_group,
 				struct ext4_group_desc *gdp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -75,7 +75,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __FUNCTION__, "Checksum bad for group %u\n",
+		ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n",
 			   block_group);
 		gdp->bg_free_blocks_count = 0;
 		gdp->bg_free_inodes_count = 0;
@@ -98,7 +98,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb,
  * Return buffer_head of bitmap on success or NULL.
  */
 static struct buffer_head *
-read_inode_bitmap(struct super_block * sb, unsigned long block_group)
+read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc *desc;
 	struct buffer_head *bh = NULL;
@@ -152,7 +152,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	unsigned long ino;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
-	unsigned long block_group;
+	ext4_group_t block_group;
 	unsigned long bit;
 	struct ext4_group_desc * gdp;
 	struct ext4_super_block * es;
@@ -260,12 +260,12 @@ error_return:
  * For other inodes, search forward from the parent directory\'s block
  * group to find a free inode.
  */
-static int find_group_dir(struct super_block *sb, struct inode *parent)
+static ext4_group_t find_group_dir(struct super_block *sb, struct inode *parent)
 {
-	int ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	unsigned int freei, avefreei;
 	struct ext4_group_desc *desc, *best_desc = NULL;
-	int group, best_group = -1;
+	ext4_group_t group, best_group = -1;
 
 	freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
 	avefreei = freei / ngroups;
@@ -314,12 +314,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
 #define INODE_COST 64
 #define BLOCK_COST 256
 
-static int find_group_orlov(struct super_block *sb, struct inode *parent)
+static ext4_group_t find_group_orlov(struct super_block *sb,
+				      struct inode *parent)
 {
-	int parent_group = EXT4_I(parent)->i_block_group;
+	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	int ngroups = sbi->s_groups_count;
+	ext4_group_t ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 	unsigned int freei, avefreei;
 	ext4_fsblk_t freeb, avefreeb;
@@ -327,7 +328,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	unsigned int ndirs;
 	int max_debt, max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	int group = -1, i;
+	ext4_group_t group = -1, i;
 	struct ext4_group_desc *desc;
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
@@ -340,7 +341,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	if ((parent == sb->s_root->d_inode) ||
 	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
 		int best_ndir = inodes_per_group;
-		int best_group = -1;
+		ext4_group_t best_group = -1;
 
 		get_random_bytes(&group, sizeof(group));
 		parent_group = (unsigned)group % ngroups;
@@ -415,12 +416,13 @@ fallback:
 	return -1;
 }
 
-static int find_group_other(struct super_block *sb, struct inode *parent)
+static ext4_group_t find_group_other(struct super_block *sb,
+					struct inode *parent)
 {
-	int parent_group = EXT4_I(parent)->i_block_group;
-	int ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	struct ext4_group_desc *desc;
-	int group, i;
+	ext4_group_t group, i;
 
 	/*
 	 * Try to place the inode in its parent directory
@@ -487,7 +489,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
-	int group;
+	ext4_group_t group;
 	unsigned long ino = 0;
 	struct inode * inode;
 	struct ext4_group_desc * gdp = NULL;
@@ -583,7 +585,7 @@ got:
 	    ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_error(sb, __FUNCTION__,
 			   "reserved inode or inode > inodes count - "
-			   "block_group = %d, inode=%lu", group,
+			   "block_group = %lu, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
 		err = -EIO;
 		goto fail;
@@ -777,7 +779,7 @@ fail_drop:
 struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 {
 	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
-	unsigned long block_group;
+	ext4_group_t block_group;
 	int bit;
 	struct buffer_head *bitmap_bh = NULL;
 	struct inode *inode = NULL;
@@ -833,7 +835,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 {
 	unsigned long desc_count;
 	struct ext4_group_desc *gdp;
-	int i;
+	ext4_group_t i;
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	unsigned long bitmap_count, x;
@@ -879,7 +881,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 unsigned long ext4_count_dirs (struct super_block * sb)
 {
 	unsigned long count = 0;
-	int i;
+	ext4_group_t i;
 
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 488f829a8879..1ee19c918686 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2464,7 +2464,8 @@ out_stop:
 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext4_iloc *iloc)
 {
-	unsigned long desc, group_desc, block_group;
+	unsigned long desc, group_desc;
+	ext4_group_t block_group;
 	unsigned long offset;
 	ext4_fsblk_t block;
 	struct buffer_head *bh;
@@ -2551,7 +2552,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 			struct ext4_group_desc *desc;
 			int inodes_per_buffer;
 			int inode_offset, i;
-			int block_group;
+			ext4_group_t block_group;
 			int start;
 
 			block_group = (inode->i_ino - 1) /
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bd8a52bb3999..7090c2d25c76 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -28,7 +28,7 @@ static int verify_group_input(struct super_block *sb,
 	struct ext4_super_block *es = sbi->s_es;
 	ext4_fsblk_t start = ext4_blocks_count(es);
 	ext4_fsblk_t end = start + input->blocks_count;
-	unsigned group = input->group;
+	ext4_group_t group = input->group;
 	ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
 	unsigned overhead = ext4_bg_has_super(sb, group) ?
 		(1 + ext4_bg_num_gdb(sb, group) +
@@ -357,7 +357,7 @@ static int verify_reserved_gdb(struct super_block *sb,
 			       struct buffer_head *primary)
 {
 	const ext4_fsblk_t blk = primary->b_blocknr;
-	const unsigned long end = EXT4_SB(sb)->s_groups_count;
+	const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
 	unsigned seven = 7;
@@ -656,12 +656,12 @@ static void update_backups(struct super_block *sb,
 			   int blk_off, char *data, int size)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	const unsigned long last = sbi->s_groups_count;
+	const ext4_group_t last = sbi->s_groups_count;
 	const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
 	unsigned three = 1;
 	unsigned five = 5;
 	unsigned seven = 7;
-	unsigned group;
+	ext4_group_t group;
 	int rest = sb->s_blocksize - size;
 	handle_t *handle;
 	int err = 0, err2;
@@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb,
 exit_err:
 	if (err) {
 		ext4_warning(sb, __FUNCTION__,
-			     "can't update backup for group %d (err %d), "
+			     "can't update backup for group %lu (err %d), "
 			     "forcing fsck on next reboot", group, err);
 		sbi->s_mount_state &= ~EXT4_VALID_FS;
 		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -952,7 +952,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		      ext4_fsblk_t n_blocks_count)
 {
 	ext4_fsblk_t o_blocks_count;
-	unsigned long o_groups_count;
+	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
 	struct buffer_head * bh;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6302b036c121..df8842b43544 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1364,7 +1364,7 @@ static int ext4_check_descriptors (struct super_block * sb)
 	struct ext4_group_desc * gdp = NULL;
 	int desc_block = 0;
 	int flexbg_flag = 0;
-	int i;
+	ext4_group_t i;
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
@@ -1386,7 +1386,7 @@ static int ext4_check_descriptors (struct super_block * sb)
 		if (block_bitmap < first_block || block_bitmap > last_block)
 		{
 			ext4_error (sb, "ext4_check_descriptors",
-				    "Block bitmap for group %d"
+				    "Block bitmap for group %lu"
 				    " not in group (block %llu)!",
 				    i, block_bitmap);
 			return 0;
@@ -1395,7 +1395,7 @@ static int ext4_check_descriptors (struct super_block * sb)
 		if (inode_bitmap < first_block || inode_bitmap > last_block)
 		{
 			ext4_error (sb, "ext4_check_descriptors",
-				    "Inode bitmap for group %d"
+				    "Inode bitmap for group %lu"
 				    " not in group (block %llu)!",
 				    i, inode_bitmap);
 			return 0;
@@ -1405,17 +1405,16 @@ static int ext4_check_descriptors (struct super_block * sb)
 		    inode_table + sbi->s_itb_per_group - 1 > last_block)
 		{
 			ext4_error (sb, "ext4_check_descriptors",
-				    "Inode table for group %d"
+				    "Inode table for group %lu"
 				    " not in group (block %llu)!",
 				    i, inode_table);
 			return 0;
 		}
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
 			ext4_error(sb, __FUNCTION__,
-				   "Checksum for group %d failed (%u!=%u)\n", i,
-				   le16_to_cpu(ext4_group_desc_csum(sbi, i,
-								    gdp)),
-				   le16_to_cpu(gdp->bg_checksum));
+				   "Checksum for group %lu failed (%u!=%u)\n",
+				    i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+				    gdp)), le16_to_cpu(gdp->bg_checksum));
 			return 0;
 		}
 		if (!flexbg_flag)
@@ -1429,7 +1428,6 @@ static int ext4_check_descriptors (struct super_block * sb)
 	return 1;
 }
 
-
 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
  * the superblock) which were deleted from all directories, but held open by
  * a process at the time of a crash.  We walk the list and try to delete these
@@ -1570,7 +1568,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 				ext4_fsblk_t logical_sb_block, int nr)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned long bg, first_meta_bg;
+	ext4_group_t bg, first_meta_bg;
 	int has_super = 0;
 
 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
@@ -2678,7 +2676,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
 	if (test_opt(sb, MINIX_DF)) {
 		sbi->s_overhead_last = 0;
 	} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-		unsigned long ngroups = sbi->s_groups_count, i;
+		ext4_group_t ngroups = sbi->s_groups_count, i;
 		ext4_fsblk_t overhead = 0;
 		smp_rmb();
 
-- 
cgit v1.2.3


From 2aa9fc4c405467f6afbbb2162ff8afaced47d99b Mon Sep 17 00:00:00 2001
From: Avantika Mathur <mathur@us.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: fixes block group number being set to a negative value

This patch fixes various places where the group number is set to a negative
value.

Signed-off-by: Avantika Mathur <mathur@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 101 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 53 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 64dea8689e1f..7b5cfa62b663 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -260,12 +260,14 @@ error_return:
  * For other inodes, search forward from the parent directory\'s block
  * group to find a free inode.
  */
-static ext4_group_t find_group_dir(struct super_block *sb, struct inode *parent)
+static int find_group_dir(struct super_block *sb, struct inode *parent,
+				ext4_group_t *best_group)
 {
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	unsigned int freei, avefreei;
 	struct ext4_group_desc *desc, *best_desc = NULL;
-	ext4_group_t group, best_group = -1;
+	ext4_group_t group;
+	int ret = -1;
 
 	freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
 	avefreei = freei / ngroups;
@@ -279,11 +281,12 @@ static ext4_group_t find_group_dir(struct super_block *sb, struct inode *parent)
 		if (!best_desc ||
 		    (le16_to_cpu(desc->bg_free_blocks_count) >
 		     le16_to_cpu(best_desc->bg_free_blocks_count))) {
-			best_group = group;
+			*best_group = group;
 			best_desc = desc;
+			ret = 0;
 		}
 	}
-	return best_group;
+	return ret;
 }
 
 /*
@@ -314,8 +317,8 @@ static ext4_group_t find_group_dir(struct super_block *sb, struct inode *parent)
 #define INODE_COST 64
 #define BLOCK_COST 256
 
-static ext4_group_t find_group_orlov(struct super_block *sb,
-				      struct inode *parent)
+static int find_group_orlov(struct super_block *sb, struct inode *parent,
+				ext4_group_t *group)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -328,7 +331,7 @@ static ext4_group_t find_group_orlov(struct super_block *sb,
 	unsigned int ndirs;
 	int max_debt, max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	ext4_group_t group = -1, i;
+	ext4_group_t i;
 	struct ext4_group_desc *desc;
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
@@ -341,13 +344,14 @@ static ext4_group_t find_group_orlov(struct super_block *sb,
 	if ((parent == sb->s_root->d_inode) ||
 	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
 		int best_ndir = inodes_per_group;
-		ext4_group_t best_group = -1;
+		ext4_group_t grp;
+		int ret = -1;
 
-		get_random_bytes(&group, sizeof(group));
-		parent_group = (unsigned)group % ngroups;
+		get_random_bytes(&grp, sizeof(grp));
+		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
-			group = (parent_group + i) % ngroups;
-			desc = ext4_get_group_desc (sb, group, NULL);
+			grp = (parent_group + i) % ngroups;
+			desc = ext4_get_group_desc(sb, grp, NULL);
 			if (!desc || !desc->bg_free_inodes_count)
 				continue;
 			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
@@ -356,11 +360,12 @@ static ext4_group_t find_group_orlov(struct super_block *sb,
 				continue;
 			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
 				continue;
-			best_group = group;
+			*group = grp;
+			ret = 0;
 			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
 		}
-		if (best_group >= 0)
-			return best_group;
+		if (ret == 0)
+			return ret;
 		goto fallback;
 	}
 
@@ -381,8 +386,8 @@ static ext4_group_t find_group_orlov(struct super_block *sb,
 		max_debt = 1;
 
 	for (i = 0; i < ngroups; i++) {
-		group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc (sb, group, NULL);
+		*group = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc(sb, *group, NULL);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
@@ -391,17 +396,16 @@ static ext4_group_t find_group_orlov(struct super_block *sb,
 			continue;
 		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
 			continue;
-		return group;
+		return 0;
 	}
 
 fallback:
 	for (i = 0; i < ngroups; i++) {
-		group = (parent_group + i) % ngroups;
-		desc = ext4_get_group_desc (sb, group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
-			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
-			return group;
+		*group = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc(sb, *group, NULL);
+		if (desc && desc->bg_free_inodes_count &&
+			le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+			return 0;
 	}
 
 	if (avefreei) {
@@ -416,22 +420,22 @@ fallback:
 	return -1;
 }
 
-static ext4_group_t find_group_other(struct super_block *sb,
-					struct inode *parent)
+static int find_group_other(struct super_block *sb, struct inode *parent,
+				ext4_group_t *group)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	struct ext4_group_desc *desc;
-	ext4_group_t group, i;
+	ext4_group_t i;
 
 	/*
 	 * Try to place the inode in its parent directory
 	 */
-	group = parent_group;
-	desc = ext4_get_group_desc (sb, group, NULL);
+	*group = parent_group;
+	desc = ext4_get_group_desc(sb, *group, NULL);
 	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
 			le16_to_cpu(desc->bg_free_blocks_count))
-		return group;
+		return 0;
 
 	/*
 	 * We're going to place this inode in a different blockgroup from its
@@ -442,33 +446,33 @@ static ext4_group_t find_group_other(struct super_block *sb,
 	 *
 	 * So add our directory's i_ino into the starting point for the hash.
 	 */
-	group = (group + parent->i_ino) % ngroups;
+	*group = (*group + parent->i_ino) % ngroups;
 
 	/*
 	 * Use a quadratic hash to find a group with a free inode and some free
 	 * blocks.
 	 */
 	for (i = 1; i < ngroups; i <<= 1) {
-		group += i;
-		if (group >= ngroups)
-			group -= ngroups;
-		desc = ext4_get_group_desc (sb, group, NULL);
+		*group += i;
+		if (*group >= ngroups)
+			*group -= ngroups;
+		desc = ext4_get_group_desc(sb, *group, NULL);
 		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
 				le16_to_cpu(desc->bg_free_blocks_count))
-			return group;
+			return 0;
 	}
 
 	/*
 	 * That failed: try linear search for a free inode, even if that group
 	 * has no free blocks.
 	 */
-	group = parent_group;
+	*group = parent_group;
 	for (i = 0; i < ngroups; i++) {
-		if (++group >= ngroups)
-			group = 0;
-		desc = ext4_get_group_desc (sb, group, NULL);
+		if (++*group >= ngroups)
+			*group = 0;
+		desc = ext4_get_group_desc(sb, *group, NULL);
 		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
-			return group;
+			return 0;
 	}
 
 	return -1;
@@ -489,16 +493,17 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
-	ext4_group_t group;
+	ext4_group_t group = 0;
 	unsigned long ino = 0;
 	struct inode * inode;
 	struct ext4_group_desc * gdp = NULL;
 	struct ext4_super_block * es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
-	int err = 0;
+	int ret2, err = 0;
 	struct inode *ret;
-	int i, free = 0;
+	ext4_group_t i;
+	int free = 0;
 
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -514,14 +519,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	es = sbi->s_es;
 	if (S_ISDIR(mode)) {
 		if (test_opt (sb, OLDALLOC))
-			group = find_group_dir(sb, dir);
+			ret2 = find_group_dir(sb, dir, &group);
 		else
-			group = find_group_orlov(sb, dir);
+			ret2 = find_group_orlov(sb, dir, &group);
 	} else
-		group = find_group_other(sb, dir);
+		ret2 = find_group_other(sb, dir, &group);
 
 	err = -ENOSPC;
-	if (group == -1)
+	if (ret2 == -1)
 		goto out;
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
-- 
cgit v1.2.3


From 99e6f829a854daa6d56006cad51156e98863e73a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Introduce ext4_update_*_feature

Introduce the ext4_update_*_feature helpers (compat, rocompat and incompat)
and use them in place of the open-coded superblock feature-flag update in
ialloc.c.


Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/ialloc.c | 11 ++++-------
 fs/ext4/super.c  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7b5cfa62b663..00b152b92480 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -748,13 +748,10 @@ got:
 	if (test_opt(sb, EXTENTS)) {
 		EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
 		ext4_ext_tree_init(handle, inode);
-		if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
-			err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
-			if (err) goto fail;
-			EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
-			BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
-			err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
-		}
+		err = ext4_update_incompat_feature(handle, sb,
+						EXT4_FEATURE_INCOMPAT_EXTENTS);
+		if (err)
+			goto fail;
 	}
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df8842b43544..4d7f33f79552 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -373,6 +373,66 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 	 */
 }
 
+int ext4_update_compat_feature(handle_t *handle,
+					struct super_block *sb, __u32 compat)
+{
+	int err = 0;
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
+		err = ext4_journal_get_write_access(handle,
+				EXT4_SB(sb)->s_sbh);
+		if (err)
+			return err;
+		EXT4_SET_COMPAT_FEATURE(sb, compat);
+		sb->s_dirt = 1;
+		handle->h_sync = 1;
+		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
+					"call ext4_journal_dirty_met adata");
+		err = ext4_journal_dirty_metadata(handle,
+				EXT4_SB(sb)->s_sbh);
+	}
+	return err;
+}
+
+int ext4_update_rocompat_feature(handle_t *handle,
+					struct super_block *sb, __u32 rocompat)
+{
+	int err = 0;
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
+		err = ext4_journal_get_write_access(handle,
+				EXT4_SB(sb)->s_sbh);
+		if (err)
+			return err;
+		EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
+		sb->s_dirt = 1;
+		handle->h_sync = 1;
+		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
+					"call ext4_journal_dirty_met adata");
+		err = ext4_journal_dirty_metadata(handle,
+				EXT4_SB(sb)->s_sbh);
+	}
+	return err;
+}
+
+int ext4_update_incompat_feature(handle_t *handle,
+					struct super_block *sb, __u32 incompat)
+{
+	int err = 0;
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
+		err = ext4_journal_get_write_access(handle,
+				EXT4_SB(sb)->s_sbh);
+		if (err)
+			return err;
+		EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
+		sb->s_dirt = 1;
+		handle->h_sync = 1;
+		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
+					"call ext4_journal_dirty_met adata");
+		err = ext4_journal_dirty_metadata(handle,
+				EXT4_SB(sb)->s_sbh);
+	}
+	return err;
+}
+
 /*
  * Open the external journal device
  */
-- 
cgit v1.2.3


From 1d03ec984ca41ba184822d1101babb3fa3e26c77 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4:  Fix sparse warnings.

Fix sparse warnings related to static functions
and local variables.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/extents.c |  6 +++---
 fs/ext4/inode.c   | 18 +++++++++++-------
 fs/ext4/super.c   |  3 +++
 3 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 68537229ee1c..754c0d36d162 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1088,7 +1088,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
  * then we have to correct all indexes above.
  * TODO: do we need to correct tree in all cases?
  */
-int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
+static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path)
 {
 	struct ext4_extent_header *eh;
@@ -1535,7 +1535,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
  * It's used in truncate case only, thus all requests are for
  * last index in the block only.
  */
-int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
+static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path)
 {
 	struct buffer_head *bh;
@@ -1806,7 +1806,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1ee19c918686..76ceba2718b9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2052,11 +2052,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 	for (p = first; p < last; p++) {
 		u32 nr = le32_to_cpu(*p);
 		if (nr) {
-			struct buffer_head *bh;
+			struct buffer_head *tbh;
 
 			*p = 0;
-			bh = sb_find_get_block(inode->i_sb, nr);
-			ext4_forget(handle, 0, inode, bh, nr);
+			tbh = sb_find_get_block(inode->i_sb, nr);
+			ext4_forget(handle, 0, inode, tbh, nr);
 		}
 	}
 
@@ -2324,8 +2324,10 @@ void ext4_truncate(struct inode *inode)
 			return;
 	}
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-		return ext4_ext_truncate(inode, page);
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+		ext4_ext_truncate(inode, page);
+		return;
+	}
 
 	handle = start_transaction(inode);
 	if (IS_ERR(handle)) {
@@ -3163,8 +3165,10 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
  * Expand an inode by new_extra_isize bytes.
  * Returns 0 on success or negative error number on failure.
  */
-int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
-			struct ext4_iloc iloc, handle_t *handle)
+static int ext4_expand_extra_isize(struct inode *inode,
+				   unsigned int new_extra_isize,
+				   struct ext4_iloc iloc,
+				   handle_t *handle)
 {
 	struct ext4_inode *raw_inode;
 	struct ext4_xattr_ibody_header *header;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4d7f33f79552..7be27dbe76bf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1644,6 +1644,9 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 
 
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
+				__releases(kernel_sem)
+				__acquires(kernel_sem)
+
 {
 	struct buffer_head * bh;
 	struct ext4_super_block *es = NULL;
-- 
cgit v1.2.3


From 7973c0c19ecba92f113488045005f8e7ce1cd7c8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Rename i_file_acl to i_file_acl_lo

Rename i_file_acl to i_file_acl_lo. This helps
in finding bugs where we use i_file_acl instead
of the combined i_file_acl_lo and i_file_acl_high.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 76ceba2718b9..7bcec1860084 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2718,7 +2718,7 @@ void ext4_read_inode(struct inode * inode)
 	}
 	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		ei->i_file_acl |=
@@ -2866,7 +2866,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
-	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
 	if (!S_ISREG(inode->i_mode)) {
 		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
 	} else {
-- 
cgit v1.2.3


From a48380f769dfed6163fb82a68b13bd562ea1e027 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Rename i_dir_acl to i_size_high

Rename ext4_inode.i_dir_acl to i_size_high.
Drop ext4_inode_info.i_dir_acl as it is not used.
Rename ext4_inode.i_size to ext4_inode.i_size_lo.
Add helper functions for accessing the combined ext4_inode i_size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/ialloc.c |  1 -
 fs/ext4/inode.c  | 55 ++++++++++++++++++++++---------------------------------
 2 files changed, 22 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00b152b92480..17b5df14f85b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -709,7 +709,6 @@ got:
 	if (!S_ISDIR(mode))
 		ei->i_flags &= ~EXT4_DIRSYNC_FL;
 	ei->i_file_acl = 0;
-	ei->i_dir_acl = 0;
 	ei->i_dtime = 0;
 	ei->i_block_alloc_info = NULL;
 	ei->i_block_group = group;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7bcec1860084..e6634550cfc8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2694,7 +2694,6 @@ void ext4_read_inode(struct inode * inode)
 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
-	inode->i_size = le32_to_cpu(raw_inode->i_size);
 
 	ei->i_state = 0;
 	ei->i_dir_start_lookup = 0;
@@ -2720,15 +2719,11 @@ void ext4_read_inode(struct inode * inode)
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD))
+	    cpu_to_le32(EXT4_OS_HURD)) {
 		ei->i_file_acl |=
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
-	if (!S_ISREG(inode->i_mode)) {
-		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
-	} else {
-		inode->i_size |=
-			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
 	}
+	inode->i_size = ext4_isize(raw_inode);
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
@@ -2852,7 +2847,6 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_gid_high = 0;
 	}
 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
 
 	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
 	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
@@ -2867,32 +2861,27 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-	if (!S_ISREG(inode->i_mode)) {
-		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
-	} else {
-		raw_inode->i_size_high =
-			cpu_to_le32(ei->i_disksize >> 32);
-		if (ei->i_disksize > 0x7fffffffULL) {
-			struct super_block *sb = inode->i_sb;
-			if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
-			    EXT4_SB(sb)->s_es->s_rev_level ==
-					cpu_to_le32(EXT4_GOOD_OLD_REV)) {
-			       /* If this is the first large file
-				* created, add a flag to the superblock.
-				*/
-				err = ext4_journal_get_write_access(handle,
-						EXT4_SB(sb)->s_sbh);
-				if (err)
-					goto out_brelse;
-				ext4_update_dynamic_rev(sb);
-				EXT4_SET_RO_COMPAT_FEATURE(sb,
+	ext4_isize_set(raw_inode, ei->i_disksize);
+	if (ei->i_disksize > 0x7fffffffULL) {
+		struct super_block *sb = inode->i_sb;
+		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+				EXT4_SB(sb)->s_es->s_rev_level ==
+				cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+			/* If this is the first large file
+			 * created, add a flag to the superblock.
+			 */
+			err = ext4_journal_get_write_access(handle,
+					EXT4_SB(sb)->s_sbh);
+			if (err)
+				goto out_brelse;
+			ext4_update_dynamic_rev(sb);
+			EXT4_SET_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
-				sb->s_dirt = 1;
-				handle->h_sync = 1;
-				err = ext4_journal_dirty_metadata(handle,
-						EXT4_SB(sb)->s_sbh);
-			}
+			sb->s_dirt = 1;
+			handle->h_sync = 1;
+			err = ext4_journal_dirty_metadata(handle,
+					EXT4_SB(sb)->s_sbh);
 		}
 	}
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
-- 
cgit v1.2.3


From 0fc1b451471dfc3cabd6e99ef441df9804616e63 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext4: Add support for 48 bit inode i_blocks.

Use the __le16 l_i_reserved1 field of the linux2 struct of ext4_inode
to represent the higher 16 bits of i_blocks. With this change the maximum
file size becomes (2**48 - 1) * 512 bytes.

We add a RO_COMPAT feature to the super block to indicate that inodes
have i_blocks represented as a split 48-bit value. A super block with this
feature set cannot be mounted read-write on a kernel with CONFIG_LSF
disabled.

Super block flag EXT4_FEATURE_RO_COMPAT_HUGE_FILE

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext4/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 112 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e6634550cfc8..bb89fe727bb1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2667,6 +2667,22 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
 	if (flags & S_DIRSYNC)
 		ei->i_flags |= EXT4_DIRSYNC_FL;
 }
+static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+					struct ext4_inode_info *ei)
+{
+	blkcnt_t i_blocks ;
+	struct super_block *sb = ei->vfs_inode.i_sb;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		/* we are using combined 48 bit field */
+		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
+					le32_to_cpu(raw_inode->i_blocks_lo);
+		return i_blocks;
+	} else {
+		return le32_to_cpu(raw_inode->i_blocks_lo);
+	}
+}
 
 void ext4_read_inode(struct inode * inode)
 {
@@ -2715,8 +2731,8 @@ void ext4_read_inode(struct inode * inode)
 		 * recovery code: that's fine, we're about to complete
 		 * the process of deleting those. */
 	}
-	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD)) {
@@ -2799,6 +2815,43 @@ bad_inode:
 	return;
 }
 
+static int ext4_inode_blocks_set(handle_t *handle,
+				struct ext4_inode *raw_inode,
+				struct ext4_inode_info *ei)
+{
+	struct inode *inode = &(ei->vfs_inode);
+	u64 i_blocks = inode->i_blocks;
+	struct super_block *sb = inode->i_sb;
+	int err = 0;
+
+	if (i_blocks <= ~0U) {
+		/*
+		 * i_blocks can be represnted in a 32 bit variable
+		 * as multiple of 512 bytes
+		 */
+		raw_inode->i_blocks_lo   = cpu_to_le32((u32)i_blocks);
+		raw_inode->i_blocks_high = 0;
+	} else if (i_blocks <= 0xffffffffffffULL) {
+		/*
+		 * i_blocks can be represented in a 48 bit variable
+		 * as multiple of 512 bytes
+		 */
+		err = ext4_update_rocompat_feature(handle, sb,
+					    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+		if (err)
+			goto  err_out;
+		/* i_block is stored in the split  48 bit fields */
+		raw_inode->i_blocks_lo   = cpu_to_le32((u32)i_blocks);
+		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+	} else {
+		ext4_error(sb, __FUNCTION__,
+				"Wrong inode i_blocks count  %llu\n",
+				(unsigned long long)inode->i_blocks);
+	}
+err_out:
+	return err;
+}
+
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
@@ -2853,7 +2906,8 @@ static int ext4_do_update_inode(handle_t *handle,
 	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
 
-	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+	if (ext4_inode_blocks_set(handle, raw_inode, ei))
+		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7be27dbe76bf..2b9dc96ec43e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1603,17 +1603,50 @@ static void ext4_orphan_cleanup (struct super_block * sb,
 
 /*
  * Maximal file size.  There is a direct, and {,double-,triple-}indirect
- * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
- * We need to be 1 filesystem block less than the 2^32 sector limit.
+ * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
+ * We need to be 1 filesystem block less than the 2^48 sector limit.
  */
 static loff_t ext4_max_size(int bits)
 {
 	loff_t res = EXT4_NDIR_BLOCKS;
-	/* This constant is calculated to be the largest file size for a
-	 * dense, 4k-blocksize file such that the total number of
+	int meta_blocks;
+	loff_t upper_limit;
+	/* This is calculated to be the largest file size for a
+	 * dense, file such that the total number of
 	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^32. */
-	const loff_t upper_limit = 0x1ff7fffd000LL;
+	 * does not exceed 2^48 -1
+	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
+	 * total number of  512 bytes blocks of the file
+	 */
+
+	if (sizeof(blkcnt_t) < sizeof(u64)) {
+		/*
+		 * CONFIG_LSF is not enabled implies the inode
+		 * i_block represent total blocks in 512 bytes
+		 * 32 == size of vfs inode i_blocks * 8
+		 */
+		upper_limit = (1LL << 32) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (bits - 9);
+
+	} else {
+		/* We use 48 bit ext4_inode i_blocks */
+		upper_limit = (1LL << 48) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (bits - 9);
+	}
+
+	/* indirect blocks */
+	meta_blocks = 1;
+	/* double indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2));
+	/* tripple indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+	upper_limit -= meta_blocks;
+	upper_limit <<= bits;
 
 	res += 1LL << (bits-2);
 	res += 1LL << (2*(bits-2));
@@ -1621,6 +1654,10 @@ static loff_t ext4_max_size(int bits)
 	res <<= bits;
 	if (res > upper_limit)
 		res = upper_limit;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
 	return res;
 }
 
@@ -1789,6 +1826,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		       sb->s_id, le32_to_cpu(features));
 		goto failed_mount;
 	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		/*
+		 * Large file size enabled file system can only be
+		 * mount if kernel is build with CONFIG_LSF
+		 */
+		if (sizeof(root->i_blocks) < sizeof(u64) &&
+				!(sb->s_flags & MS_RDONLY)) {
+			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+					"files cannot be mounted read-write "
+					"without CONFIG_LSF.\n", sb->s_id);
+			goto failed_mount;
+		}
+	}
 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
-- 
cgit v1.2.3


From 8180a5627d126362c2f64e4fa886d6f608d9632a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Support large files

This patch converts ext4_inode i_blocks to represent total
blocks occupied by the inode in file system block size.
Earlier the variable used to represent this in 512 byte
block size. This actually limited the total size of the file.

The feature is enabled transparently when we write an inode
whose i_blocks cannot be represented as 512 byte units in a
48 bit variable.

inode flag  EXT4_HUGE_FILE_FL

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c | 32 +++++++++++++++++++++++++-------
 fs/ext4/super.c |  9 ++++++---
 2 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bb89fe727bb1..9cf85721d83c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2671,14 +2671,20 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 					struct ext4_inode_info *ei)
 {
 	blkcnt_t i_blocks ;
-	struct super_block *sb = ei->vfs_inode.i_sb;
+	struct inode *inode = &(ei->vfs_inode);
+	struct super_block *sb = inode->i_sb;
 
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
 		/* we are using combined 48 bit field */
 		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
 					le32_to_cpu(raw_inode->i_blocks_lo);
-		return i_blocks;
+		if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+			/* i_blocks represent file system block size */
+			return i_blocks  << (inode->i_blkbits - 9);
+		} else {
+			return i_blocks;
+		}
 	} else {
 		return le32_to_cpu(raw_inode->i_blocks_lo);
 	}
@@ -2829,8 +2835,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		 * i_blocks can be represnted in a 32 bit variable
 		 * as multiple of 512 bytes
 		 */
-		raw_inode->i_blocks_lo   = cpu_to_le32((u32)i_blocks);
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = 0;
+		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 	} else if (i_blocks <= 0xffffffffffffULL) {
 		/*
 		 * i_blocks can be represented in a 48 bit variable
@@ -2841,12 +2848,23 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		if (err)
 			goto  err_out;
 		/* i_block is stored in the split  48 bit fields */
-		raw_inode->i_blocks_lo   = cpu_to_le32((u32)i_blocks);
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 	} else {
-		ext4_error(sb, __FUNCTION__,
-				"Wrong inode i_blocks count  %llu\n",
-				(unsigned long long)inode->i_blocks);
+		/*
+		 * i_blocks should be represented in a 48 bit variable
+		 * as multiple of  file system block size
+		 */
+		err = ext4_update_rocompat_feature(handle, sb,
+					    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+		if (err)
+			goto  err_out;
+		ei->i_flags |= EXT4_HUGE_FILE_FL;
+		/* i_block is stored in file system block size */
+		i_blocks = i_blocks >> (inode->i_blkbits - 9);
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 	}
 err_out:
 	return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2b9dc96ec43e..64067de70c6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1631,11 +1631,14 @@ static loff_t ext4_max_size(int bits)
 		upper_limit >>= (bits - 9);
 
 	} else {
-		/* We use 48 bit ext4_inode i_blocks */
+		/*
+		 * We use 48 bit ext4_inode i_blocks
+		 * With EXT4_HUGE_FILE_FL set the i_blocks
+		 * represent total number of blocks in
+		 * file system block size
+		 */
 		upper_limit = (1LL << 48) - 1;
 
-		/* total blocks in file system block size */
-		upper_limit >>= (bits - 9);
 	}
 
 	/* indirect blocks */
-- 
cgit v1.2.3


From cd2291a463c26f60b18e0d9b1901be236dd7f402 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: different maxbytes functions for bitmap & extent	files

use 2 different maxbytes functions for bitmapped & extent-based
files.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/super.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 64067de70c6f..c79e46b7f159 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1600,19 +1600,58 @@ static void ext4_orphan_cleanup (struct super_block * sb,
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+/*
+ * Maximal extent format file size.
+ * Resulting logical blkno at s_maxbytes must fit in our on-disk
+ * extent format containers, within a sector_t, and within i_blocks
+ * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
+ * so that won't be a limiting factor.
+ *
+ * Note, this does *not* consider any metadata overhead for vfs i_blocks.
+ */
+static loff_t ext4_max_size(int blkbits)
+{
+	loff_t res;
+	loff_t upper_limit = MAX_LFS_FILESIZE;
+
+	/* small i_blocks in vfs inode? */
+	if (sizeof(blkcnt_t) < sizeof(u64)) {
+		/*
+		 * CONFIG_LSF is not enabled implies the inode
+		 * i_block represent total blocks in 512 bytes
+		 * 32 == size of vfs inode i_blocks * 8
+		 */
+		upper_limit = (1LL << 32) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (blkbits - 9);
+		upper_limit <<= blkbits;
+	}
+
+	/* 32-bit extent-start container, ee_block */
+	res = 1LL << 32;
+	res <<= blkbits;
+	res -= 1;
+
+	/* Sanity check against vm- & vfs- imposed limits */
+	if (res > upper_limit)
+		res = upper_limit;
+
+	return res;
+}
 
 /*
- * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+ * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
  * We need to be 1 filesystem block less than the 2^48 sector limit.
  */
-static loff_t ext4_max_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits)
 {
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
 	loff_t upper_limit;
 	/* This is calculated to be the largest file size for a
-	 * dense, file such that the total number of
+	 * dense, bitmapped file such that the total number of
 	 * sectors in the file, including data and all indirect blocks,
 	 * does not exceed 2^48 -1
 	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
-- 
cgit v1.2.3


From 19295529db35381d46dbaf246f69b4e3b3393996 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: export iov_shorten from kernel for ext4's use

Export iov_shorten() from kernel so that ext4 can
truncate too-large writes to bitmapped files.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/read_write.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index c4d3d17923f1..1c177f29e1b7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -446,6 +446,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 	}
 	return seg;
 }
+EXPORT_SYMBOL(iov_shorten);
 
 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
-- 
cgit v1.2.3


From e2b4657453c0d5571bd3c7256585c486ed42d364 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: store maxbytes for bitmapped  files and return EFBIG as
 appropriate

Calculate & store the max offset for bitmapped files, and
catch too-large seeks, truncates, and writes in ext4, shortening
or rejecting as appropriate.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/file.c  | 19 ++++++++++++++++++-
 fs/ext4/inode.c | 16 +++++++++++++++-
 fs/ext4/super.c |  1 +
 3 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a81cd66d63b..a6b2aa14626e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -56,8 +56,25 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret;
 	int err;
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+	/*
+	 * If we have encountered a bitmap-format file, the size limit
+	 * is smaller than s_maxbytes, which is for extent-mapped files.
+	 */
+
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+		size_t length = iov_length(iov, nr_segs);
 
+		if (pos > sbi->s_bitmap_maxbytes)
+			return -EFBIG;
+
+		if (pos + length > sbi->s_bitmap_maxbytes) {
+			nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
+					      sbi->s_bitmap_maxbytes - pos);
+		}
+	}
+
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	/*
 	 * Skip flushing if there was an error, or if nothing was written.
 	 */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9cf85721d83c..eaace1373ccb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -314,7 +314,10 @@ static int ext4_block_to_path(struct inode *inode,
 		offsets[n++] = i_block & (ptrs - 1);
 		final = ptrs;
 	} else {
-		ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
+		ext4_warning(inode->i_sb, "ext4_block_to_path",
+				"block %u > max",
+				i_block + direct_blocks +
+				indirect_blocks + double_blocks);
 	}
 	if (boundary)
 		*boundary = final - 1 - (i_block & (ptrs - 1));
@@ -3092,6 +3095,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		ext4_journal_stop(handle);
 	}
 
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+				error = -EFBIG;
+				goto err_out;
+			}
+		}
+	}
+
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
 		handle_t *handle;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c79e46b7f159..0931831537a2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1922,6 +1922,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		}
 	}
 
+	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
-- 
cgit v1.2.3


From 902be4c5efe0289594c3acf43da40fe7ff0a138b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext2: Fix the max file size for ext2 file system.

The max file size for the ext2 file system is currently calculated
with a hardcoded 4K block size. This patch fixes it to be
calculated with the actual block size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext2/super.c | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 154e25f13d77..6abaf75163f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -680,11 +680,31 @@ static int ext2_check_descriptors (struct super_block * sb)
 static loff_t ext2_max_size(int bits)
 {
 	loff_t res = EXT2_NDIR_BLOCKS;
-	/* This constant is calculated to be the largest file size for a
-	 * dense, 4k-blocksize file such that the total number of
+	int meta_blocks;
+	loff_t upper_limit;
+
+	/* This is calculated to be the largest file size for a
+	 * dense, file such that the total number of
 	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^32. */
-	const loff_t upper_limit = 0x1ff7fffd000LL;
+	 * does not exceed 2^32 -1
+	 * __u32 i_blocks representing the total number of
+	 * 512 bytes blocks of the file
+	 */
+	upper_limit = (1LL << 32) - 1;
+
+	/* total blocks in file system block size */
+	upper_limit >>= (bits - 9);
+
+
+	/* indirect blocks */
+	meta_blocks = 1;
+	/* double indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2));
+	/* tripple indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+	upper_limit -= meta_blocks;
+	upper_limit <<= bits;
 
 	res += 1LL << (bits-2);
 	res += 1LL << (2*(bits-2));
@@ -692,6 +712,10 @@ static loff_t ext2_max_size(int bits)
 	res <<= bits;
 	if (res > upper_limit)
 		res = upper_limit;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
 	return res;
 }
 
-- 
cgit v1.2.3


From fe7fdc37b5404afb068f928ceba7c3e591b501ca Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext3: Fix the max file size for ext3 file system.

The max file size for the ext3 file system is currently calculated
with a hardcoded 4K block size. This patch fixes it to be
calculated with the actual block size.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext3/super.c | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index cb14de1502c3..f3675cc630e9 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1436,11 +1436,31 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 static loff_t ext3_max_size(int bits)
 {
 	loff_t res = EXT3_NDIR_BLOCKS;
-	/* This constant is calculated to be the largest file size for a
-	 * dense, 4k-blocksize file such that the total number of
+	int meta_blocks;
+	loff_t upper_limit;
+
+	/* This is calculated to be the largest file size for a
+	 * dense, file such that the total number of
 	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^32. */
-	const loff_t upper_limit = 0x1ff7fffd000LL;
+	 * does not exceed 2^32 -1
+	 * __u32 i_blocks representing the total number of
+	 * 512 bytes blocks of the file
+	 */
+	upper_limit = (1LL << 32) - 1;
+
+	/* total blocks in file system block size */
+	upper_limit >>= (bits - 9);
+
+
+	/* indirect blocks */
+	meta_blocks = 1;
+	/* double indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2));
+	/* tripple indirect blocks */
+	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
+
+	upper_limit -= meta_blocks;
+	upper_limit <<= bits;
 
 	res += 1LL << (bits-2);
 	res += 1LL << (2*(bits-2));
@@ -1448,6 +1468,10 @@ static loff_t ext3_max_size(int bits)
 	res <<= bits;
 	if (res > upper_limit)
 		res = upper_limit;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
 	return res;
 }
 
-- 
cgit v1.2.3


From cb47dce79145d04634156fd18437e1e78af712e4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Return after ext4_error in case of failures

This fixes some instances where we were continuing after calling
ext4_error. ext4_error calls panic only if the errors=panic mount option
is set, so we need to make sure we return correctly after an ext4_error call.

Reported by: Adrian Bunk <bunk@kernel.org>

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/balloc.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9568a57c607c..ff3428e195b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -587,11 +587,13 @@ do_more:
 	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group))
+		     sbi->s_itb_per_group)) {
 		ext4_error (sb, "ext4_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %llu, count = %lu",
 			    block, count);
+		goto error_return;
+	}
 
 	/*
 	 * We are about to start releasing blocks in the bitmap,
@@ -1690,11 +1692,13 @@ allocated:
 	    in_range(ret_block, ext4_inode_table(sb, gdp),
 		     EXT4_SB(sb)->s_itb_per_group) ||
 	    in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group))
+		     EXT4_SB(sb)->s_itb_per_group)) {
 		ext4_error(sb, "ext4_new_block",
 			    "Allocating block in system zone - "
 			    "blocks from %llu, length %lu",
 			     ret_block, num);
+		goto out;
+	}
 
 	performed_allocation = 1;
 
-- 
cgit v1.2.3


From 07620f69eff6671fea6bd382c95709f757e33768 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4/super.c: fix #ifdef's (CONFIG_EXT4_* -> CONFIG_EXT4DEV_*)

Based on a report by Robert P. J. Day.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
---
 fs/ext4/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0931831537a2..1484a087bba0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -706,7 +706,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4_FS_XATTR
+#ifdef CONFIG_EXT4DEV_FS_XATTR
 	if (test_opt(sb, XATTR_USER))
 		seq_puts(seq, ",user_xattr");
 	if (!test_opt(sb, XATTR_USER) &&
@@ -714,7 +714,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nouser_xattr");
 	}
 #endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
-- 
cgit v1.2.3


From e7c95593001cb96ef5dd121a4523286c574c7133 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: fix oops on corrupted ext4 mount

When mounting an ext4 filesystem with corrupted s_first_data_block, things
can go very wrong and oops.

Because blocks_count in ext4_fill_super is a u64, and we must use do_div,
the calculation of db_count is done differently than on ext3.  If
first_data_block is corrupted such that it is larger than ext4_blocks_count,
for example, then the intermediate blocks_count value may go negative,
but sign-extend to a very large value:

        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);

This is then assigned to s_groups_count which is an unsigned long:

        sbi->s_groups_count = blocks_count;

This may result in a value of 0xFFFFFFFF which is then used to compute
db_count:

        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);

and in this case db_count will wind up as 0 because the addition overflows
32 bits.  This in turn causes the kmalloc for group_desc to be of 0 size:

        sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
                                    GFP_KERNEL);

and eventually in ext4_check_descriptors, dereferencing
sbi->s_group_desc[desc_block] will result in a NULL pointer dereference.

The simplest test seems to be to sanity check s_first_data_block,
EXT4_BLOCKS_PER_GROUP, and ext4_blocks_count values to be sure
their combination won't result in a bad intermediate value for
blocks_count.  We could just check for db_count == 0, but
catching it at the root cause seems like it provides more info.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/super.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1484a087bba0..32e3ecb35cd7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1997,6 +1997,17 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 
 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
+
+	/* ensure blocks_count calculation below doesn't sign-extend */
+	if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
+	    le32_to_cpu(es->s_first_data_block) + 1) {
+		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
+		       "first data block %u, blocks per group %lu\n",
+			ext4_blocks_count(es),
+			le32_to_cpu(es->s_first_data_block),
+			EXT4_BLOCKS_PER_GROUP(sb));
+		goto failed_mount;
+	}
 	blocks_count = (ext4_blocks_count(es) -
 			le32_to_cpu(es->s_first_data_block) +
 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
-- 
cgit v1.2.3


From bb4f397a1a7f2330cb173233599aa159f5780f58 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext4: Change the default behaviour on error

By default, the ext4 file system was ignoring errors and continuing. This
is not a good default, as continuing on error could lead to file system
corruption. Change the default to mark the file system
read-only. Debian and Ubuntu already do this by default in their
fstab.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/super.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32e3ecb35cd7..effd375ece80 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -688,16 +688,16 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
 		seq_printf(seq, ",resgid=%u", sbi->s_resgid);
 	}
-	if (test_opt(sb, ERRORS_CONT)) {
+	if (test_opt(sb, ERRORS_RO)) {
 		int def_errors = le16_to_cpu(es->s_errors);
 
 		if (def_errors == EXT4_ERRORS_PANIC ||
-		    def_errors == EXT4_ERRORS_RO) {
-			seq_puts(seq, ",errors=continue");
+		    def_errors == EXT4_ERRORS_CONTINUE) {
+			seq_puts(seq, ",errors=remount-ro");
 		}
 	}
-	if (test_opt(sb, ERRORS_RO))
-		seq_puts(seq, ",errors=remount-ro");
+	if (test_opt(sb, ERRORS_CONT))
+		seq_puts(seq, ",errors=continue");
 	if (test_opt(sb, ERRORS_PANIC))
 		seq_puts(seq, ",errors=panic");
 	if (test_opt(sb, NO_UID32))
@@ -1819,10 +1819,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 
 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
-	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
-		set_opt(sbi->s_mount_opt, ERRORS_RO);
-	else
+	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
 		set_opt(sbi->s_mount_opt, ERRORS_CONT);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_RO);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
-- 
cgit v1.2.3


From 389d1b083c767a360ec84b27a95da06244becec8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: Add buffer head related helper functions

Add the buffer head related helper functions bh_uptodate_or_lock and
bh_submit_read, which can be used by file systems.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/buffer.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e014819e..456c9ab7705b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3213,6 +3213,50 @@ static int buffer_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+/**
+ * bh_uptodate_or_lock: Test whether the buffer is uptodate
+ * @bh: struct buffer_head
+ *
+ * Return true if the buffer is up-to-date and false,
+ * with the buffer locked, if not.
+ */
+int bh_uptodate_or_lock(struct buffer_head *bh)
+{
+	if (!buffer_uptodate(bh)) {
+		lock_buffer(bh);
+		if (!buffer_uptodate(bh))
+			return 0;
+		unlock_buffer(bh);
+	}
+	return 1;
+}
+EXPORT_SYMBOL(bh_uptodate_or_lock);
+
+/**
+ * bh_submit_read: Submit a locked buffer for reading
+ * @bh: struct buffer_head
+ *
+ * Returns zero on success and -EIO on error.
+ */
+int bh_submit_read(struct buffer_head *bh)
+{
+	BUG_ON(!buffer_locked(bh));
+
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		return 0;
+	}
+
+	get_bh(bh);
+	bh->b_end_io = end_buffer_read_sync;
+	submit_bh(READ, bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return 0;
+	return -EIO;
+}
+EXPORT_SYMBOL(bh_submit_read);
+
 void __init buffer_init(void)
 {
 	int nrpages;
-- 
cgit v1.2.3


From abcb2947c91130426539f209f7a473a67a1f6663 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: add block bitmap validation

When a new block bitmap is read from disk in read_block_bitmap()
there are a few bits that should ALWAYS be set.  In particular,
the bits corresponding to the blocks that hold the block bitmap, the
inode bitmap and the inode table.
Validate the block bitmap against these blocks.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/balloc.c | 99 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 81 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ff3428e195b4..d460223b8e1d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -189,13 +189,65 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	return desc;
 }
 
+static int ext4_valid_block_bitmap(struct super_block *sb,
+					struct ext4_group_desc *desc,
+					unsigned int block_group,
+					struct buffer_head *bh)
+{
+	ext4_grpblk_t offset;
+	ext4_grpblk_t next_zero_bit;
+	ext4_fsblk_t bitmap_blk;
+	ext4_fsblk_t group_first_block;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		/* with FLEX_BG, the inode/block bitmaps and itable
+		 * blocks may not be in the group at all
+		 * so the bitmap validation will be skipped for those groups
+		 * or it has to also read the block group where the bitmaps
+		 * are located to verify they are set.
+		 */
+		return 1;
+	}
+	group_first_block = ext4_group_first_block_no(sb, block_group);
+
+	/* check whether block bitmap block number is set */
+	bitmap_blk = ext4_block_bitmap(sb, desc);
+	offset = bitmap_blk - group_first_block;
+	if (!ext4_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode bitmap block number is set */
+	bitmap_blk = ext4_inode_bitmap(sb, desc);
+	offset = bitmap_blk - group_first_block;
+	if (!ext4_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode table block number is set */
+	bitmap_blk = ext4_inode_table(sb, desc);
+	offset = bitmap_blk - group_first_block;
+	next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
+				offset + EXT4_SB(sb)->s_itb_per_group,
+				offset);
+	if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
+		/* good bitmap for inode tables */
+		return 1;
+
+err_out:
+	ext4_error(sb, __FUNCTION__,
+			"Invalid block bitmap - "
+			"block_group = %d, block = %llu",
+			block_group, bitmap_blk);
+	return 0;
+}
 /**
  * read_block_bitmap()
  * @sb:			super block
  * @block_group:	given block group
  *
- * Read the bitmap for a given block_group, reading into the specified
- * slot in the superblock's bitmap cache.
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
  *
  * Return buffer_head on success or NULL in case of failure.
  */
@@ -210,25 +262,36 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (!desc)
 		return NULL;
 	bitmap_blk = ext4_block_bitmap(sb, desc);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext4_error(sb, __FUNCTION__,
+			    "Cannot read block bitmap - "
+			    "block_group = %d, block_bitmap = %llu",
+			    (int)block_group, (unsigned long long)bitmap_blk);
+		return NULL;
+	}
+	if (bh_uptodate_or_lock(bh))
+		return bh;
+
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		bh = sb_getblk(sb, bitmap_blk);
-		if (!buffer_uptodate(bh)) {
-			lock_buffer(bh);
-			if (!buffer_uptodate(bh)) {
-				ext4_init_block_bitmap(sb, bh, block_group,
-						       desc);
-				set_buffer_uptodate(bh);
-			}
-			unlock_buffer(bh);
-		}
-	} else {
-		bh = sb_bread(sb, bitmap_blk);
+		ext4_init_block_bitmap(sb, bh, block_group, desc);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
 	}
-	if (!bh)
-		ext4_error (sb, __FUNCTION__,
+	if (bh_submit_read(bh) < 0) {
+		put_bh(bh);
+		ext4_error(sb, __FUNCTION__,
 			    "Cannot read block bitmap - "
-			    "block_group = %lu, block_bitmap = %llu",
-			    block_group, bitmap_blk);
+			    "block_group = %d, block_bitmap = %llu",
+			    (int)block_group, (unsigned long long)bitmap_blk);
+		return NULL;
+	}
+	if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
+
 	return bh;
 }
 /*
-- 
cgit v1.2.3


From f5a7a6b0d9b6af7d46124ed3f6b3995225cb62d0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: jbd2: Fix assertion failure in fs/jbd2/checkpoint.c

Before we start committing a transaction, we call
__journal_clean_checkpoint_list() to cleanup transaction's written-back
buffers.

If this call happens to remove all of them (and there were already some
buffers), __journal_remove_checkpoint() will decide to free the transaction
because it isn't (yet) a committing transaction and soon we fail some
assertion - the transaction really isn't ready to be freed :).

We change the check in __journal_remove_checkpoint() to free only a
transaction in T_FINISHED state.  The locking there is subtle though (as
everywhere in JBD ;().  We use j_list_lock to protect the check and a
subsequent call to __journal_drop_transaction() and do the same in the end
of journal_commit_transaction() which is the only place where a transaction
can get to T_FINISHED state.

Probably I'm too paranoid here and such locking is not really necessary -
checkpoint lists are processed only from log_do_checkpoint() where a
transaction must be already committed to be processed or from
__journal_clean_checkpoint_list() where kjournald itself calls it and thus
transaction cannot change state either.  Better be safe if something
changes in future...

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/jbd2/checkpoint.c | 12 ++++++------
 fs/jbd2/commit.c     |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 3fccde7ba008..7e958c86242f 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -602,15 +602,15 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 
 	/*
 	 * There is one special case to worry about: if we have just pulled the
-	 * buffer off a committing transaction's forget list, then even if the
-	 * checkpoint list is empty, the transaction obviously cannot be
-	 * dropped!
+	 * buffer off a running or committing transaction's checkpoing list,
+	 * then even if the checkpoint list is empty, the transaction obviously
+	 * cannot be dropped!
 	 *
-	 * The locking here around j_committing_transaction is a bit sleazy.
+	 * The locking here around t_state is a bit sleazy.
 	 * See the comment at the end of jbd2_journal_commit_transaction().
 	 */
-	if (transaction == journal->j_committing_transaction) {
-		JBUFFER_TRACE(jh, "belongs to committing transaction");
+	if (transaction->t_state != T_FINISHED) {
+		JBUFFER_TRACE(jh, "belongs to running/committing transaction");
 		goto out;
 	}
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6986f334c643..39b5cee3dd8a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -867,10 +867,10 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 	/*
-	 * This is a bit sleazy.  We borrow j_list_lock to protect
-	 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
-	 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
-	 * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint
+	 * This is a bit sleazy.  We use j_list_lock to protect transition
+	 * of a transaction into T_FINISHED state and calling
+	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
+	 * other checkpointing code processing the transaction...
 	 */
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
-- 
cgit v1.2.3


From 221879c927df05280283a4de6124806c17cc44d4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Check for the correct error return from ext4_ext_get_blocks

ext4_ext_get_blocks returns negative values on error, so checking only
for a return value of 0 misses failures. We should check for ret <= 0.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/extents.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 754c0d36d162..8593e59020fe 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2462,12 +2462,12 @@ retry:
 		ret = ext4_ext_get_blocks(handle, inode, block,
 					  max_blocks, &map_bh,
 					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
-		WARN_ON(!ret);
-		if (!ret) {
+		WARN_ON(ret <= 0);
+		if (ret <= 0) {
 			ext4_error(inode->i_sb, "ext4_fallocate",
-				   "ext4_ext_get_blocks returned 0! inode#%lu"
-				   ", block=%u, max_blocks=%lu",
-				   inode->i_ino, block, max_blocks);
+				    "ext4_ext_get_blocks returned error: "
+				    "inode#%lu, block=%u, max_blocks=%lu",
+				    inode->i_ino, block, max_blocks);
 			ret = -EIO;
 			ext4_mark_inode_dirty(handle, inode);
 			ret2 = ext4_journal_stop(handle);
-- 
cgit v1.2.3


From 01f4adc04480a4e0395906d0268c056cf09c39c0 Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: remove unused code from ext4_find_entry()

The unused code found in ext3_find_entry() is also present (and still
unused) in the ext4_find_entry() code. This patch removes it.

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fb673b14ccd5..67b6d8a1ceff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -861,14 +861,10 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 	int i, err;
 	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
-	const u8 *name;
-	unsigned blocksize;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	blocksize = sb->s_blocksize;
 	namelen = dentry->d_name.len;
-	name = dentry->d_name.name;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 	if (is_dx(dir)) {
-- 
cgit v1.2.3


From c278bfecebfb1ed67c326ef472660878baa745cd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Make ext4_get_blocks_wrap take the truncate_mutex early.

When migrating an inode from ext3 to ext4 format we need to make sure the
test for inode type and the walk of the inode data happen inside the lock.
To make this happen, take truncate_mutex early, before checking i_flags.

This also enables us to remove verify_chain().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/extents.c |  9 +++++---
 fs/ext4/inode.c   | 69 +++++++------------------------------------------------
 2 files changed, 14 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8593e59020fe..ec5019fa552f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2129,6 +2129,10 @@ out:
 	return err ? err : allocated;
 }
 
+/*
+ * Need to be called with
+ * mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
@@ -2144,7 +2148,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%lu requested for inode %u\n",
 			iblock, max_blocks, inode->i_ino);
-	mutex_lock(&EXT4_I(inode)->truncate_mutex);
 
 	/* check in cache */
 	goal = ext4_ext_in_cache(inode, iblock, &newex);
@@ -2318,8 +2321,6 @@ out2:
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
-	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
-
 	return err ? err : allocated;
 }
 
@@ -2449,6 +2450,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
 	 */
 	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+	mutex_lock(&EXT4_I(inode)->truncate_mutex)
 retry:
 	while (ret >= 0 && ret < max_blocks) {
 		block = block + ret;
@@ -2505,6 +2507,7 @@ retry:
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
+	mutex_unlock(&EXT4_I(inode)->truncate_mutex)
 	/*
 	 * Time to update the file size.
 	 * Update only when preallocation was requested beyond the file size.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index eaace1373ccb..71c7ad0c6723 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -243,13 +243,6 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
 	p->bh = bh;
 }
 
-static int verify_chain(Indirect *from, Indirect *to)
-{
-	while (from <= to && from->key == *from->p)
-		from++;
-	return (from > to);
-}
-
 /**
  *	ext4_block_to_path - parse the block number into array of offsets
  *	@inode: inode in question (we are only interested in its superblock)
@@ -348,10 +341,11 @@ static int ext4_block_to_path(struct inode *inode,
  *		(pointer to last triple returned, *@err == 0)
  *	or when it gets an IO error reading an indirect block
  *		(ditto, *@err == -EIO)
- *	or when it notices that chain had been changed while it was reading
- *		(ditto, *@err == -EAGAIN)
  *	or when it reads all @depth-1 indirect blocks successfully and finds
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ *      Need to be called with
+ *      mutex_lock(&EXT4_I(inode)->truncate_mutex)
  */
 static Indirect *ext4_get_branch(struct inode *inode, int depth,
 				 ext4_lblk_t  *offsets,
@@ -370,9 +364,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 		bh = sb_bread(sb, le32_to_cpu(p->key));
 		if (!bh)
 			goto failure;
-		/* Reader: pointers */
-		if (!verify_chain(chain, p))
-			goto changed;
 		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
@@ -380,10 +371,6 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	}
 	return NULL;
 
-changed:
-	brelse(bh);
-	*err = -EAGAIN;
-	goto no_block;
 failure:
 	*err = -EIO;
 no_block:
@@ -787,6 +774,10 @@ err_out:
  * return > 0, # of blocks mapped or allocated.
  * return = 0, if plain lookup failed.
  * return < 0, error case.
+ *
+ *
+ * Need to be called with
+ * mutex_lock(&EXT4_I(inode)->truncate_mutex)
  */
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		ext4_lblk_t iblock, unsigned long maxblocks,
@@ -825,18 +816,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		while (count < maxblocks && count <= blocks_to_boundary) {
 			ext4_fsblk_t blk;
 
-			if (!verify_chain(chain, partial)) {
-				/*
-				 * Indirect block might be removed by
-				 * truncate while we were reading it.
-				 * Handling of that case: forget what we've
-				 * got now. Flag the err as EAGAIN, so it
-				 * will reread.
-				 */
-				err = -EAGAIN;
-				count = 0;
-				break;
-			}
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
 
 			if (blk == first_block + count)
@@ -844,44 +823,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 			else
 				break;
 		}
-		if (err != -EAGAIN)
-			goto got_it;
+		goto got_it;
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
 	if (!create || err == -EIO)
 		goto cleanup;
 
-	mutex_lock(&ei->truncate_mutex);
-
-	/*
-	 * If the indirect block is missing while we are reading
-	 * the chain(ext4_get_branch() returns -EAGAIN err), or
-	 * if the chain has been changed after we grab the semaphore,
-	 * (either because another process truncated this branch, or
-	 * another get_block allocated this branch) re-grab the chain to see if
-	 * the request block has been allocated or not.
-	 *
-	 * Since we already block the truncate/other get_block
-	 * at this point, we will have the current copy of the chain when we
-	 * splice the branch into the tree.
-	 */
-	if (err == -EAGAIN || !verify_chain(chain, partial)) {
-		while (partial > chain) {
-			brelse(partial->bh);
-			partial--;
-		}
-		partial = ext4_get_branch(inode, depth, offsets, chain, &err);
-		if (!partial) {
-			count++;
-			mutex_unlock(&ei->truncate_mutex);
-			if (err)
-				goto cleanup;
-			clear_buffer_new(bh_result);
-			goto got_it;
-		}
-	}
-
 	/*
 	 * Okay, we need to do block allocation.  Lazily initialize the block
 	 * allocation info here if necessary
@@ -923,7 +871,6 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 	*/
 	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
 		ei->i_disksize = inode->i_size;
-	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
 
-- 
cgit v1.2.3


From 0e855ac8b103ef579052936b59fe7c599ac422a4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext4: Convert truncate_mutex to read write semaphore.

We currently take the truncate_mutex for every read. This has a
performance impact on configurations with a large number of CPUs. Convert
the lock to a read-write semaphore (i_data_sem) and take only the read lock
when we are reading the file.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/balloc.c  |  2 +-
 fs/ext4/extents.c | 13 +++++++------
 fs/ext4/file.c    |  4 ++--
 fs/ext4/inode.c   | 40 +++++++++++++++++++++++++++++++++-------
 fs/ext4/ioctl.c   |  4 ++--
 fs/ext4/super.c   |  2 +-
 6 files changed, 46 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d460223b8e1d..7ae223ed152f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -526,7 +526,7 @@ static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
  * when setting the reservation window size through ioctl before the file
  * is open for write (needs block allocation).
  *
- * Needs truncate_mutex protection prior to call this function.
+ * Needs down_write(i_data_sem) protection prior to call this function.
  */
 void ext4_init_block_alloc_info(struct inode *inode)
 {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ec5019fa552f..03d1bbb78a2f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1565,7 +1565,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
  * This routine returns max. credits that the extent tree can consume.
  * It should be OK for low-performance paths like ->writepage()
  * To allow many writing processes to fit into a single transaction,
- * the caller should calculate credits under truncate_mutex and
+ * the caller should calculate credits under i_data_sem and
  * pass the actual path.
  */
 int ext4_ext_calc_credits_for_insert(struct inode *inode,
@@ -2131,7 +2131,8 @@ out:
 
 /*
  * Need to be called with
- * mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
@@ -2350,7 +2351,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	if (page)
 		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
 
-	mutex_lock(&EXT4_I(inode)->truncate_mutex);
+	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 
 	/*
@@ -2386,7 +2387,7 @@ out_stop:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+	up_write(&EXT4_I(inode)->i_data_sem);
 	ext4_journal_stop(handle);
 }
 
@@ -2450,7 +2451,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
 	 */
 	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
-	mutex_lock(&EXT4_I(inode)->truncate_mutex)
+	down_write((&EXT4_I(inode)->i_data_sem));
 retry:
 	while (ret >= 0 && ret < max_blocks) {
 		block = block + ret;
@@ -2507,7 +2508,7 @@ retry:
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
-	mutex_unlock(&EXT4_I(inode)->truncate_mutex)
+	up_write((&EXT4_I(inode)->i_data_sem));
 	/*
 	 * Time to update the file size.
 	 * Update only when preallocation was requested beyond the file size.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a6b2aa14626e..ac35ec58db55 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -37,9 +37,9 @@ static int ext4_release_file (struct inode * inode, struct file * filp)
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
 	{
-		mutex_lock(&EXT4_I(inode)->truncate_mutex);
+		down_write(&EXT4_I(inode)->i_data_sem);
 		ext4_discard_reservation(inode);
-		mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	if (is_dx(inode) && filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71c7ad0c6723..a7eb8bb4bdd4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -308,7 +308,7 @@ static int ext4_block_to_path(struct inode *inode,
 		final = ptrs;
 	} else {
 		ext4_warning(inode->i_sb, "ext4_block_to_path",
-				"block %u > max",
+				"block %lu > max",
 				i_block + direct_blocks +
 				indirect_blocks + double_blocks);
 	}
@@ -345,7 +345,7 @@ static int ext4_block_to_path(struct inode *inode,
  *	the whole chain, all way to the data (returns %NULL, *err == 0).
  *
  *      Need to be called with
- *      mutex_lock(&EXT4_I(inode)->truncate_mutex)
+ *      down_read(&EXT4_I(inode)->i_data_sem)
  */
 static Indirect *ext4_get_branch(struct inode *inode, int depth,
 				 ext4_lblk_t  *offsets,
@@ -777,7 +777,8 @@ err_out:
  *
  *
  * Need to be called with
- * mutex_lock(&EXT4_I(inode)->truncate_mutex)
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		ext4_lblk_t iblock, unsigned long maxblocks,
@@ -865,7 +866,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		err = ext4_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
 	/*
-	 * i_disksize growing is protected by truncate_mutex.  Don't forget to
+	 * i_disksize growing is protected by i_data_sem.  Don't forget to
 	 * protect it if you're about to implement concurrent
 	 * ext4_get_block() -bzzz
 	*/
@@ -895,6 +896,31 @@ out:
 
 #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
 
+int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
+			unsigned long max_blocks, struct buffer_head *bh,
+			int create, int extend_disksize)
+{
+	int retval;
+	if (create) {
+		down_write((&EXT4_I(inode)->i_data_sem));
+	} else {
+		down_read((&EXT4_I(inode)->i_data_sem));
+	}
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+				bh, create, extend_disksize);
+	} else {
+		retval = ext4_get_blocks_handle(handle, inode, block,
+				max_blocks, bh, create, extend_disksize);
+	}
+	if (create) {
+		up_write((&EXT4_I(inode)->i_data_sem));
+	} else {
+		up_read((&EXT4_I(inode)->i_data_sem));
+	}
+	return retval;
+}
+
 static int ext4_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -1399,7 +1425,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
  *	ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
  *
  * Same applies to ext4_get_block().  We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
+ * lock_journal and i_data_sem
  *
  * Setting PF_MEMALLOC here doesn't work - too many internal memory
  * allocations fail.
@@ -2325,7 +2351,7 @@ void ext4_truncate(struct inode *inode)
 	 * From here we block out all ext4_get_block() callers who want to
 	 * modify the block allocation tree.
 	 */
-	mutex_lock(&ei->truncate_mutex);
+	down_write(&ei->i_data_sem);
 
 	if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2389,7 +2415,7 @@ do_indirects:
 
 	ext4_discard_reservation(inode);
 
-	mutex_unlock(&ei->truncate_mutex);
+	up_write(&ei->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e7f894bdb420..c0e5b8cf635c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -199,7 +199,7 @@ flags_err:
 		 * need to allocate reservation structure for this inode
 		 * before set the window size
 		 */
-		mutex_lock(&ei->truncate_mutex);
+		down_write(&ei->i_data_sem);
 		if (!ei->i_block_alloc_info)
 			ext4_init_block_alloc_info(inode);
 
@@ -207,7 +207,7 @@ flags_err:
 			struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
 			rsv->rsv_goal_size = rsv_window_size;
 		}
-		mutex_unlock(&ei->truncate_mutex);
+		up_write(&ei->i_data_sem);
 		return 0;
 	}
 	case EXT4_IOC_GROUP_EXTEND: {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index effd375ece80..c7305443e100 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -593,7 +593,7 @@ static void init_once(struct kmem_cache *cachep, void *foo)
 #ifdef CONFIG_EXT4DEV_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
-	mutex_init(&ei->truncate_mutex);
+	init_rwsem(&ei->i_data_sem);
 	inode_init_once(&ei->vfs_inode);
 }
 
-- 
cgit v1.2.3


From 4df3d265bf8f3762e1d77f554ee279c39dedb020 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:29 -0500
Subject: ext4: Take read lock during overwrite case.

When we are overwriting a file and not actually allocating new file system
blocks we need to take only the read lock on i_data_sem.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/inode.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a7eb8bb4bdd4..89cd35386ff5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -901,11 +901,31 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			int create, int extend_disksize)
 {
 	int retval;
-	if (create) {
-		down_write((&EXT4_I(inode)->i_data_sem));
+	/*
+	 * Try to see if we can get  the block without requesting
+	 * for new file system block.
+	 */
+	down_read((&EXT4_I(inode)->i_data_sem));
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+				bh, 0, 0);
 	} else {
-		down_read((&EXT4_I(inode)->i_data_sem));
+		retval = ext4_get_blocks_handle(handle,
+				inode, block, max_blocks, bh, 0, 0);
 	}
+	up_read((&EXT4_I(inode)->i_data_sem));
+	if (!create || (retval > 0))
+		return retval;
+
+	/*
+	 * We need to allocate new blocks which will result
+	 * in i_data update
+	 */
+	down_write((&EXT4_I(inode)->i_data_sem));
+	/*
+	 * We need to check for EXT4 here because migrate
+	 * could have changed the inode type in between
+	 */
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, create, extend_disksize);
@@ -913,11 +933,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		retval = ext4_get_blocks_handle(handle, inode, block,
 				max_blocks, bh, create, extend_disksize);
 	}
-	if (create) {
-		up_write((&EXT4_I(inode)->i_data_sem));
-	} else {
-		up_read((&EXT4_I(inode)->i_data_sem));
-	}
+	up_write((&EXT4_I(inode)->i_data_sem));
 	return retval;
 }
 
-- 
cgit v1.2.3


From 8e85fb3f305b24b79c6d9cb7a56d22b062335ad3 Mon Sep 17 00:00:00 2001
From: Johann Lombardi <johann.lombardi@bull.net>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: jbd2: jbd2 stats through procfs

The patch below updates the jbd stats patch to 2.6.20/jbd2.
The initial patch was posted by Alex Tomas in December 2005
(http://marc.info/?l=linux-ext4&m=113538565128617&w=2).
It provides statistics via procfs such as transaction lifetime and size.

Sometimes, when investigating performance problems, I find it useful to
have stats from jbd about a transaction's lifetime, size, etc. Here is a
patch for review and possible inclusion.

for example, stats after creation of 3M files in htree directory:

[root@bob ~]# cat /proc/fs/jbd/sda/history
R/C  tid   wait  run   lock  flush log   hndls  block inlog ctime write drop  close
R    261   8260  2720  0     0     750   9892   8170  8187
C    259                                                    750   0     4885  1
R    262   20    2200  10    0     770   9836   8170  8187
R    263   30    2200  10    0     3070  9812   8170  8187
R    264   0     5000  10    0     1340  0      0     0
C    261                                                    8240  3212  4957  0
R    265   8260  1470  0     0     4640  9854   8170  8187
R    266   0     5000  10    0     1460  0      0     0
C    262                                                    8210  2989  4868  0
R    267   8230  1490  10    0     4440  9875   8171  8188
R    268   0     5000  10    0     1260  0      0     0
C    263                                                    7710  2937  4908  0
R    269   7730  1470  10    0     3330  9841   8170  8187
R    270   0     5000  10    0     830   0      0     0
C    265                                                    8140  3234  4898  0
C    267                                                    720   0     4849  1
R    271   8630  2740  20    0     740   9819   8170  8187
C    269                                                    800   0     4214  1
R    272   40    2170  10    0     830   9716   8170  8187
R    273   40    2280  0     0     3530  9799   8170  8187
R    274   0     5000  10    0     990   0      0     0


where,

R     - line for transaction's life from T_RUNNING to T_FINISHED
C     - line for transaction's checkpointing
tid   - transaction's id
wait  - for how long we were waiting for new transaction to start
         (the longest period journal_start() took in this transaction)
run   - real transaction's lifetime (from T_RUNNING to T_LOCKED)
lock  - how long we were waiting for all handles to close
         (time the transaction was in T_LOCKED)
flush - how long it took to flush all data (data=ordered)
log   - how long it took to write the transaction to the log
hndls - how many handles got to the transaction
block - how many blocks got to the transaction
inlog - how many blocks are written to the log (block + descriptors)
ctime - how long it took to checkpoint the transaction
write - how many blocks have been written during checkpointing
drop  - how many blocks have been dropped during checkpointing
close - how many running transactions have been closed to checkpoint this one

all times are in msec.


[root@bob ~]# cat /proc/fs/jbd/sda/info
280 transaction, each upto 8192 blocks
average:
  1633ms waiting for transaction
  3616ms running transaction
  5ms transaction was being locked
  1ms flushing data (in ordered mode)
  1799ms logging transaction
  11781 handles per transaction
  5629 blocks per transaction
  5641 logged blocks per transaction

Signed-off-by: Johann Lombardi <johann.lombardi@bull.net>
Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/jbd2/checkpoint.c  |  10 +-
 fs/jbd2/commit.c      |  49 ++++++++
 fs/jbd2/journal.c     | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jbd2/transaction.c |   9 ++
 4 files changed, 404 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 7e958c86242f..1b7f282c1ae9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count)
+			struct buffer_head **bhs, int *batch_count,
+			transaction_t *transaction)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
@@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		transaction_t *t = jh->b_transaction;
 		tid_t tid = t->t_tid;
 
+		transaction->t_chp_stats.cs_forced_to_close++;
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
 		jbd2_log_start_commit(journal, tid);
@@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		bhs[*batch_count] = bh;
 		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
+		transaction->t_chp_stats.cs_written++;
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
 			spin_unlock(&journal->j_list_lock);
@@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	if (!journal->j_checkpoint_transactions)
 		goto out;
 	transaction = journal->j_checkpoint_transactions;
+	if (transaction->t_chp_stats.cs_chp_time == 0)
+		transaction->t_chp_stats.cs_chp_time = jiffies;
 	this_tid = transaction->t_tid;
 restart:
 	/*
@@ -346,7 +351,8 @@ restart:
 				retry = 1;
 				break;
 			}
-			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			retry = __process_buffer(journal, jh, bhs, &batch_count,
+						 transaction);
 			if (!retry && lock_need_resched(&journal->j_list_lock)){
 				spin_unlock(&journal->j_list_lock);
 				retry = 1;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 39b5cee3dd8a..8749a86f4175 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/jiffies.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -290,6 +291,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
  */
 void jbd2_journal_commit_transaction(journal_t *journal)
 {
+	struct transaction_stats_s stats;
 	transaction_t *commit_transaction;
 	struct journal_head *jh, *new_jh, *descriptor;
 	struct buffer_head **wbuf = journal->j_wbuf;
@@ -337,6 +339,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	stats.u.run.rs_wait = commit_transaction->t_max_wait;
+	stats.u.run.rs_locked = jiffies;
+	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
+						stats.u.run.rs_locked);
+
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -407,6 +414,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	stats.u.run.rs_flushing = jiffies;
+	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
+					       stats.u.run.rs_flushing);
+
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
 	journal->j_running_transaction = NULL;
@@ -498,6 +509,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	commit_transaction->t_state = T_COMMIT;
 
+	stats.u.run.rs_logging = jiffies;
+	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
+						 stats.u.run.rs_logging);
+	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
+	stats.u.run.rs_blocks_logged = 0;
+
 	descriptor = NULL;
 	bufs = 0;
 	while (commit_transaction->t_buffers) {
@@ -646,6 +663,7 @@ start_journal_io:
 				submit_bh(WRITE, bh);
 			}
 			cond_resched();
+			stats.u.run.rs_blocks_logged += bufs;
 
 			/* Force a new descriptor to be generated next
                            time round the loop. */
@@ -816,6 +834,7 @@ restart_loop:
 		cp_transaction = jh->b_cp_transaction;
 		if (cp_transaction) {
 			JBUFFER_TRACE(jh, "remove from old cp transaction");
+			cp_transaction->t_chp_stats.cs_dropped++;
 			__jbd2_journal_remove_checkpoint(jh);
 		}
 
@@ -890,6 +909,36 @@ restart_loop:
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
+	commit_transaction->t_start = jiffies;
+	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
+						commit_transaction->t_start);
+
+	/*
+	 * File the transaction for history
+	 */
+	stats.ts_type = JBD2_STATS_RUN;
+	stats.ts_tid = commit_transaction->t_tid;
+	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
+	spin_lock(&journal->j_history_lock);
+	memcpy(journal->j_history + journal->j_history_cur, &stats,
+			sizeof(stats));
+	if (++journal->j_history_cur == journal->j_history_max)
+		journal->j_history_cur = 0;
+
+	/*
+	 * Calculate overall stats
+	 */
+	journal->j_stats.ts_tid++;
+	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
+	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
+	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
+	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
+	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
+	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
+	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
+	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
+	spin_unlock(&journal->j_history_lock);
+
 	commit_transaction->t_state = T_FINISHED;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6ddc5531587c..3667c91bc786 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -36,6 +36,7 @@
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 	return jbd2_journal_add_journal_head(bh);
 }
 
+struct jbd2_stats_proc_session {
+	journal_t *journal;
+	struct transaction_stats_s *stats;
+	int start;
+	int max;
+};
+
+static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
+					struct transaction_stats_s *ts,
+					int first)
+{
+	if (ts == s->stats + s->max)
+		ts = s->stats;
+	if (!first && ts == s->stats + s->start)
+		return NULL;
+	while (ts->ts_type == 0) {
+		ts++;
+		if (ts == s->stats + s->max)
+			ts = s->stats;
+		if (ts == s->stats + s->start)
+			return NULL;
+	}
+	return ts;
+
+}
+
+static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+	struct transaction_stats_s *ts;
+	int l = *pos;
+
+	if (l == 0)
+		return SEQ_START_TOKEN;
+	ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
+	if (!ts)
+		return NULL;
+	l--;
+	while (l) {
+		ts = jbd2_history_skip_empty(s, ++ts, 0);
+		if (!ts)
+			break;
+		l--;
+	}
+	return ts;
+}
+
+static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+	struct transaction_stats_s *ts = v;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return jbd2_history_skip_empty(s, s->stats + s->start, 1);
+	else
+		return jbd2_history_skip_empty(s, ++ts, 0);
+}
+
+static int jbd2_seq_history_show(struct seq_file *seq, void *v)
+{
+	struct transaction_stats_s *ts = v;
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
+				"%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
+				"wait", "run", "lock", "flush", "log", "hndls",
+				"block", "inlog", "ctime", "write", "drop",
+				"close");
+		return 0;
+	}
+	if (ts->ts_type == JBD2_STATS_RUN)
+		seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
+				"%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
+				jiffies_to_msecs(ts->u.run.rs_wait),
+				jiffies_to_msecs(ts->u.run.rs_running),
+				jiffies_to_msecs(ts->u.run.rs_locked),
+				jiffies_to_msecs(ts->u.run.rs_flushing),
+				jiffies_to_msecs(ts->u.run.rs_logging),
+				ts->u.run.rs_handle_count,
+				ts->u.run.rs_blocks,
+				ts->u.run.rs_blocks_logged);
+	else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
+		seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
+				"C", ts->ts_tid, " ",
+				jiffies_to_msecs(ts->u.chp.cs_chp_time),
+				ts->u.chp.cs_written, ts->u.chp.cs_dropped,
+				ts->u.chp.cs_forced_to_close);
+	else
+		J_ASSERT(0);
+	return 0;
+}
+
+static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations jbd2_seq_history_ops = {
+	.start  = jbd2_seq_history_start,
+	.next   = jbd2_seq_history_next,
+	.stop   = jbd2_seq_history_stop,
+	.show   = jbd2_seq_history_show,
+};
+
+static int jbd2_seq_history_open(struct inode *inode, struct file *file)
+{
+	journal_t *journal = PDE(inode)->data;
+	struct jbd2_stats_proc_session *s;
+	int rc, size;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -ENOMEM;
+	size = sizeof(struct transaction_stats_s) * journal->j_history_max;
+	s->stats = kmalloc(size, GFP_KERNEL);
+	if (s->stats == NULL) {
+		kfree(s);
+		return -ENOMEM;
+	}
+	spin_lock(&journal->j_history_lock);
+	memcpy(s->stats, journal->j_history, size);
+	s->max = journal->j_history_max;
+	s->start = journal->j_history_cur % s->max;
+	spin_unlock(&journal->j_history_lock);
+
+	rc = seq_open(file, &jbd2_seq_history_ops);
+	if (rc == 0) {
+		struct seq_file *m = file->private_data;
+		m->private = s;
+	} else {
+		kfree(s->stats);
+		kfree(s);
+	}
+	return rc;
+
+}
+
+static int jbd2_seq_history_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct jbd2_stats_proc_session *s = seq->private;
+
+	kfree(s->stats);
+	kfree(s);
+	return seq_release(inode, file);
+}
+
+static struct file_operations jbd2_seq_history_fops = {
+	.owner		= THIS_MODULE,
+	.open           = jbd2_seq_history_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = jbd2_seq_history_release,
+};
+
+static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? NULL : SEQ_START_TOKEN;
+}
+
+static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static int jbd2_seq_info_show(struct seq_file *seq, void *v)
+{
+	struct jbd2_stats_proc_session *s = seq->private;
+
+	if (v != SEQ_START_TOKEN)
+		return 0;
+	seq_printf(seq, "%lu transaction, each upto %u blocks\n",
+			s->stats->ts_tid,
+			s->journal->j_max_transaction_buffers);
+	if (s->stats->ts_tid == 0)
+		return 0;
+	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
+	    jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
+	seq_printf(seq, "  %ums running transaction\n",
+	    jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
+	seq_printf(seq, "  %ums transaction was being locked\n",
+	    jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
+	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
+	    jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
+	seq_printf(seq, "  %ums logging transaction\n",
+	    jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+	seq_printf(seq, "  %lu handles per transaction\n",
+	    s->stats->u.run.rs_handle_count / s->stats->ts_tid);
+	seq_printf(seq, "  %lu blocks per transaction\n",
+	    s->stats->u.run.rs_blocks / s->stats->ts_tid);
+	seq_printf(seq, "  %lu logged blocks per transaction\n",
+	    s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
+	return 0;
+}
+
+static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations jbd2_seq_info_ops = {
+	.start  = jbd2_seq_info_start,
+	.next   = jbd2_seq_info_next,
+	.stop   = jbd2_seq_info_stop,
+	.show   = jbd2_seq_info_show,
+};
+
+static int jbd2_seq_info_open(struct inode *inode, struct file *file)
+{
+	journal_t *journal = PDE(inode)->data;
+	struct jbd2_stats_proc_session *s;
+	int rc, size;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -ENOMEM;
+	size = sizeof(struct transaction_stats_s);
+	s->stats = kmalloc(size, GFP_KERNEL);
+	if (s->stats == NULL) {
+		kfree(s);
+		return -ENOMEM;
+	}
+	spin_lock(&journal->j_history_lock);
+	memcpy(s->stats, &journal->j_stats, size);
+	s->journal = journal;
+	spin_unlock(&journal->j_history_lock);
+
+	rc = seq_open(file, &jbd2_seq_info_ops);
+	if (rc == 0) {
+		struct seq_file *m = file->private_data;
+		m->private = s;
+	} else {
+		kfree(s->stats);
+		kfree(s);
+	}
+	return rc;
+
+}
+
+static int jbd2_seq_info_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct jbd2_stats_proc_session *s = seq->private;
+	kfree(s->stats);
+	kfree(s);
+	return seq_release(inode, file);
+}
+
+static struct file_operations jbd2_seq_info_fops = {
+	.owner		= THIS_MODULE,
+	.open           = jbd2_seq_info_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = jbd2_seq_info_release,
+};
+
+static struct proc_dir_entry *proc_jbd2_stats;
+
+static void jbd2_stats_proc_init(journal_t *journal)
+{
+	char name[BDEVNAME_SIZE];
+
+	snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
+	journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+	if (journal->j_proc_entry) {
+		struct proc_dir_entry *p;
+		p = create_proc_entry("history", S_IRUGO,
+				journal->j_proc_entry);
+		if (p) {
+			p->proc_fops = &jbd2_seq_history_fops;
+			p->data = journal;
+			p = create_proc_entry("info", S_IRUGO,
+						journal->j_proc_entry);
+			if (p) {
+				p->proc_fops = &jbd2_seq_info_fops;
+				p->data = journal;
+			}
+		}
+	}
+}
+
+static void jbd2_stats_proc_exit(journal_t *journal)
+{
+	char name[BDEVNAME_SIZE];
+
+	snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
+	remove_proc_entry("info", journal->j_proc_entry);
+	remove_proc_entry("history", journal->j_proc_entry);
+	remove_proc_entry(name, proc_jbd2_stats);
+}
+
+static void journal_init_stats(journal_t *journal)
+{
+	int size;
+
+	if (!proc_jbd2_stats)
+		return;
+
+	journal->j_history_max = 100;
+	size = sizeof(struct transaction_stats_s) * journal->j_history_max;
+	journal->j_history = kzalloc(size, GFP_KERNEL);
+	if (!journal->j_history) {
+		journal->j_history_max = 0;
+		return;
+	}
+	spin_lock_init(&journal->j_history_lock);
+}
+
 /*
  * Management for journal control blocks: functions to create and
  * destroy journal_t structures, and to initialise and read existing
@@ -681,6 +988,9 @@ static journal_t * journal_init_common (void)
 		kfree(journal);
 		goto fail;
 	}
+
+	journal_init_stats(journal);
+
 	return journal;
 fail:
 	return NULL;
@@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
+	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
 	J_ASSERT(bh != NULL);
@@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 
 	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
 	journal->j_blocksize = inode->i_sb->s_blocksize;
+	jbd2_stats_proc_init(journal);
 
 	/* journal descriptor can store up to n blocks -bzzz */
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
@@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal)
 		brelse(journal->j_sb_buffer);
 	}
 
+	if (journal->j_proc_entry)
+		jbd2_stats_proc_exit(journal);
 	if (journal->j_inode)
 		iput(journal->j_inode);
 	if (journal->j_revoke)
@@ -1900,6 +2214,28 @@ static void __exit jbd2_remove_debugfs_entry(void)
 
 #endif
 
+#ifdef CONFIG_PROC_FS
+
+#define JBD2_STATS_PROC_NAME "fs/jbd2"
+
+static void __init jbd2_create_jbd_stats_proc_entry(void)
+{
+	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
+}
+
+static void __exit jbd2_remove_jbd_stats_proc_entry(void)
+{
+	if (proc_jbd2_stats)
+		remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
+}
+
+#else
+
+#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
+#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)
+
+#endif
+
 struct kmem_cache *jbd2_handle_cache;
 
 static int __init journal_init_handle_cache(void)
@@ -1955,6 +2291,7 @@ static int __init journal_init(void)
 	if (ret != 0)
 		jbd2_journal_destroy_caches();
 	jbd2_create_debugfs_entry();
+	jbd2_create_jbd_stats_proc_entry();
 	return ret;
 }
 
@@ -1966,6 +2303,7 @@ static void __exit journal_exit(void)
 		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
 #endif
 	jbd2_remove_debugfs_entry();
+	jbd2_remove_jbd_stats_proc_entry();
 	jbd2_journal_destroy_caches();
 }
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b1fcf2b3dca3..f30802aeefae 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -59,6 +59,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 
 	J_ASSERT(journal->j_running_transaction == NULL);
 	journal->j_running_transaction = transaction;
+	transaction->t_max_wait = 0;
+	transaction->t_start = jiffies;
 
 	return transaction;
 }
@@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 	int nblocks = handle->h_buffer_credits;
 	transaction_t *new_transaction = NULL;
 	int ret = 0;
+	unsigned long ts = jiffies;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -217,6 +220,12 @@ repeat_locked:
 	/* OK, account for the buffers that this operation expects to
 	 * use and add the handle to the running transaction. */
 
+	if (time_after(transaction->t_start, ts)) {
+		ts = jbd2_time_diff(ts, transaction->t_start);
+		if (ts > transaction->t_max_wait)
+			transaction->t_max_wait = ts;
+	}
+
 	handle->h_transaction = transaction;
 	transaction->t_outstanding_credits += nblocks;
 	transaction->t_updates++;
-- 
cgit v1.2.3


From 818d276ceb83aa9fdebb5e0a53188290312de987 Mon Sep 17 00:00:00 2001
From: Girish Shilamkar <girish@clusterfs.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Add the journal checksum feature

The journal checksum feature adds two new flags, i.e.
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and JBD2_FEATURE_COMPAT_CHECKSUM.

The JBD2_FEATURE_COMPAT_CHECKSUM flag indicates that the commit block
contains the checksum for the blocks described by the descriptor blocks.
Due to checksums, writing of the commit record no longer needs to be
synchronous. Now the commit record can be sent to disk without waiting for
the descriptor blocks to be written to disk. This behavior is controlled
using the JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT flag. Older kernels/e2fsck
should not be able to recover a journal with _ASYNC_COMMIT, hence it is
made incompat.
The commit header has been extended to hold the checksum along with the
type of the checksum.

During the scan pass of recovery, checksums are verified to ensure the
sanity and completeness (in case of _ASYNC_COMMIT) of every transaction.

Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Girish Shilamkar <girish@clusterfs.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/Kconfig         |   1 +
 fs/ext4/super.c    |  25 +++++++
 fs/jbd2/commit.c   | 198 ++++++++++++++++++++++++++++++++++++++++-------------
 fs/jbd2/journal.c  |  26 +++++++
 fs/jbd2/recovery.c | 151 +++++++++++++++++++++++++++++++++++++---
 5 files changed, 344 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9656139d2e99..219ec06a8c7e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -236,6 +236,7 @@ config JBD_DEBUG
 
 config JBD2
 	tristate
+	select CRC32
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers.  It is currently used by
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c7305443e100..f7479d30735e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -869,6 +869,7 @@ enum {
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
@@ -908,6 +909,8 @@ static match_table_t tokens = {
 	{Opt_journal_update, "journal=update"},
 	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
+	{Opt_journal_checksum, "journal_checksum"},
+	{Opt_journal_async_commit, "journal_async_commit"},
 	{Opt_abort, "abort"},
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
@@ -1095,6 +1098,13 @@ static int parse_options (char *options, struct super_block *sb,
 				return 0;
 			*journal_devnum = option;
 			break;
+		case Opt_journal_checksum:
+			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+			break;
+		case Opt_journal_async_commit:
+			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
+			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+			break;
 		case Opt_noload:
 			set_opt (sbi->s_mount_opt, NOLOAD);
 			break;
@@ -2114,6 +2124,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount4;
 	}
 
+	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+		jbd2_journal_set_features(sbi->s_journal,
+				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+		jbd2_journal_set_features(sbi->s_journal,
+				JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+	} else {
+		jbd2_journal_clear_features(sbi->s_journal,
+				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+	}
+
 	/* We have now updated the journal if required, so we can
 	 * validate the data journaling mode. */
 	switch (test_opt(sb, DATA_FLAGS)) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8749a86f4175..da8d0eb3b7b9 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
+#include <linux/crc32.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 	return 1;
 }
 
-/* Done it all: now write the commit record.  We should have
+/*
+ * Done it all: now submit the commit record.  We should have
  * cleaned up our previous buffers by now, so if we are in abort
  * mode we can now just skip the rest of the journal write
  * entirely.
  *
  * Returns 1 if the journal needs to be aborted or 0 on success
  */
-static int journal_write_commit_record(journal_t *journal,
-					transaction_t *commit_transaction)
+static int journal_submit_commit_record(journal_t *journal,
+					transaction_t *commit_transaction,
+					struct buffer_head **cbh,
+					__u32 crc32_sum)
 {
 	struct journal_head *descriptor;
+	struct commit_header *tmp;
 	struct buffer_head *bh;
-	int i, ret;
+	int ret;
 	int barrier_done = 0;
 
 	if (is_journal_aborted(journal))
@@ -117,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal,
 
 	bh = jh2bh(descriptor);
 
-	/* AKPM: buglet - add `i' to tmp! */
-	for (i = 0; i < bh->b_size; i += 512) {
-		journal_header_t *tmp = (journal_header_t*)bh->b_data;
-		tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
-		tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
-		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+	tmp = (struct commit_header *)bh->b_data;
+	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
+	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+
+	if (JBD2_HAS_COMPAT_FEATURE(journal,
+				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
+		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
+		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
 	}
 
-	JBUFFER_TRACE(descriptor, "write commit block");
+	JBUFFER_TRACE(descriptor, "submit commit block");
+	lock_buffer(bh);
+
 	set_buffer_dirty(bh);
-	if (journal->j_flags & JBD2_BARRIER) {
+	set_buffer_uptodate(bh);
+	bh->b_end_io = journal_end_buffer_io_sync;
+
+	if (journal->j_flags & JBD2_BARRIER &&
+		!JBD2_HAS_COMPAT_FEATURE(journal,
+					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = sync_dirty_buffer(bh);
+	ret = submit_bh(WRITE, bh);
+
 	/* is it possible for another commit to fail at roughly
 	 * the same time as this one?  If so, we don't want to
 	 * trust the barrier flag in the super, but instead want
@@ -152,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal,
 		clear_buffer_ordered(bh);
 		set_buffer_uptodate(bh);
 		set_buffer_dirty(bh);
-		ret = sync_dirty_buffer(bh);
+		ret = submit_bh(WRITE, bh);
 	}
-	put_bh(bh);		/* One for getblk() */
-	jbd2_journal_put_journal_head(descriptor);
+	*cbh = bh;
+	return ret;
+}
+
+/*
+ * This function along with journal_submit_commit_record
+ * allows to write the commit record asynchronously.
+ */
+static int journal_wait_on_commit_record(struct buffer_head *bh)
+{
+	int ret = 0;
+
+	clear_buffer_dirty(bh);
+	wait_on_buffer(bh);
 
-	return (ret == -EIO);
+	if (unlikely(!buffer_uptodate(bh)))
+		ret = -EIO;
+	put_bh(bh);            /* One for getblk() */
+	jbd2_journal_put_journal_head(bh2jh(bh));
+
+	return ret;
 }
 
+/*
+ * Wait for all submitted IO to complete.
+ */
+static int journal_wait_on_locked_list(journal_t *journal,
+				       transaction_t *commit_transaction)
+{
+	int ret = 0;
+	struct journal_head *jh;
+
+	while (commit_transaction->t_locked_list) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_locked_list->b_tprev;
+		bh = jh2bh(jh);
+		get_bh(bh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			wait_on_buffer(bh);
+			if (unlikely(!buffer_uptodate(bh)))
+				ret = -EIO;
+			spin_lock(&journal->j_list_lock);
+		}
+		if (!inverted_lock(journal, bh)) {
+			put_bh(bh);
+			spin_lock(&journal->j_list_lock);
+			continue;
+		}
+		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+			__jbd2_journal_unfile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+			jbd2_journal_remove_journal_head(bh);
+			put_bh(bh);
+		} else {
+			jbd_unlock_bh_state(bh);
+		}
+		put_bh(bh);
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	return ret;
+  }
+
 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 {
 	int i;
@@ -275,7 +350,21 @@ write_out_data:
 	journal_do_submit_data(wbuf, bufs);
 }
 
-static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	char *addr;
+	__u32 checksum;
+
+	addr = kmap_atomic(page, KM_USER0);
+	checksum = crc32_be(crc32_sum,
+		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
+	kunmap_atomic(addr, KM_USER0);
+
+	return checksum;
+}
+
+static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 				   unsigned long long block)
 {
 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
@@ -307,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_flag;
 	int i;
 	int tag_bytes = journal_tag_bytes(journal);
+	struct buffer_head *cbh = NULL; /* For transactional checksums */
+	__u32 crc32_sum = ~0;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -451,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	journal_submit_data_buffers(journal, commit_transaction);
 
 	/*
-	 * Wait for all previously submitted IO to complete.
+	 * Wait for all previously submitted IO to complete if commit
+	 * record is to be written synchronously.
 	 */
 	spin_lock(&journal->j_list_lock);
-	while (commit_transaction->t_locked_list) {
-		struct buffer_head *bh;
+	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+		err = journal_wait_on_locked_list(journal,
+						commit_transaction);
 
-		jh = commit_transaction->t_locked_list->b_tprev;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				err = -EIO;
-			spin_lock(&journal->j_list_lock);
-		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
-			spin_lock(&journal->j_list_lock);
-			continue;
-		}
-		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-			__jbd2_journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			jbd2_journal_remove_journal_head(bh);
-			put_bh(bh);
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-		put_bh(bh);
-		cond_resched_lock(&journal->j_list_lock);
-	}
 	spin_unlock(&journal->j_list_lock);
 
 	if (err)
@@ -656,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 start_journal_io:
 			for (i = 0; i < bufs; i++) {
 				struct buffer_head *bh = wbuf[i];
+				/*
+				 * Compute checksum.
+				 */
+				if (JBD2_HAS_COMPAT_FEATURE(journal,
+					JBD2_FEATURE_COMPAT_CHECKSUM)) {
+					crc32_sum =
+					    jbd2_checksum_data(crc32_sum, bh);
+				}
+
 				lock_buffer(bh);
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
@@ -672,6 +749,23 @@ start_journal_io:
 		}
 	}
 
+	/* Done it all: now write the commit record asynchronously. */
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+		err = journal_submit_commit_record(journal, commit_transaction,
+						 &cbh, crc32_sum);
+		if (err)
+			__jbd2_journal_abort_hard(journal);
+
+		spin_lock(&journal->j_list_lock);
+		err = journal_wait_on_locked_list(journal,
+						commit_transaction);
+		spin_unlock(&journal->j_list_lock);
+		if (err)
+			__jbd2_journal_abort_hard(journal);
+	}
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
@@ -771,8 +865,14 @@ wait_for_iobuf:
 
 	jbd_debug(3, "JBD: commit phase 6\n");
 
-	if (journal_write_commit_record(journal, commit_transaction))
-		err = -EIO;
+	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+		err = journal_submit_commit_record(journal, commit_transaction,
+						&cbh, crc32_sum);
+		if (err)
+			__jbd2_journal_abort_hard(journal);
+	}
+	err = journal_wait_on_commit_record(cbh);
 
 	if (err)
 		jbd2_journal_abort(journal, err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 3667c91bc786..59ba2494dcaf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1578,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 	return 1;
 }
 
+/**
+ * jbd2_journal_clear_features() - Clear the given journal features in the
+ *				   superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Every bit set in one of the masks is cleared from the matching feature
+ * word of @journal's superblock.  The masks are given in CPU byte order and
+ * converted with cpu_to_be32() before being applied.  Nothing in this
+ * function writes the superblock out.
+ */
+void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
+				unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *jsb = journal->j_superblock;
+
+	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+		  compat, ro, incompat);
+
+	jsb->s_feature_compat    &= ~cpu_to_be32(compat);
+	jsb->s_feature_ro_compat &= ~cpu_to_be32(ro);
+	jsb->s_feature_incompat  &= ~cpu_to_be32(incompat);
+}
+EXPORT_SYMBOL(jbd2_journal_clear_features);
 
 /**
  * int jbd2_journal_update_format () - Update on-disk journal structure.
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d0ce627539ef..921680663fa2 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
 #include <linux/jbd2.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/crc32.h>
 #endif
 
 /*
@@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag
 	return block;
 }
 
+/*
+ * calc_chksums folds the descriptor block @bh and the log blocks it describes
+ * into *crc32_sum, advancing *next_log_block; returns 1 on read error, else 0.
+ */
+static int calc_chksums(journal_t *journal, struct buffer_head *bh,
+			unsigned long *next_log_block, __u32 *crc32_sum)
+{
+	int i, num_blks, err;
+	unsigned long io_block;
+	struct buffer_head *obh;
+
+	num_blks = count_tags(journal, bh);
+	/* Calculate checksum of the descriptor block. */
+	*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
+
+	for (i = 0; i < num_blks; i++) {
+		io_block = (*next_log_block)++;
+		wrap(journal, *next_log_block);
+		err = jread(&obh, journal, io_block);
+		if (err) {
+			printk(KERN_ERR "JBD: IO error %d recovering block "
+				"%lu in log\n", err, io_block);
+			return 1;
+		}
+		*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
+				      obh->b_size);
+		put_bh(obh);	/* drop the reference jread() handed us */
+	}
+	return 0;
+}
+
 static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
@@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal,
 	unsigned int		sequence;
 	int			blocktype;
 	int			tag_bytes = journal_tag_bytes(journal);
+	__u32			crc32_sum = ~0; /* Transactional Checksums */
 
 	/* Precompute the maximum metadata descriptors in a descriptor block */
 	int			MAX_BLOCKS_PER_DESC;
@@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal,
 		switch(blocktype) {
 		case JBD2_DESCRIPTOR_BLOCK:
 			/* If it is a valid descriptor block, replay it
-			 * in pass REPLAY; otherwise, just skip over the
-			 * blocks it describes. */
+			 * in pass REPLAY; if journal_checksums enabled, then
+			 * calculate checksums in PASS_SCAN, otherwise,
+			 * just skip over the blocks it describes. */
 			if (pass != PASS_REPLAY) {
+				if (pass == PASS_SCAN &&
+				    JBD2_HAS_COMPAT_FEATURE(journal,
+					    JBD2_FEATURE_COMPAT_CHECKSUM) &&
+				    !info->end_transaction) {
+					if (calc_chksums(journal, bh,
+							&next_log_block,
+							&crc32_sum)) {
+						put_bh(bh);
+						break;
+					}
+					put_bh(bh);
+					continue;
+				}
 				next_log_block += count_tags(journal, bh);
 				wrap(journal, next_log_block);
-				brelse(bh);
+				put_bh(bh);
 				continue;
 			}
 
@@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal,
 			continue;
 
 		case JBD2_COMMIT_BLOCK:
-			/* Found an expected commit block: not much to
-			 * do other than move on to the next sequence
+			/*     How to differentiate between interrupted commit
+			 *               and journal corruption ?
+			 *
+			 * {nth transaction}
+			 *        Checksum Verification Failed
+			 *			 |
+			 *		 ____________________
+			 *		|		     |
+			 * 	async_commit             sync_commit
+			 *     		|                    |
+			 *		| GO TO NEXT    "Journal Corruption"
+			 *		| TRANSACTION
+			 *		|
+			 * {(n+1)th transaction}
+			 *		|
+			 * 	 _______|______________
+			 * 	|	 	      |
+			 * Commit block found	Commit block not found
+			 *      |		      |
+			 * "Journal Corruption"       |
+			 *		 _____________|_________
+			 *     		|	           	|
+			 *	nth trans corrupt	OR   nth trans
+			 *	and (n+1)th interrupted     interrupted
+			 *	before commit block
+			 *      could reach the disk.
+			 *	(Cannot find the difference in above
+			 *	 mentioned conditions. Hence assume
+			 *	 "Interrupted Commit".)
+			 */
+
+			/* Found an expected commit block: if checksums
+			 * are present verify them in PASS_SCAN; else not
+			 * much to do other than move on to the next sequence
 			 * number. */
+			if (pass == PASS_SCAN &&
+			    JBD2_HAS_COMPAT_FEATURE(journal,
+				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
+				int chksum_err, chksum_seen;
+				struct commit_header *cbh =
+					(struct commit_header *)bh->b_data;
+				unsigned found_chksum =
+					be32_to_cpu(cbh->h_chksum[0]);
+
+				chksum_err = chksum_seen = 0;
+
+				if (info->end_transaction) {
+					printk(KERN_ERR "JBD: Transaction %u "
+						"found to be corrupt.\n",
+						next_commit_ID - 1);
+					brelse(bh);
+					break;
+				}
+
+				if (crc32_sum == found_chksum &&
+				    cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
+				    cbh->h_chksum_size ==
+						JBD2_CRC32_CHKSUM_SIZE)
+				       chksum_seen = 1;
+				else if (!(cbh->h_chksum_type == 0 &&
+					     cbh->h_chksum_size == 0 &&
+					     found_chksum == 0 &&
+					     !chksum_seen))
+				/*
+				 * If fs is mounted using an old kernel and then
+				 * kernel with journal_chksum is used then we
+				 * get a situation where the journal flag has
+				 * checksum flag set but checksums are not
+				 * present i.e chksum = 0, in the individual
+				 * commit blocks.
+				 * Hence to avoid checksum failures, in this
+				 * situation, this extra check is added.
+				 */
+						chksum_err = 1;
+
+				if (chksum_err) {
+					info->end_transaction = next_commit_ID;
+					/* not async: checksum error is fatal */
+					if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+					   JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+						printk(KERN_ERR
+						       "JBD: Transaction %u "
+						       "found to be corrupt.\n",
+						       next_commit_ID);
+						brelse(bh);
+						break;
+					}
+				}
+				crc32_sum = ~0;
+			}
 			brelse(bh);
 			next_commit_ID++;
 			continue;
@@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal,
 	 * transaction marks the end of the valid log.
 	 */
 
-	if (pass == PASS_SCAN)
-		info->end_transaction = next_commit_ID;
-	else {
+	if (pass == PASS_SCAN) {
+		if (!info->end_transaction)
+			info->end_transaction = next_commit_ID;
+	} else {
 		/* It's really bad news if different passes end up at
 		 * different places (but possible due to IO errors). */
 		if (info->end_transaction != next_commit_ID) {
-- 
cgit v1.2.3


From 7a224228ed79d587ece2304869000aad1b8e97dd Mon Sep 17 00:00:00 2001
From: Jean Noel Cordenner <jean-noel.cordenner@bull.net>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: vfs: Add 64 bit i_version support

The i_version field of the inode is changed to be a 64-bit counter that
is set on every inode creation and that is incremented every time the
inode data is modified (similarly to the "ctime" time-stamp).
The aim is to fulfill a NFSv4 requirement for rfc3530.
This first part concerns the vfs, it converts the 32-bit i_version in
the generic inode to a 64-bit, a flag is added in the super block in
order to check if the feature is enabled and the i_version is
incremented in the vfs.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Jean Noel Cordenner <jean-noel.cordenner@bull.net>
Signed-off-by: Kalpak Shah <kalpak@clusterfs.com>
---
 fs/afs/dir.c   |  9 +++++----
 fs/afs/inode.c |  3 ++-
 fs/inode.c     | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 33fe39ad4e03..0cc3597c1197 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -546,11 +546,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	dentry->d_op = &afs_fs_dentry_operations;
 
 	d_add(dentry, inode);
-	_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
+	_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
 	       fid.vnode,
 	       fid.unique,
 	       dentry->d_inode->i_ino,
-	       dentry->d_inode->i_version);
+	       (unsigned long long)dentry->d_inode->i_version);
 
 	return NULL;
 }
@@ -630,9 +630,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * been deleted and replaced, and the original vnode ID has
 		 * been reused */
 		if (fid.unique != vnode->fid.unique) {
-			_debug("%s: file deleted (uq %u -> %u I:%lu)",
+			_debug("%s: file deleted (uq %u -> %u I:%llu)",
 			       dentry->d_name.name, fid.unique,
-			       vnode->fid.unique, dentry->d_inode->i_version);
+			       vnode->fid.unique,
+			       (unsigned long long)dentry->d_inode->i_version);
 			spin_lock(&vnode->lock);
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
 			spin_unlock(&vnode->lock);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d196840127c6..84750c8e9f95 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -301,7 +301,8 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
 	inode = dentry->d_inode;
 
-	_enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
+	_enter("{ ino=%lu v=%llu }", inode->i_ino,
+		(unsigned long long)inode->i_version);
 
 	generic_fillattr(inode, stat);
 	return 0;
diff --git a/fs/inode.c b/fs/inode.c
index ed35383d0b6c..b48324a94c2b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1242,6 +1242,23 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 }
 EXPORT_SYMBOL(touch_atime);
 
+/**
+ *     inode_inc_iversion      -       increments i_version
+ *     @inode: inode that need to be updated
+ *
+ *     Every time the inode is modified, the i_version field
+ *     will be incremented.
+ *     The filesystem has to be mounted with i_version flag
+ *
+ */
+
+void inode_inc_iversion(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode->i_version++;
+	spin_unlock(&inode->i_lock);
+}
+
 /**
  *	file_update_time	-	update mtime and ctime time
  *	@file: file accessed
@@ -1276,6 +1293,11 @@ void file_update_time(struct file *file)
 		sync_it = 1;
 	}
 
+	if (IS_I_VERSION(inode)) {
+		inode_inc_iversion(inode);
+		sync_it = 1;
+	}
+
 	if (sync_it)
 		mark_inode_dirty_sync(inode);
 }
-- 
cgit v1.2.3


From 25ec56b518257a56d2ff41a941d288e4b5ff9488 Mon Sep 17 00:00:00 2001
From: Jean Noel Cordenner <jean-noel.cordenner@bull.net>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Add inode version support in ext4

This patch adds 64-bit inode version support to ext4. The lower 32 bits
are stored in the osd1.linux1.l_i_version field while the high 32 bits
are stored in the i_version_hi field newly created in the ext4_inode.
This field is incremented in case the ext4_inode is large enough. An
i_version mount option has been added to enable the feature.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Kalpak Shah <kalpak@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Jean Noel Cordenner <jean-noel.cordenner@bull.net>
---
 fs/ext4/inode.c | 18 +++++++++++++++++-
 fs/ext4/super.c | 10 ++++++++--
 fs/inode.c      | 17 -----------------
 3 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89cd35386ff5..a06a3b7cfc34 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2781,6 +2781,13 @@ void ext4_read_inode(struct inode * inode)
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 
+	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+			inode->i_version |=
+			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+	}
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
@@ -2963,8 +2970,14 @@ static int ext4_do_update_inode(handle_t *handle,
 	} else for (block = 0; block < EXT4_N_BLOCKS; block++)
 		raw_inode->i_block[block] = ei->i_data[block];
 
-	if (ei->i_extra_isize)
+	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+	if (ei->i_extra_isize) {
+		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+			raw_inode->i_version_hi =
+			cpu_to_le32(inode->i_version >> 32);
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+	}
+
 
 	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
 	rc = ext4_journal_dirty_metadata(handle, bh);
@@ -3191,6 +3204,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 {
 	int err = 0;
 
+	if (test_opt(inode->i_sb, I_VERSION))
+		inode_inc_iversion(inode);
+
 	/* the do_update_inode consumes one bh->b_count */
 	get_bh(iloc->bh);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7479d30735e..aa22acd6eb06 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -732,6 +732,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
+	if (test_opt(sb, I_VERSION))
+		seq_puts(seq, ",i_version");
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
 		seq_puts(seq, ",data=journal");
@@ -874,7 +876,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_extents, Opt_noextents,
+	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 };
 
 static match_table_t tokens = {
@@ -928,6 +930,7 @@ static match_table_t tokens = {
 	{Opt_barrier, "barrier=%u"},
 	{Opt_extents, "extents"},
 	{Opt_noextents, "noextents"},
+	{Opt_i_version, "i_version"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -1273,6 +1276,10 @@ clear_qf_name:
 		case Opt_noextents:
 			clear_opt (sbi->s_mount_opt, EXTENTS);
 			break;
+		case Opt_i_version:
+			set_opt(sbi->s_mount_opt, I_VERSION);
+			sb->s_flags |= MS_I_VERSION;
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -3197,7 +3204,6 @@ out:
 		i_size_write(inode, off+len-towrite);
 		EXT4_I(inode)->i_disksize = inode->i_size;
 	}
-	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext4_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/inode.c b/fs/inode.c
index b48324a94c2b..276ffd6b6fdd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1242,23 +1242,6 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 }
 EXPORT_SYMBOL(touch_atime);
 
-/**
- *     inode_inc_iversion      -       increments i_version
- *     @inode: inode that need to be updated
- *
- *     Every time the inode is modified, the i_version field
- *     will be incremented.
- *     The filesystem has to be mounted with i_version flag
- *
- */
-
-void inode_inc_iversion(struct inode *inode)
-{
-	spin_lock(&inode->i_lock);
-	inode->i_version++;
-	spin_unlock(&inode->i_lock);
-}
-
 /**
  *	file_update_time	-	update mtime and ctime time
  *	@file: file accessed
-- 
cgit v1.2.3


From c14c6fd5c56a0d0495d8a7c0f2bc330be658663e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext4: Add EXT4_IOC_MIGRATE ioctl

The patch below adds an ioctl for migrating an ext3 indirect-block-mapped
inode to an ext4 extent-mapped inode.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/Makefile  |   2 +-
 fs/ext4/extents.c |   4 +-
 fs/ext4/ioctl.c   |   3 +
 fs/ext4/migrate.c | 560 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 566 insertions(+), 3 deletions(-)
 create mode 100644 fs/ext4/migrate.c

(limited to 'fs')

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ae6e7e502ac9..d5fd80bc0d04 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
 
 ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o
+		   ext4_jbd2.o migrate.o
 
 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 03d1bbb78a2f..01eda5c5281e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -61,7 +61,7 @@ static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
  * idx_pblock:
  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
  */
-static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
+ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
 {
 	ext4_fsblk_t block;
 
@@ -75,7 +75,7 @@ static ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
  * stores a large physical block number into an extent struct,
  * breaking it into parts
  */
-static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
 {
 	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
 	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c0e5b8cf635c..2ed7c37f897e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -254,6 +254,9 @@ flags_err:
 		return err;
 	}
 
+	case EXT4_IOC_MIGRATE:
+		return ext4_ext_migrate(inode, filp, cmd, arg);
+
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
new file mode 100644
index 000000000000..ec7cb567a7da
--- /dev/null
+++ b/fs/ext4/migrate.c
@@ -0,0 +1,560 @@
+/*
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs_extents.h>
+
+/*
+ * A run of blocks, contiguous both logically and physically, that a
+ * single extent can represent.  first_pblock == 0 marks an empty run.
+ */
+struct list_blocks_struct {
+	ext4_lblk_t first_block, last_block;	/* logical (file) block range */
+	ext4_fsblk_t first_pblock, last_pblock;	/* physical block range */
+};
+
+/*
+ * Insert the pending block range in @lb into @inode as one extent, obtaining
+ * journal credits for the insert first.  An empty range (first_pblock == 0)
+ * is a no-op.  @lb is marked empty on exit; returns 0 or an error code.
+ */
+static int finish_range(handle_t *handle, struct inode *inode,
+				struct list_blocks_struct *lb)
+{
+	int retval = 0, needed;
+	struct ext4_extent newext;
+	struct ext4_ext_path *path;
+	if (lb->first_pblock == 0)
+		return 0;
+
+	/* Add the extent to temp inode*/
+	newext.ee_block = cpu_to_le32(lb->first_block);
+	newext.ee_len   = cpu_to_le16(lb->last_block - lb->first_block + 1);
+	ext4_ext_store_pblock(&newext, lb->first_pblock);
+	path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+
+	if (IS_ERR(path)) {
+		retval = PTR_ERR(path);
+		goto err_out;
+	}
+
+	/*
+	 * Calculate the credits needed for inserting this extent.
+	 * Since we are doing this in a loop we may accumulate extra
+	 * credits, but below we try not to accumulate too many
+	 * of them by restarting the journal.
+	 */
+	needed = ext4_ext_calc_credits_for_insert(inode, path);
+
+	/* Make sure the credits we accumulated are not really high */
+	if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+		retval = ext4_journal_restart(handle, needed);
+		if (retval)
+			goto err_out;
+	}
+	if (needed) {
+		retval = ext4_journal_extend(handle, needed);
+		if (retval != 0) {
+			/* If unable to extend the journal, restart it */
+			retval = ext4_journal_restart(handle, needed);
+			if (retval)
+				goto err_out;
+		}
+	}
+	retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+err_out:	/* NOTE(review): path is not released on any exit - leak? confirm */
+	lb->first_pblock = 0;
+	return retval;
+}
+
+/*
+ * Account one mapped block: grow the pending range in @lb when @pblock and
+ * @blk_num directly follow it, otherwise flush it and start a new one.
+ * Returns 0 or the error from finish_range().
+ */
+static int update_extent_range(handle_t *handle, struct inode *inode,
+				ext4_fsblk_t pblock, ext4_lblk_t blk_num,
+				struct list_blocks_struct *lb)
+{
+	int err;
+
+	if (!lb->first_pblock || lb->last_pblock + 1 != pblock ||
+	    lb->last_block + 1 != blk_num) {
+		/* Not contiguous (or no range yet): emit what we have. */
+		err = finish_range(handle, inode, lb);
+		lb->first_pblock = lb->last_pblock = pblock;
+		lb->first_block = lb->last_block = blk_num;
+		return err;
+	}
+	/* Extends the current range by one block. */
+	lb->last_pblock = pblock;
+	lb->last_block = blk_num;
+	return 0;
+}
+
+/*
+ * Walk the single-indirect block @pblock and hand every mapped entry to
+ * update_extent_range().  *@blk_nump holds the file block of the first
+ * entry and is advanced past the entries walked; a zero @pblock is a hole
+ * that only advances it.  Returns 0, -EIO or update_extent_range()'s error.
+ */
+static int update_ind_extent_range(handle_t *handle, struct inode *inode,
+				   ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
+				   struct list_blocks_struct *lb)
+{
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	ext4_lblk_t lblk = *blk_nump;
+	struct buffer_head *bh;
+	__le32 *entry;
+	int idx, err = 0;
+
+	if (!pblock) {
+		*blk_nump += max_entries;
+		return 0;
+	}
+	bh = sb_bread(inode->i_sb, pblock);
+	if (!bh)
+		return -EIO;
+
+	entry = (__le32 *)bh->b_data;
+	for (idx = 0; idx < max_entries; idx++, lblk++, entry++) {
+		if (!*entry)
+			continue;
+		err = update_extent_range(handle, inode, le32_to_cpu(*entry),
+					  lblk, lb);
+		if (err)
+			break;
+	}
+	*blk_nump = lblk;
+	put_bh(bh);
+	return err;
+}
+
+/* Walk the double-indirect block @pblock (0 = hole), descending into each
+ * mapped indirect block; *@blk_nump tracks the file block reached. */
+static int update_dind_extent_range(handle_t *handle, struct inode *inode,
+				    ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
+				    struct list_blocks_struct *lb)
+{
+	struct buffer_head *bh;
+	__le32 *i_data;
+	int i, retval = 0;
+	ext4_lblk_t blk_count = *blk_nump;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	if (!pblock) {
+		/* Hole: only advance the file block number */
+		*blk_nump += max_entries * max_entries;
+		return 0;
+	}
+	bh = sb_bread(inode->i_sb, pblock);
+	if (!bh)
+		return -EIO;
+
+	i_data = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (i_data[i]) {
+			retval = update_ind_extent_range(handle, inode,
+						le32_to_cpu(i_data[i]),
+						&blk_count, lb);
+			if (retval)
+				break;
+		} else {
+			/* Hole: only advance the file block number */
+			blk_count += max_entries;
+		}
+	}
+
+	*blk_nump = blk_count;	/* Update the file block number */
+	put_bh(bh);
+	return retval;
+}
+
+/* Walk the triple-indirect block @pblock (0 = hole), descending into each
+ * mapped double-indirect block; *@blk_nump tracks the file block reached. */
+static int update_tind_extent_range(handle_t *handle, struct inode *inode,
+				     ext4_fsblk_t pblock, ext4_lblk_t *blk_nump,
+				     struct list_blocks_struct *lb)
+{
+	struct buffer_head *bh;
+	__le32 *i_data;
+	int i, retval = 0;
+	ext4_lblk_t blk_count = *blk_nump;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	if (!pblock) {
+		/* Hole: only advance the file block number */
+		*blk_nump += max_entries * max_entries * max_entries;
+		return 0;
+	}
+	bh = sb_bread(inode->i_sb, pblock);
+	if (!bh)
+		return -EIO;
+
+	i_data = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (i_data[i]) {
+			retval = update_dind_extent_range(handle, inode,
+						le32_to_cpu(i_data[i]),
+						&blk_count, lb);
+			if (retval)
+				break;
+		} else
+			/* Hole: only advance the file block number */
+			blk_count += max_entries * max_entries;
+	}
+	*blk_nump = blk_count;	/* Update the file block number */
+	put_bh(bh);
+	return retval;
+}
+
+/* Free every block referenced from the double-indirect block @i_data (one
+ * level only), then that block itself.  Returns 0, or -EIO on read failure. */
+static int free_dind_blocks(handle_t *handle,
+				struct inode *inode, __le32 i_data)
+{
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	__le32 *slot;
+	int n;
+
+	if (!bh)
+		return -EIO;
+
+	slot = (__le32 *)bh->b_data;
+	for (n = 0; n < max_entries; n++, slot++)
+		if (*slot)
+			ext4_free_blocks(handle, inode,
+					 le32_to_cpu(*slot), 1);
+	put_bh(bh);
+	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
+	return 0;
+}
+
+/* Free every double-indirect subtree referenced from the triple-indirect
+ * block @i_data, then that block itself.  Stops at the first error. */
+static int free_tind_blocks(handle_t *handle,
+				struct inode *inode, __le32 i_data)
+{
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh;
+	__le32 *slot;
+	int n, err = 0;
+
+	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	if (!bh)
+		return -EIO;
+
+	slot = (__le32 *)bh->b_data;
+	for (n = 0; n < max_entries && !err; n++, slot++) {
+		if (*slot)
+			err = free_dind_blocks(handle, inode, *slot);
+	}
+	put_bh(bh);
+	if (err)
+		return err;
+
+	/* The subtrees are gone; release the triple-indirect block too. */
+	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
+	return 0;
+}
+
+/* Free the blocks that map @inode's data indirectly: the single-indirect
+ * block and the double- and triple-indirect trees rooted in ei->i_data. */
+static int free_ind_block(handle_t *handle, struct inode *inode)
+{
+	int retval;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (ei->i_data[EXT4_IND_BLOCK])
+		ext4_free_blocks(handle, inode,
+				le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1);
+	if (ei->i_data[EXT4_DIND_BLOCK]) {
+		retval = free_dind_blocks(handle, inode,
+						ei->i_data[EXT4_DIND_BLOCK]);
+		if (retval)
+			return retval;
+	}
+	if (ei->i_data[EXT4_TIND_BLOCK]) {
+		retval = free_tind_blocks(handle, inode,
+						ei->i_data[EXT4_TIND_BLOCK]);
+		if (retval)
+			return retval;
+	}
+	return 0;
+}
+
+/*
+ * Switch @inode over to the extent tree built in @tmp_inode: free @inode's
+ * indirect blocks, copy @tmp_inode's i_data across and set EXT4_EXTENTS_FL.
+ * The value passed in @retval is overwritten.  Returns 0 or an error code.
+ */
+static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
+				struct inode *tmp_inode, int retval)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
+
+	retval = free_ind_block(handle, inode);
+	if (retval)
+		goto err_out;
+
+	/* One credit for writing the i_data field of the original inode */
+	retval = ext4_journal_extend(handle, 1);
+	if (retval != 0) {
+		retval = ext4_journal_restart(handle, 1);
+		if (retval)
+			goto err_out;
+	}
+
+	/* We have the extent map built with the tmp inode.
+	 * Now copy the i_data across */
+	ei->i_flags |= EXT4_EXTENTS_FL;
+	memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
+
+	/*
+	 * Update i_blocks with the new blocks that got
+	 * allocated while adding extents for extent index
+	 * blocks.
+	 *
+	 * While converting to extents we need not
+	 * update the original inode i_blocks for extent blocks
+	 * via quota APIs. The quota update happened via tmp_inode already.
+	 */
+	spin_lock(&inode->i_lock);
+	inode->i_blocks += tmp_inode->i_blocks;
+	spin_unlock(&inode->i_lock);
+
+	ext4_mark_inode_dirty(handle, inode);
+err_out:
+	return retval;
+}
+
+/*
+ * Free the extent tree block that @ix points to, recursing into its children
+ * first when it is an index node; the block is freed even if a child fails.
+ */
+static int free_ext_idx(handle_t *handle, struct inode *inode,
+					struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t blk = idx_pblock(ix);
+	struct buffer_head *bh = sb_bread(inode->i_sb, blk);
+	struct ext4_extent_header *hdr;
+	struct ext4_extent_idx *child;
+	int n, err = 0;
+
+	if (!bh)
+		return -EIO;
+
+	hdr = (struct ext4_extent_header *)bh->b_data;
+	if (hdr->eh_depth != 0) {
+		child = EXT_FIRST_INDEX(hdr);
+		for (n = 0; !err && n < le16_to_cpu(hdr->eh_entries); n++)
+			err = free_ext_idx(handle, inode, child++);
+	}
+	put_bh(bh);
+	ext4_free_blocks(handle, inode, blk, 1);
+	return err;
+}
+
+/*
+ * Free the extent meta data blocks only: every tree block below the root
+ * held in @inode's i_data.  Blocks mapped by the extents are not touched.
+ * Returns 0, or the first error from free_ext_idx().
+ */
+static int free_ext_block(handle_t *handle, struct inode *inode)
+{
+	int i, retval = 0;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data;
+	struct ext4_extent_idx *ix;
+
+	if (eh->eh_depth == 0)
+		/* No extra blocks allocated for extent meta data */
+		return 0;
+	ix = EXT_FIRST_INDEX(eh);
+	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
+		retval = free_ext_idx(handle, inode, ix);
+		if (retval)
+			return retval;
+	}
+	return retval;
+}
+
+/*
+ * Convert @inode from indirect block mapping to extent mapping.
+ *
+ * The extent tree is built in a temporary inode that refers to the data
+ * blocks of @inode and, on success, handed to ext4_ext_swap_inode_data().
+ * @filp, @cmd and @arg are not used.  Returns 0 or a negative errno;
+ * -EINVAL if mounted with noextents or if @inode already uses extents.
+ */
+int ext4_ext_migrate(struct inode *inode, struct file *filp,
+				unsigned int cmd, unsigned long arg)
+{
+	handle_t *handle;
+	int retval = 0, i;
+	__le32 *i_data;
+	ext4_lblk_t blk_count = 0;
+	struct ext4_inode_info *ei;
+	struct inode *tmp_inode = NULL;
+	struct list_blocks_struct lb;
+	unsigned long max_entries;
+
+	/* if mounted with noextents we don't allow the migrate */
+	if (!test_opt(inode->i_sb, EXTENTS))
+		return -EINVAL;
+
+	if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return -EINVAL;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	handle = ext4_journal_start(inode,
+					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+					+ 1);
+	if (IS_ERR(handle)) {
+		/* nothing was set up yet, so there is nothing to undo */
+		retval = PTR_ERR(handle);
+		goto out;
+	}
+	tmp_inode = ext4_new_inode(handle,
+				inode->i_sb->s_root->d_inode,
+				S_IFREG);
+	if (IS_ERR(tmp_inode)) {
+		/* report the real error; the handle is closed right here */
+		retval = PTR_ERR(tmp_inode);
+		ext4_journal_stop(handle);
+		tmp_inode = NULL;
+		goto out;
+	}
+	i_size_write(tmp_inode, i_size_read(inode));
+	/*
+	 * Keep a link so the inode is not reclaimed if we get interrupted
+	 * while it still references the data blocks of the original file.
+	 * i_nlink is zeroed at the last stage, after the switch to extents.
+	 */
+	tmp_inode->i_nlink = 1;
+
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_orphan_add(handle, tmp_inode);
+	ext4_journal_stop(handle);
+
+	ei = EXT4_I(inode);
+	i_data = ei->i_data;
+	memset(&lb, 0, sizeof(lb));
+
+	/* 32 bit block address 4 bytes */
+	max_entries = inode->i_sb->s_blocksize >> 2;
+
+	/*
+	 * Start with one credit for the superblock modification.  The
+	 * transaction that created tmp_inode is already committed; the
+	 * journal is extended as extents are added.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		/*
+		 * The cleanup at err_out needs a valid handle.  No extent
+		 * has been added to tmp_inode yet, so mark it empty and
+		 * unlinked and let the final iput() dispose of it.
+		 */
+		retval = PTR_ERR(handle);
+		i_size_write(tmp_inode, 0);
+		tmp_inode->i_nlink = 0;
+		goto out;
+	}
+	for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+		if (i_data[i]) {
+			retval = update_extent_range(handle, tmp_inode,
+						le32_to_cpu(i_data[i]),
+						blk_count, &lb);
+			if (retval)
+				goto err_out;
+		}
+	}
+	if (i_data[EXT4_IND_BLOCK]) {
+		retval = update_ind_extent_range(handle, tmp_inode,
+					le32_to_cpu(i_data[EXT4_IND_BLOCK]),
+					&blk_count, &lb);
+		if (retval)
+			goto err_out;
+	} else
+		blk_count +=  max_entries;
+	if (i_data[EXT4_DIND_BLOCK]) {
+		retval = update_dind_extent_range(handle, tmp_inode,
+					le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
+					&blk_count, &lb);
+		if (retval)
+			goto err_out;
+	} else
+		blk_count += max_entries * max_entries;
+	if (i_data[EXT4_TIND_BLOCK]) {
+		retval = update_tind_extent_range(handle, tmp_inode,
+					le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
+					&blk_count, &lb);
+		if (retval)
+			goto err_out;
+	}
+	/* Build the last extent */
+	retval = finish_range(handle, tmp_inode, &lb);
+err_out:
+	/*
+	 * We are either freeing extent information or indirect blocks, which
+	 * touches superblock, group descriptor and block bitmap; tmp_inode
+	 * is dirtied later via ext4_ext_tree_init.  So ask for 4 credits,
+	 * plus quota updates (user and group).
+	 * FIXME!! we may be touching bitmaps in different block groups.
+	 */
+	if (ext4_journal_extend(handle,
+			4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0)
+		ext4_journal_restart(handle,
+				4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+	if (retval)
+		/* Failure: delete the extent information of tmp_inode */
+		free_ext_block(handle, tmp_inode);
+	else
+		retval = ext4_ext_swap_inode_data(handle, inode,
+							tmp_inode, retval);
+
+	/* Mark the tmp_inode as of size zero */
+	i_size_write(tmp_inode, 0);
+
+	/*
+	 * Zero i_blocks so that ext4_delete_inode does the right job.  No
+	 * i_lock needed: the inode is not visible to user space.
+	 */
+	tmp_inode->i_blocks = 0;
+
+	/* Reset the extent details */
+	ext4_ext_tree_init(handle, tmp_inode);
+
+	/*
+	 * Set the i_nlink to zero so that
+	 * generic_drop_inode really deletes the
+	 * inode
+	 */
+	tmp_inode->i_nlink = 0;
+
+	ext4_journal_stop(handle);
+out:
+	up_write(&EXT4_I(inode)->i_data_sem);
+
+	if (tmp_inode)
+		iput(tmp_inode);
+
+	return retval;
+}
-- 
cgit v1.2.3


From aa22df2cc84011808ad7227437ac8f0e01030480 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Fix ext4_show_options to show the correct mount options.

We need to look at the default value and make sure
the mount options are not set via default value
before showing them via ext4_show_options

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/super.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index aa22acd6eb06..64fc7f111734 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -665,18 +665,20 @@ static inline void ext4_show_quota_options(struct seq_file *seq, struct super_bl
  */
 static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
+	int def_errors;
+	unsigned long def_mount_opts;
 	struct super_block *sb = vfs->mnt_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	unsigned long def_mount_opts;
 
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	def_errors     = le16_to_cpu(es->s_errors);
 
 	if (sbi->s_sb_block != 1)
 		seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
 	if (test_opt(sb, MINIX_DF))
 		seq_puts(seq, ",minixdf");
-	if (test_opt(sb, GRPID))
+	if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
 		seq_puts(seq, ",grpid");
 	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
 		seq_puts(seq, ",nogrpid");
@@ -689,25 +691,24 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",resgid=%u", sbi->s_resgid);
 	}
 	if (test_opt(sb, ERRORS_RO)) {
-		int def_errors = le16_to_cpu(es->s_errors);
-
 		if (def_errors == EXT4_ERRORS_PANIC ||
 		    def_errors == EXT4_ERRORS_CONTINUE) {
 			seq_puts(seq, ",errors=remount-ro");
 		}
 	}
-	if (test_opt(sb, ERRORS_CONT))
+	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
 		seq_puts(seq, ",errors=continue");
-	if (test_opt(sb, ERRORS_PANIC))
+	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
 		seq_puts(seq, ",errors=panic");
-	if (test_opt(sb, NO_UID32))
+	if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
 		seq_puts(seq, ",nouid32");
-	if (test_opt(sb, DEBUG))
+	if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
 #ifdef CONFIG_EXT4DEV_FS_XATTR
-	if (test_opt(sb, XATTR_USER))
+	if (test_opt(sb, XATTR_USER) &&
+		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
 		seq_puts(seq, ",user_xattr");
 	if (!test_opt(sb, XATTR_USER) &&
 	    (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
@@ -715,7 +716,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	}
 #endif
 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
-	if (test_opt(sb, POSIX_ACL))
+	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",noacl");
@@ -735,6 +736,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 
+	/*
+	 * The journal mode can get enabled in different ways,
+	 * so just print the value even if we didn't specify it
+	 */
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
 		seq_puts(seq, ",data=journal");
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -743,7 +748,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",data=writeback");
 
 	ext4_show_quota_options(seq, sb);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From c549a95d40efd83fc054785dd1634e8b71fba890 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: fix up EXT4FS_DEBUG builds

Builds with EXT4FS_DEBUG defined (to enable ext4_debug()) fail
without these changes.  Clean up some format warnings too.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/balloc.c |  6 +++---
 fs/ext4/ialloc.c |  2 +-
 fs/ext4/resize.c | 16 ++++++++--------
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 7ae223ed152f..80a4616c8244 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1630,7 +1630,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
-	ext4_debug("goal=%lu.\n", goal);
+	ext4_debug("goal=%llu.\n", goal);
 	/*
 	 * Allocate a block from reservation only when
 	 * filesystem is mounted with reservation(default,-o reservation), and
@@ -1740,7 +1740,7 @@ retry_alloc:
 
 allocated:
 
-	ext4_debug("using block group %d(%d)\n",
+	ext4_debug("using block group %lu(%d)\n",
 			group_no, gdp->bg_free_blocks_count);
 
 	BUFFER_TRACE(gdp_bh, "get_write_access");
@@ -1898,7 +1898,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	brelse(bitmap_bh);
 	printk("ext4_count_free_blocks: stored = %llu"
 		", computed = %llu, %llu\n",
-	       EXT4_FREE_BLOCKS_COUNT(es),
+		ext4_free_blocks_count(es),
 		desc_count, bitmap_count);
 	return bitmap_count;
 #else
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 17b5df14f85b..575b5215c808 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -857,7 +857,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 			continue;
 
 		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
-		printk("group %d: stored = %d, counted = %lu\n",
+		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
 			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
 		bitmap_count += x;
 	}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 7090c2d25c76..4fbba60816f4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -206,7 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	}
 
 	if (ext4_bg_has_super(sb, input->group)) {
-		ext4_debug("mark backup superblock %#04lx (+0)\n", start);
+		ext4_debug("mark backup superblock %#04llx (+0)\n", start);
 		ext4_set_bit(0, bh->b_data);
 	}
 
@@ -215,7 +215,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	     i < gdblocks; i++, block++, bit++) {
 		struct buffer_head *gdb;
 
-		ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
+		ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
 
 		if ((err = extend_or_restart_transaction(handle, 1, bh)))
 			goto exit_bh;
@@ -243,7 +243,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	     i < reserved_gdb; i++, block++, bit++) {
 		struct buffer_head *gdb;
 
-		ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
+		ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
 
 		if ((err = extend_or_restart_transaction(handle, 1, bh)))
 			goto exit_bh;
@@ -256,10 +256,10 @@ static int setup_new_group_blocks(struct super_block *sb,
 		ext4_set_bit(bit, bh->b_data);
 		brelse(gdb);
 	}
-	ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
+	ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
 		   input->block_bitmap - start);
 	ext4_set_bit(input->block_bitmap - start, bh->b_data);
-	ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
+	ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap,
 		   input->inode_bitmap - start);
 	ext4_set_bit(input->inode_bitmap - start, bh->b_data);
 
@@ -268,7 +268,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	     i < sbi->s_itb_per_group; i++, bit++, block++) {
 		struct buffer_head *it;
 
-		ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
+		ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
 
 		if ((err = extend_or_restart_transaction(handle, 1, bh)))
 			goto exit_bh;
@@ -291,7 +291,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	brelse(bh);
 
 	/* Mark unused entries in inode bitmap used */
-	ext4_debug("clear inode bitmap %#04x (+%ld)\n",
+	ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
 		   input->inode_bitmap, input->inode_bitmap - start);
 	if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
 		err = PTR_ERR(bh);
@@ -1054,7 +1054,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
+	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
 	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
-- 
cgit v1.2.3


From 1988b51e476bd097d910c9245b53f2e38aedaf0d Mon Sep 17 00:00:00 2001
From: Alex Tomas <alex@clusterfs.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Add new functions for searching extent tree

Add the functions ext4_ext_search_left() and ext4_ext_search_right(),
which are used by mballoc during ext4_ext_get_blocks to decide whether
to merge extent information.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Johann Lombardi <johann@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 01eda5c5281e..f5cf2a94b6fc 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1016,6 +1016,148 @@ out:
 	return err;
 }
 
+/*
+ * Search for the closest allocated block to the left of *logical.
+ * On return *logical holds that block and *phys its physical address.
+ * If no allocated block lies to the left, *phys is 0 and *logical is
+ * left unchanged.  Always returns 0 for now.
+ */
+int
+ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+			ext4_lblk_t *logical, ext4_fsblk_t *phys)
+{
+	struct ext4_extent_idx *ix;
+	struct ext4_extent *ex;
+	int depth, ee_len;
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+	*phys = 0;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0;
+
+	/* usually extent in the path covers blocks smaller
+	 * than *logical, but it can be that extent is the
+	 * first one in the file */
+	ex = path[depth].p_ext;
+	/* raw ee_len also marks uninitialized extents: use the real length */
+	ee_len = ext4_ext_get_actual_len(ex);
+	if (*logical < le32_to_cpu(ex->ee_block)) {
+		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		while (--depth >= 0) {
+			ix = path[depth].p_idx;
+			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+		}
+		return 0;
+	}
+
+	BUG_ON(*logical < le32_to_cpu(ex->ee_block) + ee_len);
+
+	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
+	*phys = ext_pblock(ex) + ee_len - 1;
+	return 0;
+}
+
+/*
+ * Search for the closest allocated block to the right of *logical.
+ * On return *logical holds that block and *phys its physical address.
+ * If no allocated block lies to the right, *phys is 0 and *logical is
+ * left unchanged.
+ * Returns 0, or -EIO if a tree block cannot be read or fails its check.
+ */
+int
+ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+			ext4_lblk_t *logical, ext4_fsblk_t *phys)
+{
+	struct buffer_head *bh = NULL;
+	struct ext4_extent_header *eh;
+	struct ext4_extent_idx *ix;
+	struct ext4_extent *ex;
+	ext4_fsblk_t block;
+	int depth, ee_len;	/* depth counts from the top of the tree */
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+	*phys = 0;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return 0;
+
+	/* usually extent in the path covers blocks smaller
+	 * than *logical, but it can be that extent is the
+	 * first one in the file */
+	ex = path[depth].p_ext;
+	ee_len = ext4_ext_get_actual_len(ex);
+	if (*logical < le32_to_cpu(ex->ee_block)) {
+		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		while (--depth >= 0) {
+			ix = path[depth].p_idx;
+			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+		}
+		*logical = le32_to_cpu(ex->ee_block);
+		*phys = ext_pblock(ex);
+		return 0;
+	}
+
+	BUG_ON(*logical < le32_to_cpu(ex->ee_block) + ee_len);
+
+	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
+		/* next allocated block in this leaf */
+		ex++;
+		*logical = le32_to_cpu(ex->ee_block);
+		*phys = ext_pblock(ex);
+		return 0;
+	}
+
+	/* go up and search for index to the right */
+	while (--depth >= 0) {
+		ix = path[depth].p_idx;
+		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
+			break;
+	}
+
+	if (depth < 0) {
+		/* we've gone up to the root and
+		 * found no index to the right */
+		return 0;
+	}
+
+	/* we've found index to the right, let's
+	 * follow it and find the closest allocated
+	 * block to the right */
+	ix++;
+	block = idx_pblock(ix);
+	while (++depth < path->p_depth) {
+		bh = sb_bread(inode->i_sb, block);
+		if (bh == NULL)
+			return -EIO;
+		eh = ext_block_hdr(bh);
+		/* subtract from p_depth to get the eh_depth of this level */
+		if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+			put_bh(bh);
+			return -EIO;
+		}
+		ix = EXT_FIRST_INDEX(eh);
+		block = idx_pblock(ix);
+		put_bh(bh);
+	}
+
+	bh = sb_bread(inode->i_sb, block);
+	if (bh == NULL)
+		return -EIO;
+	eh = ext_block_hdr(bh);
+	if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+		put_bh(bh);
+		return -EIO;
+	}
+	ex = EXT_FIRST_EXTENT(eh);
+	*logical = le32_to_cpu(ex->ee_block);
+	*phys = ext_pblock(ex);
+	put_bh(bh);
+	return 0;
+}
+
 /*
  * ext4_ext_next_allocated_block:
  * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
-- 
cgit v1.2.3


From c9de560ded61faa5b754137b7753da252391c55a Mon Sep 17 00:00:00 2001
From: Alex Tomas <alex@clusterfs.com>
Date: Tue, 29 Jan 2008 00:19:52 -0500
Subject: ext4: Add multi block allocator for ext4

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Makefile  |    2 +-
 fs/ext4/balloc.c  |   67 +-
 fs/ext4/extents.c |   45 +-
 fs/ext4/inode.c   |   15 +-
 fs/ext4/mballoc.c | 4552 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/migrate.c |   10 +-
 fs/ext4/super.c   |   62 +-
 fs/ext4/xattr.c   |    4 +-
 8 files changed, 4721 insertions(+), 36 deletions(-)
 create mode 100644 fs/ext4/mballoc.c

(limited to 'fs')

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index d5fd80bc0d04..ac6fa8ca0a2f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
 
 ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o migrate.o
+		   ext4_jbd2.o migrate.o mballoc.o
 
 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 80a4616c8244..ac75ea953d83 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
 	struct ext4_reserve_window_node *rsv;
 	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
 
+	ext4_mb_discard_inode_preallocations(inode);
+
 	if (!block_i)
 		return;
 
@@ -785,19 +787,29 @@ error_return:
  * @inode:		inode
  * @block:		start physical block to free
  * @count:		number of blocks to count
+ * @metadata: 		Are these metadata blocks
  */
 void ext4_free_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t block, unsigned long count)
+			ext4_fsblk_t block, unsigned long count,
+			int metadata)
 {
 	struct super_block * sb;
 	unsigned long dquot_freed_blocks;
 
+	/* this isn't the right place to decide whether block is metadata
+	 * inode.c/extents.c knows better, but for safety ... */
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+			ext4_should_journal_data(inode))
+		metadata = 1;
+
 	sb = inode->i_sb;
-	if (!sb) {
-		printk ("ext4_free_blocks: nonexistent device");
-		return;
-	}
-	ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+
+	if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
+		ext4_free_blocks_sb(handle, sb, block, count,
+						&dquot_freed_blocks);
+	else
+		ext4_mb_free_blocks(handle, inode, block, count,
+						metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
@@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 
 /**
- * ext4_new_blocks() -- core block(s) allocation function
+ * ext4_new_blocks_old() -- core block(s) allocation function
  * @handle:		handle to this transaction
  * @inode:		file inode
  * @goal:		given target block(filesystem wide)
@@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * any specific goal block.
  *
  */
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
@@ -1849,13 +1861,46 @@ out:
 }
 
 ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp)
+		ext4_fsblk_t goal, int *errp)
+{
+	struct ext4_allocation_request ar;
+	ext4_fsblk_t ret;
+
+	if (!test_opt(inode->i_sb, MBALLOC)) {
+		unsigned long count = 1;
+		ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
+		return ret;
+	}
+
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = 1;
+	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	return ret;
+}
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+		ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-	unsigned long count = 1;
+	struct ext4_allocation_request ar;
+	ext4_fsblk_t ret;
 
-	return ext4_new_blocks(handle, inode, goal, &count, errp);
+	if (!test_opt(inode->i_sb, MBALLOC)) {
+		ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
+		return ret;
+	}
+
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = *count;
+	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	*count = ar.len;
+	return ret;
 }
 
+
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
  * @sb:		superblock
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f5cf2a94b6fc..0cffb59fff46 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -853,7 +853,7 @@ cleanup:
 		for (i = 0; i < depth; i++) {
 			if (!ablocks[i])
 				continue;
-			ext4_free_blocks(handle, inode, ablocks[i], 1);
+			ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
 		}
 	}
 	kfree(ablocks);
@@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
 	bh = sb_find_get_block(inode->i_sb, leaf);
 	ext4_forget(handle, 1, inode, bh, leaf);
-	ext4_free_blocks(handle, inode, leaf, 1);
+	ext4_free_blocks(handle, inode, leaf, 1, 1);
 	return err;
 }
 
@@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bh;
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
-	int i;
+	int i, metadata = 0;
 
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		metadata = 1;
 #ifdef EXTENTS_STATS
 	{
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			bh = sb_find_get_block(inode->i_sb, start + i);
 			ext4_forget(handle, 0, inode, bh, start + i);
 		}
-		ext4_free_blocks(handle, inode, start, num);
+		ext4_free_blocks(handle, inode, start, num, metadata);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t goal, newblock;
 	int err = 0, depth, ret;
 	unsigned long allocated = 0;
+	struct ext4_allocation_request ar;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
 		ext4_init_block_alloc_info(inode);
 
-	/* allocate new block */
-	goal = ext4_ext_find_goal(inode, path, iblock);
+	/* find neighbour allocated blocks */
+	ar.lleft = iblock;
+	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
+	if (err)
+		goto out2;
+	ar.lright = iblock;
+	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+	if (err)
+		goto out2;
 
 	/*
 	 * See if request is beyond maximum number of blocks we can have in
@@ -2421,7 +2431,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		allocated = le16_to_cpu(newex.ee_len);
 	else
 		allocated = max_blocks;
-	newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
+
+	/* allocate new block */
+	ar.inode = inode;
+	ar.goal = ext4_ext_find_goal(inode, path, iblock);
+	ar.logical = iblock;
+	ar.len = allocated;
+	if (S_ISREG(inode->i_mode))
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 	if (!newblock)
 		goto out2;
 	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@@ -2429,14 +2450,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
-	newex.ee_len = cpu_to_le16(allocated);
+	newex.ee_len = cpu_to_le16(ar.len);
 	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
 		ext4_ext_mark_uninitialized(&newex);
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
 	if (err) {
 		/* free data blocks we just allocated */
+		/* not a good idea to call discard here directly,
+		 * but otherwise we'd need to call it every free() */
+		ext4_mb_discard_inode_preallocations(inode);
 		ext4_free_blocks(handle, inode, ext_pblock(&newex),
-					le16_to_cpu(newex.ee_len));
+					le16_to_cpu(newex.ee_len), 0);
 		goto out2;
 	}
 
@@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
+	allocated = le16_to_cpu(newex.ee_len);
 outnew:
 	__set_bit(BH_New, &bh_result->b_state);
 
@@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 
+	ext4_mb_discard_inode_preallocations(inode);
+
 	/*
 	 * TODO: optimization is possible here.
 	 * Probably we need not scan at all,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a06a3b7cfc34..bb717cbb749c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	return ret;
 failed_out:
 	for (i = 0; i <index; i++)
-		ext4_free_blocks(handle, inode, new_blocks[i], 1);
+		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 	return ret;
 }
 
@@ -650,9 +650,9 @@ failed:
 		ext4_journal_forget(handle, branch[i].bh);
 	}
 	for (i = 0; i <indirect_blks; i++)
-		ext4_free_blocks(handle, inode, new_blocks[i], 1);
+		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 
-	ext4_free_blocks(handle, inode, new_blocks[i], num);
+	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
 
 	return err;
 }
@@ -749,9 +749,10 @@ err_out:
 	for (i = 1; i <= num; i++) {
 		BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
 		ext4_journal_forget(handle, where[i].bh);
-		ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
+		ext4_free_blocks(handle, inode,
+					le32_to_cpu(where[i-1].key), 1, 0);
 	}
-	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
 
 	return err;
 }
@@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 		}
 	}
 
-	ext4_free_blocks(handle, inode, block_to_free, count);
+	ext4_free_blocks(handle, inode, block_to_free, count, 0);
 }
 
 /**
@@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 				ext4_journal_test_restart(handle, inode);
 			}
 
-			ext4_free_blocks(handle, inode, nr, 1);
+			ext4_free_blocks(handle, inode, nr, 1, 1);
 
 			if (parent_bh) {
 				/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 000000000000..76e5fedc0a0b
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,4552 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+/*
+ * mballoc.c contains the multiblocks allocation routines
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include "group.h"
+
+/*
+ * MUSTDO:
+ *   - test ext4_ext_search_left() and ext4_ext_search_right()
+ *   - search for metadata in few groups
+ *
+ * TODO v4:
+ *   - normalization should take into account whether file is still open
+ *   - discard preallocations if no free space left (policy?)
+ *   - don't normalize tails
+ *   - quota
+ *   - reservation for superuser
+ *
+ * TODO v3:
+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ *   - track min/max extents in each group for better group selection
+ *   - mb_mark_used() may allocate chunk right after splitting buddy
+ *   - tree of groups sorted by number of free blocks
+ *   - error handling
+ */
+
+/*
+ * An allocation request asks for multiple blocks near the goal (block)
+ * value specified.
+ *
+ * During the initialization phase of the allocator we decide to use group
+ * preallocation or inode preallocation depending on the size of the file.
+ * The size of the file is the resulting file size we would have after
+ * allocation or the current file size, whichever is larger. If the size is
+ * less than sbi->s_mb_stream_request we select the group
+ * preallocation. The default value of s_mb_stream_request is 16
+ * blocks. This can also be tuned via
+ * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
+ * of number of blocks.
+ *
+ * The main motivation for having small files use group preallocation is to
+ * ensure that small files are placed close together on the disk.
+ *
+ * In the first stage the allocator looks at the inode prealloc list:
+ * ext4_inode_info->i_prealloc_list contains the list of prealloc spaces
+ * for this particular inode. The inode prealloc space is represented as:
+ *
+ * pa_lstart -> the logical start block for this prealloc space
+ * pa_pstart -> the physical start block for this prealloc space
+ * pa_len    -> length of this prealloc space
+ * pa_free   -> free space available in this prealloc space
+ *
+ * The inode preallocation space is used looking at the _logical_ start
+ * block. Only if the logical file block falls within the range of the
+ * prealloc space will we consume that particular prealloc space. This makes
+ * sure that we have contiguous physical blocks representing the file blocks
+ *
+ * The important thing to be noted in case of inode prealloc space is that
+ * we don't modify the values associated to inode prealloc space except
+ * pa_free.
+ *
+ * If we are not able to find blocks in the inode prealloc space and if we
+ * have the group allocation flag set then we look at the locality group
+ * prealloc space. These are per CPU prealloc lists represented as
+ *
+ * ext4_sb_info.s_locality_groups[smp_processor_id()]
+ *
+ * The reason for having a per cpu locality group is to reduce the contention
+ * between CPUs. It is possible to get scheduled at this point.
+ *
+ * The locality group prealloc space is used looking at whether we have
+ * enough free space (pa_free) within the prealloc space.
+ *
+ * If we can't allocate blocks via inode prealloc or/and locality group
+ * prealloc then we look at the buddy cache. The buddy cache is represented
+ * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
+ * mapped to the buddy and bitmap information regarding different
+ * groups. The buddy information is attached to buddy cache inode so that
+ * we can access them through the page cache. The information regarding
+ * each group is loaded via ext4_mb_load_buddy.  The information consists
+ * of the block bitmap and buddy information. The information is stored in
+ * the inode as:
+ *
+ *  {                        page                        }
+ *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.  So for each group we
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * blocksize) blocks.  So it can have information regarding groups_per_page
+ * which is blocks_per_page/2
+ *
+ * The buddy cache inode is not stored on disk. The inode is thrown
+ * away when the filesystem is unmounted.
+ *
+ * We look for count number of blocks in the buddy cache. If we were able
+ * to locate that many free blocks we return with additional information
+ * regarding rest of the contiguous physical block available
+ *
+ * Before allocating blocks via buddy cache we normalize the request
+ * blocks. This ensures we ask for more blocks than we needed. The extra
+ * blocks that we get after allocation are added to the respective prealloc
+ * list. In case of inode preallocation we follow a list of heuristics
+ * based on file size. This can be found in ext4_mb_normalize_request. If
+ * we are doing a group prealloc we try to normalize the request to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * 512 blocks. This can be tuned via
+ * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in
+ * terms of number of blocks. If we have mounted the file system with -O
+ * stripe=<value> option the group prealloc request is normalized to the
+ * stripe value (sbi->s_stripe)
+ *
+ * The regular allocator (using the buddy cache) supports a few tunables.
+ *
+ * /proc/fs/ext4/<partition>/min_to_scan
+ * /proc/fs/ext4/<partition>/max_to_scan
+ * /proc/fs/ext4/<partition>/order2_req
+ *
+ * The regular allocator uses buddy scan only if the request len is a power
+ * of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
+ * value of s_mb_order2_reqs can be tuned via
+ * /proc/fs/ext4/<partition>/order2_req.  If the request len is equal to
+ * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not we search in the specific group using bitmap for best extents. The
+ * tunables min_to_scan and max_to_scan control the behaviour here.
+ * min_to_scan indicates how long the mballoc __must__ look for a best
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
+ * best extent in the found extents. Searching for the blocks starts with
+ * the group specified as the goal value in allocation context via
+ * ac_g_ex. Each group is first checked based on the criteria whether it
+ * can be used for allocation. ext4_mb_good_group explains how the groups
+ * are checked.
+ *
+ * Both prealloc spaces are populated as above. So for the first
+ * request we will hit the buddy cache which will result in this prealloc
+ * space getting filled. The prealloc space is then later used for the
+ * subsequent request.
+ */
+
+/*
+ * mballoc operates on the following data:
+ *  - on-disk bitmap
+ *  - in-core buddy (actually includes buddy and bitmap)
+ *  - preallocation descriptors (PAs)
+ *
+ * there are two types of preallocations:
+ *  - inode
+ *    assigned to specific inode and can be used for this inode only.
+ *    it describes part of inode's space preallocated to specific
+ *    physical blocks. any block from that preallocated can be used
+ *    independent. the descriptor just tracks number of blocks left
+ *    unused. so, before taking some block from descriptor, one must
+ *    make sure corresponded logical block isn't allocated yet. this
+ *    also means that freeing any block within descriptor's range
+ *    must discard all preallocated blocks.
+ *  - locality group
+ *    assigned to specific locality group which does not translate to
+ *    permanent set of inodes: inode can join and leave group. space
+ *    from this type of preallocation can be used for any inode. thus
+ *    it's consumed from the beginning to the end.
+ *
+ * relation between them can be expressed as:
+ *    in-core buddy = on-disk bitmap + preallocation descriptors
+ *
+ * this means blocks mballoc considers used are:
+ *  - allocated blocks (persistent)
+ *  - preallocated blocks (non-persistent)
+ *
+ * consistency in mballoc world means that at any time a block is either
+ * free or used in ALL structures. notice: "any time" should not be read
+ * literally -- time is discrete and delimited by locks.
+ *
+ *  to keep it simple, we don't use block numbers, instead we count number of
+ *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
+ *
+ * all operations can be expressed as:
+ *  - init buddy:			buddy = on-disk + PAs
+ *  - new PA:				buddy += N; PA = N
+ *  - use inode PA:			on-disk += N; PA -= N
+ *  - discard inode PA			buddy -= on-disk - PA; PA = 0
+ *  - use locality group PA		on-disk += N; PA -= N
+ *  - discard locality group PA		buddy -= PA; PA = 0
+ *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
+ *        is used in real operation because we can't know actual used
+ *        bits from PA, only from on-disk bitmap
+ *
+ * if we follow this strict logic, then all operations above should be atomic.
+ * given some of them can block, we'd have to use something like semaphores
+ * killing performance on high-end SMP hardware. let's try to relax it using
+ * the following knowledge:
+ *  1) if buddy is referenced, it's already initialized
+ *  2) while block is used in buddy and the buddy is referenced,
+ *     nobody can re-allocate that block
+ *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
+ *     bit set and PA claims same block, it's OK. IOW, one can set bit in
+ *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
+ *     block
+ *
+ * so, now we're building a concurrency table:
+ *  - init buddy vs.
+ *    - new PA
+ *      blocks for PA are allocated in the buddy, buddy must be referenced
+ *      until PA is linked to allocation group to avoid concurrent buddy init
+ *    - use inode PA
+ *      we need to make sure that either on-disk bitmap or PA has uptodate data
+ *      given (3) we care that PA-=N operation doesn't interfere with init
+ *    - discard inode PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *    - use locality group PA
+ *      again PA-=N must be serialized with init
+ *    - discard locality group PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *  - new PA vs.
+ *    - use inode PA
+ *      i_data_sem serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      some mutex should serialize them
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *  - use inode PA
+ *    - use inode PA
+ *      i_data_sem or another mutex should serialize them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      nothing wrong here -- they're different PAs covering different blocks
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *
+ * now we're ready to draw a few conclusions:
+ *  - PA is referenced and while it is no discard is possible
+ *  - PA is referenced until block isn't marked in on-disk bitmap
+ *  - PA changes only after on-disk bitmap
+ *  - discard must not compete with init. either init is done before
+ *    any discard or they're serialized somehow
+ *  - buddy init as sum of on-disk bitmap and PAs is done atomically
+ *
+ * a special case when we've used PA to emptiness. no need to modify buddy
+ * in this case, but we should care about concurrent init
+ *
+ */
+
+ /*
+ * Logic in few words:
+ *
+ *  - allocation:
+ *    load group
+ *    find blocks
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - use preallocation:
+ *    find proper PA (per-inode or group)
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *    release PA
+ *
+ *  - free:
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - discard preallocations in group:
+ *    mark PAs deleted
+ *    move them onto local list
+ *    load on-disk bitmap
+ *    load group
+ *    remove PA from object (inode or locality group)
+ *    mark free blocks in-core
+ *
+ *  - discard inode's preallocations:
+ */
+
+/*
+ * Locking rules
+ *
+ * Locks:
+ *  - bitlock on a group	(group)
+ *  - object (inode/locality)	(object)
+ *  - per-pa lock		(pa)
+ *
+ * Paths:
+ *  - new pa
+ *    object
+ *    group
+ *
+ *  - find and use pa:
+ *    pa
+ *
+ *  - release consumed pa:
+ *    pa
+ *    group
+ *    object
+ *
+ *  - generate in-core bitmap:
+ *    group
+ *        pa
+ *
+ *  - discard all for given object (inode, locality group):
+ *    object
+ *        pa
+ *    group
+ *
+ *  - discard all for given group:
+ *    group
+ *        pa
+ *    group
+ *        object
+ *
+ */
+
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/* with MB_DEBUG defined mb_debug() expands to printk(); otherwise it
+ * expands to nothing */
+#define MB_DEBUG__
+#ifdef MB_DEBUG
+#define mb_debug(fmt, a...)	printk(fmt, ##a)
+#else
+#define mb_debug(fmt, a...)
+#endif
+
+/*
+ * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
+ * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
+ */
+#define EXT4_MB_HISTORY
+#define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
+#define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
+#define EXT4_MB_HISTORY_DISCARD		4	/* preallocation discarded */
+#define EXT4_MB_HISTORY_FREE		8	/* free */
+
+#define EXT4_MB_HISTORY_DEFAULT		(EXT4_MB_HISTORY_ALLOC | \
+					 EXT4_MB_HISTORY_PREALLOC)
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN		200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN		10
+
+/*
+ * How many groups mballoc will scan looking for the best chunk
+ */
+#define MB_DEFAULT_MAX_GROUPS_TO_SCAN	5
+
+/*
+ * with 'ext4_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS		1
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, whose purpose is to pack requests
+ * as close to each other as possible to produce smooth I/O traffic.
+ * We use locality group prealloc space for stream request.
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD	16	/* 64K */
+
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS		2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC	512
+
+static struct kmem_cache *ext4_pspace_cachep;
+
+#ifdef EXT4_BB_MAX_BLOCKS
+#undef EXT4_BB_MAX_BLOCKS
+#endif
+#define EXT4_BB_MAX_BLOCKS	30
+
+struct ext4_free_metadata {
+	ext4_group_t group;
+	unsigned short num;
+	ext4_grpblk_t  blocks[EXT4_BB_MAX_BLOCKS];
+	struct list_head list;
+};
+
+struct ext4_group_info {
+	unsigned long	bb_state;
+	unsigned long	bb_tid;
+	struct ext4_free_metadata *bb_md_cur;
+	unsigned short	bb_first_free;
+	unsigned short	bb_free;
+	unsigned short	bb_fragments;
+	struct		list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+	void		*bb_bitmap;
+#endif
+	unsigned short	bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
+#define EXT4_GROUP_INFO_LOCKED_BIT	1
+
+#define EXT4_MB_GRP_NEED_INIT(grp)	\
+	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+
+struct ext4_prealloc_space {
+	struct list_head	pa_inode_list;
+	struct list_head	pa_group_list;
+	union {
+		struct list_head pa_tmp_list;
+		struct rcu_head	pa_rcu;
+	} u;
+	spinlock_t		pa_lock;
+	atomic_t		pa_count;
+	unsigned		pa_deleted;
+	ext4_fsblk_t		pa_pstart;	/* phys. block */
+	ext4_lblk_t		pa_lstart;	/* log. block */
+	unsigned short		pa_len;		/* len of preallocated chunk */
+	unsigned short		pa_free;	/* how many blocks are free */
+	unsigned short		pa_linear;	/* consumed in one direction
+						 * strictly, for grp prealloc */
+	spinlock_t		*pa_obj_lock;
+	struct inode		*pa_inode;	/* hack, for history only */
+};
+
+
+struct ext4_free_extent {
+	ext4_lblk_t fe_logical;
+	ext4_grpblk_t fe_start;
+	ext4_group_t fe_group;
+	int fe_len;
+};
+
+/*
+ * Locality group:
+ *   we try to group all related changes together
+ *   so that writeback can flush/allocate them together as well
+ */
+struct ext4_locality_group {
+	/* for allocator */
+	struct mutex		lg_mutex;	/* to serialize allocates */
+	struct list_head	lg_prealloc_list;/* list of preallocations */
+	spinlock_t		lg_prealloc_lock;
+};
+
+struct ext4_allocation_context {
+	struct inode *ac_inode;
+	struct super_block *ac_sb;
+
+	/* original request */
+	struct ext4_free_extent ac_o_ex;
+
+	/* goal request (after normalization) */
+	struct ext4_free_extent ac_g_ex;
+
+	/* the best found extent */
+	struct ext4_free_extent ac_b_ex;
+
+	/* copy of the best found extent taken before preallocation efforts */
+	struct ext4_free_extent ac_f_ex;
+
+	/* number of iterations done. we have to track to limit searching */
+	unsigned long ac_ex_scanned;
+	__u16 ac_groups_scanned;
+	__u16 ac_found;
+	__u16 ac_tail;
+	__u16 ac_buddy;
+	__u16 ac_flags;		/* allocation hints */
+	__u8 ac_status;
+	__u8 ac_criteria;
+	__u8 ac_repeats;
+	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
+				 * N > 0, the field stores N, otherwise 0 */
+	__u8 ac_op;		/* operation, for history only */
+	struct page *ac_bitmap_page;
+	struct page *ac_buddy_page;
+	struct ext4_prealloc_space *ac_pa;
+	struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE	1
+#define AC_STATUS_FOUND		2
+#define AC_STATUS_BREAK		3
+
+struct ext4_mb_history {
+	struct ext4_free_extent orig;	/* orig allocation */
+	struct ext4_free_extent goal;	/* goal allocation */
+	struct ext4_free_extent result;	/* result allocation */
+	unsigned pid;
+	unsigned ino;
+	__u16 found;	/* how many extents have been found */
+	__u16 groups;	/* how many groups have been scanned */
+	__u16 tail;	/* what tail broke some buddy */
+	__u16 buddy;	/* buddy the tail ^^^ broke */
+	__u16 flags;
+	__u8 cr:3;	/* which phase the result extent was found at */
+	__u8 op:4;
+	__u8 merged:1;
+};
+
+struct ext4_buddy {
+	struct page *bd_buddy_page;
+	void *bd_buddy;
+	struct page *bd_bitmap_page;
+	void *bd_bitmap;
+	struct ext4_group_info *bd_info;
+	struct super_block *bd_sb;
+	__u16 bd_blkbits;
+	ext4_group_t bd_group;
+};
+#define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
+#define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
+
+#ifndef EXT4_MB_HISTORY
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+	return;
+}
+#else
+static void ext4_mb_store_history(struct ext4_allocation_context *ac);
+#endif
+
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
+static struct proc_dir_entry *proc_root_ext4;
+struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
+ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, unsigned long *count, int *errp);
+
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group);
+static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
+static void ext4_mb_free_committed_blocks(struct super_block *);
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+					struct ext4_buddy *e4b, sector_t block,
+					int count);
+static void ext4_mb_put_pa(struct ext4_allocation_context *,
+			struct super_block *, struct ext4_prealloc_space *pa);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+						&(grinfo->bb_state));
+}
+
+static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+					struct ext4_free_extent *fex)
+{
+	ext4_fsblk_t block;
+
+	block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
+			+ fex->fe_start
+			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	return block;
+}
+
+#if BITS_PER_LONG == 64
+#define mb_correct_addr_and_bit(bit, addr)		\
+{							\
+	bit += ((unsigned long) addr & 7UL) << 3;	\
+	addr = (void *) ((unsigned long) addr & ~7UL);	\
+}
+#elif BITS_PER_LONG == 32
+#define mb_correct_addr_and_bit(bit, addr)		\
+{							\
+	bit += ((unsigned long) addr & 3UL) << 3;	\
+	addr = (void *) ((unsigned long) addr & ~3UL);	\
+}
+#else
+#error "how many bits you are?!"
+#endif
+
+static inline int mb_test_bit(int bit, void *addr)
+{
+	/*
+	 * ext4_test_bit on architecture like powerpc
+	 * needs unsigned long aligned address
+	 */
+	mb_correct_addr_and_bit(bit, addr);
+	return ext4_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit, addr);
+	ext4_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit, addr);
+	ext4_set_bit_atomic(lock, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit, addr);
+	ext4_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
+{
+	mb_correct_addr_and_bit(bit, addr);
+	ext4_clear_bit_atomic(lock, bit, addr);
+}
+
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+{
+	char *bb;
+
+	/* FIXME!! is this needed */
+	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+	BUG_ON(max == NULL);
+
+	if (order > e4b->bd_blkbits + 1) {
+		*max = 0;
+		return NULL;
+	}
+
+	/* at order 0 we see each particular block */
+	*max = 1 << (e4b->bd_blkbits + 3);
+	if (order == 0)
+		return EXT4_MB_BITMAP(e4b);
+
+	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
+
+	return bb;
+}
+
+#ifdef DOUBLE_CHECK
+static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
+			   int first, int count)
+{
+	int i;
+	struct super_block *sb = e4b->bd_sb;
+
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+	BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+	for (i = 0; i < count; i++) {
+		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
+			ext4_fsblk_t blocknr;
+			blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+			blocknr += first + i;
+			blocknr +=
+			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+
+			ext4_error(sb, __FUNCTION__, "double-free of inode"
+				   " %lu's block %llu(bit %u in group %lu)\n",
+				   inode ? inode->i_ino : 0, blocknr,
+				   first + i, e4b->bd_group);
+		}
+		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
+	}
+}
+
+static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
+{
+	int i;
+
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	for (i = 0; i < count; i++) {
+		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
+		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
+	}
+}
+
+static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
+		unsigned char *b1, *b2;
+		int i;
+		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
+		b2 = (unsigned char *) bitmap;
+		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
+			if (b1[i] != b2[i]) {
+				printk("corruption in group %lu at byte %u(%u):"
+				       " %x in copy != %x on disk/prealloc\n",
+					e4b->bd_group, i, i * 8, b1[i], b2[i]);
+				BUG();
+			}
+		}
+	}
+}
+
+#else
+static inline void mb_free_blocks_double(struct inode *inode,
+				struct ext4_buddy *e4b, int first, int count)
+{
+	return;
+}
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+						int first, int count)
+{
+	return;
+}
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+	return;
+}
+#endif
+
+#ifdef AGGRESSIVE_CHECK
+
+#define MB_CHECK_ASSERT(assert)						\
+do {									\
+	if (!(assert)) {						\
+		printk(KERN_EMERG					\
+			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
+			function, file, line, # assert);		\
+		BUG();							\
+	}								\
+} while (0)
+
+/*
+ * Sampled (1 in 100 calls) consistency check of a loaded buddy: adjacent
+ * orders, per-order counters, fragment count and the group's PAs must all
+ * agree with the bitmap.  A mismatch BUG()s via MB_CHECK_ASSERT; returns 0.
+ */
+static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+				const char *function, int line)
+{
+	struct super_block *sb = e4b->bd_sb;
+	int order = e4b->bd_blkbits + 1;
+	int max, max2;
+	int i, j, k, count;
+	struct ext4_group_info *grp;
+	int fragments = 0;
+	int fstart;
+	struct list_head *cur;
+	void *buddy, *buddy2;
+
+	if (!test_opt(sb, MBALLOC))
+		return 0;
+
+	{
+		static int mb_check_counter;
+		if (mb_check_counter++ % 100 != 0)
+			return 0;
+	}
+
+	while (order > 1) {
+		buddy = mb_find_buddy(e4b, order, &max);
+		MB_CHECK_ASSERT(buddy);
+		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
+		MB_CHECK_ASSERT(buddy2);
+		MB_CHECK_ASSERT(buddy != buddy2);
+		MB_CHECK_ASSERT(max * 2 == max2);
+
+		count = 0;
+		for (i = 0; i < max; i++) {
+
+			if (mb_test_bit(i, buddy)) {
+				/* only single bit in buddy2 may be 0 */
+				if (!mb_test_bit(i << 1, buddy2)) {
+					MB_CHECK_ASSERT(
+						mb_test_bit((i<<1)+1, buddy2));
+				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
+					MB_CHECK_ASSERT(
+						mb_test_bit(i << 1, buddy2));
+				}
+				continue;
+			}
+
+			/* both bits in buddy2 must be 1 */
+			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
+			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+
+			for (j = 0; j < (1 << order); j++) {
+				k = (i * (1 << order)) + j;
+				MB_CHECK_ASSERT(
+					!mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+			}
+			count++;
+		}
+		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
+		order--;
+	}
+
+	fstart = -1;
+	buddy = mb_find_buddy(e4b, 0, &max);
+	for (i = 0; i < max; i++) {
+		if (!mb_test_bit(i, buddy)) {
+			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
+			if (fstart == -1) {
+				fragments++;
+				fstart = i;
+			}
+			continue;
+		}
+		fstart = -1;
+		/* check used bits only */
+		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
+			buddy2 = mb_find_buddy(e4b, j, &max2);
+			k = i >> j;
+			MB_CHECK_ASSERT(k < max2);
+			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+		}
+	}
+	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
+	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
+
+	grp = ext4_get_group_info(sb, e4b->bd_group);
+	buddy = mb_find_buddy(e4b, 0, &max);
+	list_for_each(cur, &grp->bb_prealloc_list) {
+		ext4_group_t groupnr;
+		struct ext4_prealloc_space *pa;
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
+		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
+		for (i = 0; i < pa->pa_len; i++)
+			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
+	}
+	return 0;
+}
+#undef MB_CHECK_ASSERT
+#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
+					__FILE__, __FUNCTION__, __LINE__)
+#else
+#define mb_check_buddy(e4b)
+#endif
+
+/* FIXME!! need more doc */
+static void ext4_mb_mark_free_simple(struct super_block *sb,
+				void *buddy, unsigned first, int len,
+					struct ext4_group_info *grp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned short min;
+	unsigned short max;
+	unsigned short chunk;
+	unsigned short border;
+
+	BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
+
+	border = 2 << sb->s_blocksize_bits;
+
+	while (len > 0) {
+		/* find how many blocks can be covered since this position */
+		max = ffs(first | border) - 1;
+
+		/* find how many blocks of power 2 we need to mark */
+		min = fls(len) - 1;
+
+		if (max < min)
+			min = max;
+		chunk = 1 << min;
+
+		/* mark multiblock chunks only */
+		grp->bb_counters[min]++;
+		if (min > 0)
+			mb_clear_bit(first >> min,
+				     buddy + sbi->s_mb_offsets[min]);
+
+		len -= chunk;
+		first += chunk;
+	}
+}
+
+static void ext4_mb_generate_buddy(struct super_block *sb,
+				void *buddy, void *bitmap, ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
+	unsigned short i = 0;
+	unsigned short first;
+	unsigned short len;
+	unsigned free = 0;
+	unsigned fragments = 0;
+	unsigned long long period = get_cycles();
+
+	/* initialize buddy from bitmap which is aggregation
+	 * of on-disk bitmap and preallocations */
+	i = ext4_find_next_zero_bit(bitmap, max, 0);
+	grp->bb_first_free = i;
+	while (i < max) {
+		fragments++;
+		first = i;
+		i = ext4_find_next_bit(bitmap, max, i);
+		len = i - first;
+		free += len;
+		if (len > 1)
+			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
+		else
+			grp->bb_counters[0]++;
+		if (i < max)
+			i = ext4_find_next_zero_bit(bitmap, max, i);
+	}
+	grp->bb_fragments = fragments;
+
+	if (free != grp->bb_free) {
+		printk(KERN_DEBUG
+			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+			group, free, grp->bb_free);
+		grp->bb_free = free;
+	}
+
+	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+
+	period = get_cycles() - period;
+	spin_lock(&EXT4_SB(sb)->s_bal_lock);
+	EXT4_SB(sb)->s_mb_buddies_generated++;
+	EXT4_SB(sb)->s_mb_generation_time += period;
+	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
+}
+
+/* The buddy information is attached to the buddy cache inode
+ * for convenience. The information regarding each group
+ * is loaded via ext4_mb_load_buddy. The information consists of
+ * the block bitmap and buddy information. The information is
+ * stored in the inode as
+ *
+ * {                        page                        }
+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.
+ * So for each group we take up 2 blocks. A page can
+ * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
+ * So it can have information regarding groups_per_page which
+ * is blocks_per_page/2
+ */
+
+/*
+ * Fill one page of the buddy cache inode.
+ *
+ * Even-numbered blocks of the cache hold a group's in-core block bitmap
+ * (a copy of the on-disk bitmap with preallocated blocks marked in use);
+ * the odd-numbered block that follows holds the buddy data generated from
+ * that bitmap.
+ *
+ * @page:   buddy cache page to fill; it is marked up to date on success
+ * @incore: bitmap to build the buddy from when the first block of @page is
+ *          a buddy block; must be NULL when the page starts with a bitmap
+ *          block (both conditions are enforced with BUG_ON)
+ *
+ * Returns 0 on success, -ENOMEM if the buffer_head array or a bitmap buffer
+ * cannot be obtained, or -EIO if a group descriptor is missing or a bitmap
+ * block is not up to date after the read.
+ */
+static int ext4_mb_init_cache(struct page *page, char *incore)
+{
+	int blocksize;
+	int blocks_per_page;
+	int groups_per_page;
+	int err = 0;
+	int i;
+	ext4_group_t first_group;
+	int first_block;
+	struct super_block *sb;
+	/* single-slot array for the one-group-per-page case; it must start
+	 * out NULL because the loops below walk bh[] up to the first NULL */
+	struct buffer_head *bhs = NULL;
+	struct buffer_head **bh;
+	struct inode *inode;
+	char *data;
+	char *bitmap;
+
+	mb_debug("init page %lu\n", page->index);
+
+	inode = page->mapping->host;
+	sb = inode->i_sb;
+	blocksize = 1 << inode->i_blkbits;
+	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+
+	/* allocate buffer_heads to read bitmaps */
+	if (groups_per_page > 1) {
+		err = -ENOMEM;
+		i = sizeof(struct buffer_head *) * groups_per_page;
+		bh = kzalloc(i, GFP_NOFS);
+		if (bh == NULL)
+			goto out;
+	} else
+		bh = &bhs;
+
+	first_group = page->index * blocks_per_page / 2;
+
+	/* read all groups the page covers into the cache */
+	for (i = 0; i < groups_per_page; i++) {
+		struct ext4_group_desc *desc;
+
+		if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+			break;
+
+		err = -EIO;
+		desc = ext4_get_group_desc(sb, first_group + i, NULL);
+		if (desc == NULL)
+			goto out;
+
+		err = -ENOMEM;
+		bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
+		if (bh[i] == NULL)
+			goto out;
+
+		if (bh_uptodate_or_lock(bh[i]))
+			continue;
+
+		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+			/* bitmap was never written: build it in memory */
+			ext4_init_block_bitmap(sb, bh[i],
+						first_group + i, desc);
+			set_buffer_uptodate(bh[i]);
+			unlock_buffer(bh[i]);
+			continue;
+		}
+		get_bh(bh[i]);
+		bh[i]->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh[i]);
+		mb_debug("read bitmap for group %lu\n", first_group + i);
+	}
+
+	/* wait for I/O completion */
+	for (i = 0; i < groups_per_page && bh[i]; i++)
+		wait_on_buffer(bh[i]);
+
+	err = -EIO;
+	for (i = 0; i < groups_per_page && bh[i]; i++)
+		if (!buffer_uptodate(bh[i]))
+			goto out;
+
+	first_block = page->index * blocks_per_page;
+	for (i = 0; i < blocks_per_page; i++) {
+		int group;
+		struct ext4_group_info *grinfo;
+
+		group = (first_block + i) >> 1;
+		if (group >= EXT4_SB(sb)->s_groups_count)
+			break;
+
+		/*
+		 * data carry information regarding this
+		 * particular group in the format specified
+		 * above
+		 *
+		 */
+		data = page_address(page) + (i * blocksize);
+		bitmap = bh[group - first_group]->b_data;
+
+		/*
+		 * We place the buddy block and bitmap block
+		 * close together
+		 */
+		if ((first_block + i) & 1) {
+			/* this is block of buddy */
+			BUG_ON(incore == NULL);
+			mb_debug("put buddy for group %u in page %lu/%x\n",
+				group, page->index, i * blocksize);
+			memset(data, 0xff, blocksize);
+			grinfo = ext4_get_group_info(sb, group);
+			grinfo->bb_fragments = 0;
+			memset(grinfo->bb_counters, 0,
+			       sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+			/*
+			 * incore got set to the group block bitmap below
+			 */
+			ext4_mb_generate_buddy(sb, data, incore, group);
+			incore = NULL;
+		} else {
+			/* this is block of bitmap */
+			BUG_ON(incore != NULL);
+			mb_debug("put bitmap for group %u in page %lu/%x\n",
+				group, page->index, i * blocksize);
+
+			/* see comments in ext4_mb_put_pa() */
+			ext4_lock_group(sb, group);
+			memcpy(data, bitmap, blocksize);
+
+			/* mark all preallocated blks used in in-core bitmap */
+			ext4_mb_generate_from_pa(sb, data, group);
+			ext4_unlock_group(sb, group);
+
+			/* set incore so that the buddy information can be
+			 * generated using this
+			 */
+			incore = data;
+		}
+	}
+	SetPageUptodate(page);
+	/* everything succeeded: do not leak the -EIO staged above */
+	err = 0;
+
+out:
+	if (bh) {
+		for (i = 0; i < groups_per_page && bh[i]; i++)
+			brelse(bh[i]);
+		if (bh != &bhs)
+			kfree(bh);
+	}
+	return err;
+}
+
+/*
+ * Look up (and build, if needed) the buddy cache pages of @group and
+ * describe them in @e4b.
+ *
+ * On success @e4b holds one page reference for the bitmap page and one for
+ * the buddy page; ext4_mb_release_desc() drops them.  On failure no
+ * reference is kept and the page and data pointers in @e4b are NULL.
+ *
+ * Returns 0 on success or -EIO if either page could not be obtained or
+ * brought up to date.
+ */
+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+		struct ext4_buddy *e4b)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	int blocks_per_page;
+	int block;
+	int pnum;
+	int poff;
+	struct page *page;
+
+	mb_debug("load group %lu\n", group);
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+	e4b->bd_blkbits = sb->s_blocksize_bits;
+	e4b->bd_info = ext4_get_group_info(sb, group);
+	e4b->bd_sb = sb;
+	e4b->bd_group = group;
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_bitmap_page = NULL;
+
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+
+	/* we could use find_or_create_page(), but it locks page
+	 * what we'd like to avoid in fast path ... */
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page == NULL || !PageUptodate(page)) {
+		if (page)
+			page_cache_release(page);
+		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+		if (page) {
+			BUG_ON(page->mapping != inode->i_mapping);
+			if (!PageUptodate(page)) {
+				ext4_mb_init_cache(page, NULL);
+				mb_cmp_bitmaps(e4b, page_address(page) +
+					       (poff * sb->s_blocksize));
+			}
+			unlock_page(page);
+		}
+	}
+	if (page == NULL || !PageUptodate(page))
+		goto err;
+	e4b->bd_bitmap_page = page;
+	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+	mark_page_accessed(page);
+
+	/* the buddy block directly follows the bitmap block */
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+
+	page = find_get_page(inode->i_mapping, pnum);
+	if (page == NULL || !PageUptodate(page)) {
+		if (page)
+			page_cache_release(page);
+		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+		if (page) {
+			BUG_ON(page->mapping != inode->i_mapping);
+			if (!PageUptodate(page))
+				ext4_mb_init_cache(page, e4b->bd_bitmap);
+
+			unlock_page(page);
+		}
+	}
+	if (page == NULL || !PageUptodate(page))
+		goto err;
+	e4b->bd_buddy_page = page;
+	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+	mark_page_accessed(page);
+
+	BUG_ON(e4b->bd_bitmap_page == NULL);
+	BUG_ON(e4b->bd_buddy_page == NULL);
+
+	return 0;
+
+err:
+	/* a page that was looked up but never became up to date still
+	 * carries the reference taken by the lookup; it has not been
+	 * stored in e4b yet, so drop it here */
+	if (page)
+		page_cache_release(page);
+	if (e4b->bd_bitmap_page)
+		page_cache_release(e4b->bd_bitmap_page);
+	if (e4b->bd_buddy_page)
+		page_cache_release(e4b->bd_buddy_page);
+	/* leave nothing behind that a later release could drop again */
+	e4b->bd_bitmap_page = NULL;
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_buddy = NULL;
+	e4b->bd_bitmap = NULL;
+	return -EIO;
+}
+
+/* Drop the bitmap and buddy page references recorded in @e4b, if any. */
+static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+{
+	struct page *bitmap_page = e4b->bd_bitmap_page;
+	struct page *buddy_page = e4b->bd_buddy_page;
+
+	if (bitmap_page != NULL)
+		page_cache_release(bitmap_page);
+	if (buddy_page != NULL)
+		page_cache_release(buddy_page);
+}
+
+
+/*
+ * Walk the buddy levels from order 1 upwards and return the first order
+ * at which the chunk containing @block has its buddy bit clear.
+ * Returns 0 when no level up to bd_blkbits + 1 has the bit clear.
+ */
+static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
+{
+	void *level;
+	int order;
+
+	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
+
+	level = EXT4_MB_BUDDY(e4b);
+	for (order = 1; order <= e4b->bd_blkbits + 1; order++) {
+		/* index of the enclosing chunk at this order */
+		block >>= 1;
+		if (!mb_test_bit(block, level))
+			return order;
+
+		/* step over this level's bits to reach the next order */
+		level += 1 << (e4b->bd_blkbits - order);
+	}
+	return 0;
+}
+
+/*
+ * Clear @len bits of bitmap @bm starting at bit @cur.  A run that begins
+ * on a 32-bit boundary with at least 32 bits left is zeroed with a plain
+ * word store; every other bit goes through mb_clear_bit_atomic() with @lock.
+ */
+static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+{
+	const int end = cur + len;
+	__u32 *word;
+
+	while (cur < end) {
+		if ((cur & 31) != 0 || (end - cur) < 32) {
+			/* unaligned, or fewer than 32 bits remain */
+			mb_clear_bit_atomic(lock, cur, bm);
+			cur++;
+		} else {
+			/* whole aligned word in one store */
+			word = bm + (cur >> 3);
+			*word = 0;
+			cur += 32;
+		}
+	}
+}
+
+/*
+ * Set @len bits of bitmap @bm starting at bit @cur.  A run that begins on
+ * a 32-bit boundary with at least 32 bits left is filled with a plain word
+ * store; every other bit goes through mb_set_bit_atomic() with @lock.
+ */
+static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+{
+	const int end = cur + len;
+	__u32 *word;
+
+	while (cur < end) {
+		if ((cur & 31) != 0 || (end - cur) < 32) {
+			/* unaligned, or fewer than 32 bits remain */
+			mb_set_bit_atomic(lock, cur, bm);
+			cur++;
+		} else {
+			/* whole aligned word in one store */
+			word = bm + (cur >> 3);
+			*word = 0xffffffff;
+			cur += 32;
+		}
+	}
+}
+
+/*
+ * Return @count blocks starting at bit @first of the group described by
+ * @e4b to the in-core bitmap and the buddy data, updating the group's
+ * free, first-free, fragment and per-order counters.
+ *
+ * The group lock must be held (enforced with BUG_ON).  @inode is passed to
+ * mb_free_blocks_double() and used in the double-free report, which prints
+ * inode number 0 when @inode is NULL.  Always returns 0.
+ */
+static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+			  int first, int count)
+{
+	int block = 0;
+	int max = 0;
+	int order;
+	void *buddy;
+	void *buddy2;
+	struct super_block *sb = e4b->bd_sb;
+
+	BUG_ON(first + count > (sb->s_blocksize << 3));
+	BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+	mb_check_buddy(e4b);
+	mb_free_blocks_double(inode, e4b, first, count);
+
+	e4b->bd_info->bb_free += count;
+	if (first < e4b->bd_info->bb_first_free)
+		e4b->bd_info->bb_first_free = first;
+
+	/* let's maintain fragments counter:
+	 * 'block' is reused here as "the bit before the range is free" and
+	 * 'max' as "the bit after the range is free" */
+	if (first != 0)
+		block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
+	if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
+		max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
+	if (block && max)
+		/* the freed range joins two free fragments into one */
+		e4b->bd_info->bb_fragments--;
+	else if (!block && !max)
+		/* the freed range is a new, isolated free fragment */
+		e4b->bd_info->bb_fragments++;
+
+	/* let's maintain buddy itself */
+	while (count-- > 0) {
+		block = first++;
+		order = 0;
+
+		/* a clear bit means the block is already free: report it */
+		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
+			ext4_fsblk_t blocknr;
+			blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+			blocknr += block;
+			blocknr +=
+			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+
+			ext4_error(sb, __FUNCTION__, "double-free of inode"
+				   " %lu's block %llu(bit %u in group %lu)\n",
+				   inode ? inode->i_ino : 0, blocknr, block,
+				   e4b->bd_group);
+		}
+		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
+		e4b->bd_info->bb_counters[order]++;
+
+		/* start of the buddy */
+		buddy = mb_find_buddy(e4b, order, &max);
+
+		/* merge upwards for as long as both halves of a pair are free */
+		do {
+			block &= ~1UL;
+			if (mb_test_bit(block, buddy) ||
+					mb_test_bit(block + 1, buddy))
+				break;
+
+			/* both the buddies are free, try to coalesce them */
+			buddy2 = mb_find_buddy(e4b, order + 1, &max);
+
+			/* no higher order to merge into */
+			if (!buddy2)
+				break;
+
+			if (order > 0) {
+				/* for special purposes, we don't set
+				 * free bits in bitmap */
+				mb_set_bit(block, buddy);
+				mb_set_bit(block + 1, buddy);
+			}
+			/* two chunks of this order become one of the next */
+			e4b->bd_info->bb_counters[order]--;
+			e4b->bd_info->bb_counters[order]--;
+
+			block = block >> 1;
+			order++;
+			e4b->bd_info->bb_counters[order]++;
+
+			mb_clear_bit(block, buddy2);
+			buddy = buddy2;
+		} while (1);
+	}
+	mb_check_buddy(e4b);
+
+	return 0;
+}
+
+/*
+ * Describe in @ex the free extent that starts at bit @block of the group
+ * in @e4b, extending it with the following free buddy chunks while it is
+ * still shorter than @needed.
+ *
+ * @order selects the buddy level @block is tested in; when it is 0 the real
+ * order of the chunk containing @block is looked up first.  The group lock
+ * must be held (enforced with BUG_ON).
+ *
+ * Returns the extent length, which is also stored in ex->fe_len.  If the
+ * bit for @block is set at the given order, @ex is zeroed and 0 is
+ * returned.
+ */
+static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+				int needed, struct ext4_free_extent *ex)
+{
+	int next = block;
+	int max;
+	int ord;
+	void *buddy;
+
+	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	BUG_ON(ex == NULL);
+
+	buddy = mb_find_buddy(e4b, order, &max);
+	BUG_ON(buddy == NULL);
+	BUG_ON(block >= max);
+	if (mb_test_bit(block, buddy)) {
+		/* the starting bit is in use: report an empty extent */
+		ex->fe_len = 0;
+		ex->fe_start = 0;
+		ex->fe_group = 0;
+		return 0;
+	}
+
+	/* FIXME drop order completely ? */
+	if (likely(order == 0)) {
+		/* find actual order */
+		order = mb_find_order_for_block(e4b, block);
+		block = block >> order;
+	}
+
+	/* start with the whole buddy chunk that contains the block */
+	ex->fe_len = 1 << order;
+	ex->fe_start = block << order;
+	ex->fe_group = e4b->bd_group;
+
+	/* calc difference from given start */
+	next = next - ex->fe_start;
+	ex->fe_len -= next;
+	ex->fe_start += next;
+
+	/* append the following free chunks until the request is covered */
+	while (needed > ex->fe_len &&
+	       (buddy = mb_find_buddy(e4b, order, &max))) {
+
+		/* current chunk is the last one at this order */
+		if (block + 1 >= max)
+			break;
+
+		/* first bit right after the current chunk */
+		next = (block + 1) * (1 << order);
+		if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+			break;
+
+		ord = mb_find_order_for_block(e4b, next);
+
+		order = ord;
+		block = next >> order;
+		ex->fe_len += 1 << order;
+	}
+
+	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+	return ex->fe_len;
+}
+
+/*
+ * Mark the extent @ex as allocated in the buddy data and the in-core
+ * bitmap of the group in @e4b, updating the group's free, first-free,
+ * fragment and per-order counters.
+ *
+ * The group lock must be held and @ex must belong to the group of @e4b
+ * (both enforced with BUG_ON).
+ *
+ * Returns 0 if every piece was taken as a whole buddy chunk.  Otherwise
+ * the value recorded at the first split is returned: the length still to
+ * be marked at that point in the low 16 bits and the order of the chunk
+ * being split in the bits above.
+ */
+static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
+{
+	int ord;
+	int mlen = 0;
+	int max = 0;
+	int cur;
+	int start = ex->fe_start;
+	int len = ex->fe_len;
+	unsigned ret = 0;
+	int len0 = len;
+	void *buddy;
+
+	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
+	BUG_ON(e4b->bd_group != ex->fe_group);
+	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	mb_check_buddy(e4b);
+	mb_mark_used_double(e4b, start, len);
+
+	e4b->bd_info->bb_free -= len;
+	if (e4b->bd_info->bb_first_free == start)
+		e4b->bd_info->bb_first_free += len;
+
+	/* let's maintain fragments counter:
+	 * 'mlen' is reused here as "the bit before the extent is free" and
+	 * 'max' as "the bit after the extent is free" */
+	if (start != 0)
+		mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
+		max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+	if (mlen && max)
+		/* taken from the middle of a free run: one run becomes two */
+		e4b->bd_info->bb_fragments++;
+	else if (!mlen && !max)
+		/* a whole free run is consumed */
+		e4b->bd_info->bb_fragments--;
+
+	/* let's maintain buddy itself */
+	while (len) {
+		ord = mb_find_order_for_block(e4b, start);
+
+		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+			/* the whole chunk may be allocated at once! */
+			mlen = 1 << ord;
+			buddy = mb_find_buddy(e4b, ord, &max);
+			BUG_ON((start >> ord) >= max);
+			mb_set_bit(start >> ord, buddy);
+			e4b->bd_info->bb_counters[ord]--;
+			start += mlen;
+			len -= mlen;
+			BUG_ON(len < 0);
+			continue;
+		}
+
+		/* store for history */
+		if (ret == 0)
+			ret = len | (ord << 16);
+
+		/* we have to split large buddy */
+		BUG_ON(ord <= 0);
+		buddy = mb_find_buddy(e4b, ord, &max);
+		mb_set_bit(start >> ord, buddy);
+		e4b->bd_info->bb_counters[ord]--;
+
+		/* free both halves one order down; the loop then retries */
+		ord--;
+		cur = (start >> ord) & ~1U;
+		buddy = mb_find_buddy(e4b, ord, &max);
+		mb_clear_bit(cur, buddy);
+		mb_clear_bit(cur + 1, buddy);
+		e4b->bd_info->bb_counters[ord]++;
+		e4b->bd_info->bb_counters[ord]++;
+	}
+
+	/* finally mark the whole extent in use in the block bitmap */
+	mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
+			EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+	mb_check_buddy(e4b);
+
+	return ret;
+}
+
+/*
+ * Commit the best extent stored in @ac: trim it to the goal length, mark
+ * it used in the buddy of @e4b and flag the context as AC_STATUS_FOUND.
+ *
+ * Must be called under group lock!
+ */
+static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_free_extent *best = &ac->ac_b_ex;
+	int split;
+
+	BUG_ON(best->fe_group != e4b->bd_group);
+	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+	/* never take more than the goal asked for */
+	best->fe_len = min(best->fe_len, ac->ac_g_ex.fe_len);
+	best->fe_logical = ac->ac_g_ex.fe_logical;
+	split = mb_mark_used(e4b, best);
+
+	/* ac_b_ex may still be changed by preallocation, so keep a copy
+	 * of what was actually allocated for the history */
+	ac->ac_f_ex = *best;
+
+	ac->ac_status = AC_STATUS_FOUND;
+	/* mb_mark_used() packs the tail length and the split order */
+	ac->ac_tail = split & 0xffff;
+	ac->ac_buddy = split >> 16;
+
+	/* FIXME: the context takes its own reference on both buddy cache
+	 * pages here; why that is needed is not documented */
+	ac->ac_bitmap_page = e4b->bd_bitmap_page;
+	ac->ac_buddy_page = e4b->bd_buddy_page;
+	get_page(ac->ac_bitmap_page);
+	get_page(ac->ac_buddy_page);
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return;
+
+	/* remember where we allocated for subsequent stream allocation */
+	spin_lock(&sbi->s_md_lock);
+	sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
+	sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/*
+ * regular allocator, for general purposes allocation
+ */
+
+/*
+ * Decide whether scanning should stop: either give up because too many
+ * extents were examined, or settle for the best extent found so far once
+ * it is large enough and enough candidates were seen (or the group is
+ * finished, @finish_group != 0).
+ */
+static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b,
+					int finish_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_free_extent *best = &ac->ac_b_ex;
+	struct ext4_free_extent *goal = &ac->ac_g_ex;
+	struct ext4_free_extent recheck;
+	int avail;
+
+	/* scanned more than allowed: stop, unless "first fit" is requested */
+	if (!(ac->ac_flags & EXT4_MB_HINT_FIRST) &&
+			ac->ac_found > sbi->s_mb_max_to_scan) {
+		ac->ac_status = AC_STATUS_BREAK;
+		return;
+	}
+
+	/* nothing good enough yet, keep scanning */
+	if (best->fe_len < goal->fe_len)
+		return;
+
+	/* not enough candidates seen and the group is not finished */
+	if (!finish_group && ac->ac_found <= sbi->s_mb_min_to_scan)
+		return;
+
+	/* the best extent lives in another group than the one loaded */
+	if (best->fe_group != e4b->bd_group)
+		return;
+
+	/* the chunk may have been found before the group lock was last
+	 * dropped, so make sure it is still available */
+	avail = mb_find_extent(e4b, 0, best->fe_start, goal->fe_len, &recheck);
+	if (avail >= goal->fe_len)
+		ext4_mb_use_best_found(ac, e4b);
+}
+
+/*
+ * Evaluate a freshly found free extent @ex.  If it is good enough it is
+ * marked used right away and the context status stops the scan.
+ * Otherwise it is compared with the best extent remembered so far and
+ * replaces it when it is better; the remembered extent is what gets used
+ * later if nothing good enough turns up.
+ *
+ * FIXME: real allocation policy is to be designed yet!
+ */
+static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
+					struct ext4_free_extent *ex,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent *best = &ac->ac_b_ex;
+	struct ext4_free_extent *goal = &ac->ac_g_ex;
+	int better;
+
+	BUG_ON(ex->fe_len <= 0);
+	BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
+
+	ac->ac_found++;
+
+	/*
+	 * Take the extent immediately when the caller wants the first
+	 * one found, or when it matches the goal length exactly
+	 */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST) ||
+			ex->fe_len == goal->fe_len) {
+		*best = *ex;
+		ext4_mb_use_best_found(ac, e4b);
+		return;
+	}
+
+	/*
+	 * Nothing remembered yet: this one becomes the best by default
+	 */
+	if (best->fe_len == 0) {
+		*best = *ex;
+		return;
+	}
+
+	if (best->fe_len < goal->fe_len) {
+		/* request not satisfied so far: bigger is better */
+		better = ex->fe_len > best->fe_len;
+	} else {
+		/* request already satisfied: prefer an extent that still
+		 * exceeds the goal but is smaller than the current best */
+		better = ex->fe_len > goal->fe_len &&
+				ex->fe_len < best->fe_len;
+	}
+	if (better)
+		*best = *ex;
+
+	ext4_mb_check_limits(ac, e4b, 0);
+}
+
+/*
+ * Re-load the group of the best extent remembered in @ac and, if any part
+ * of that extent is still free, allocate it.  The caller checks
+ * ac->ac_status to learn whether the allocation happened.
+ *
+ * Returns 0, or the error from ext4_mb_load_buddy().
+ */
+static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent ex = ac->ac_b_ex;
+	ext4_group_t group = ex.fe_group;
+	int err;
+
+	BUG_ON(ex.fe_len <= 0);
+
+	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+	if (err != 0)
+		return err;
+
+	ext4_lock_group(ac->ac_sb, group);
+	/* ex is refreshed with whatever is free at that position now */
+	if (mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex) > 0) {
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	}
+	ext4_unlock_group(ac->ac_sb, group);
+
+	ext4_mb_release_desc(e4b);
+	return 0;
+}
+
+/*
+ * Try to allocate exactly at the goal position stored in ac->ac_g_ex.
+ * Does nothing unless EXT4_MB_HINT_TRY_GOAL is set.  The caller checks
+ * ac->ac_status to learn whether an extent was taken.
+ *
+ * Returns 0, or the error from ext4_mb_load_buddy().
+ */
+static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+				struct ext4_buddy *e4b)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_group_t group = ac->ac_g_ex.fe_group;
+	struct ext4_free_extent ex;
+	int take = 0;
+	int max;
+	int err;
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+		return 0;
+
+	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+	if (err)
+		return err;
+
+	ext4_lock_group(ac->ac_sb, group);
+	max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+			     ac->ac_g_ex.fe_len, &ex);
+
+	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+		/* stripe-sized request: only accept a stripe-aligned start */
+		ext4_fsblk_t start;
+
+		start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
+			ex.fe_start + le32_to_cpu(es->s_first_data_block);
+		/* use do_div to get remainder (would be 64-bit modulo) */
+		take = (do_div(start, sbi->s_stripe) == 0);
+	} else if (max >= ac->ac_g_ex.fe_len ||
+		   (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE))) {
+		/* either the goal is fully available, or the caller wants
+		 * to merge even a small number of blocks into an existing
+		 * extent */
+		BUG_ON(ex.fe_len <= 0);
+		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+		take = 1;
+	}
+
+	if (take) {
+		ac->ac_found++;
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	}
+
+	ext4_unlock_group(ac->ac_sb, group);
+	ext4_mb_release_desc(e4b);
+
+	return 0;
+}
+
+/*
+ * Scan the buddy structures (not the bitmap!) from the requested order up
+ * to the maximum order and allocate from the first order that has a free
+ * chunk.
+ */
+static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_group_info *grp = e4b->bd_info;
+	void *buddy;
+	int order;
+	int bit;
+	int max;
+
+	BUG_ON(ac->ac_2order <= 0);
+
+	/* lowest order, starting at the request's, with a free chunk */
+	for (order = ac->ac_2order; order <= sb->s_blocksize_bits + 1; order++)
+		if (grp->bb_counters[order] != 0)
+			break;
+	if (order > sb->s_blocksize_bits + 1)
+		return;
+
+	buddy = mb_find_buddy(e4b, order, &max);
+	BUG_ON(buddy == NULL);
+
+	/* the counter promised a free chunk, so a clear bit must exist */
+	bit = ext4_find_next_zero_bit(buddy, max, 0);
+	BUG_ON(bit >= max);
+
+	ac->ac_found++;
+
+	ac->ac_b_ex.fe_len = 1 << order;
+	ac->ac_b_ex.fe_start = bit << order;
+	ac->ac_b_ex.fe_group = e4b->bd_group;
+
+	ext4_mb_use_best_found(ac, e4b);
+
+	BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+
+	if (EXT4_SB(sb)->s_mb_stats)
+		atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
+}
+
+/*
+ * Walk the block bitmap of the group and measure every free extent found.
+ * The group's free block count bounds the walk: once that many free
+ * blocks have been seen there is nothing left to find.
+ */
+static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	void *bitmap = EXT4_MB_BITMAP(e4b);
+	struct ext4_free_extent ex;
+	int pos;
+	int remaining;
+
+	remaining = e4b->bd_info->bb_free;
+	BUG_ON(remaining <= 0);
+
+	pos = e4b->bd_info->bb_first_free;
+
+	while (remaining && ac->ac_status == AC_STATUS_CONTINUE) {
+		pos = ext4_find_next_zero_bit(bitmap,
+						EXT4_BLOCKS_PER_GROUP(sb), pos);
+		if (pos >= EXT4_BLOCKS_PER_GROUP(sb)) {
+			/* ran off the bitmap while free blocks were still
+			 * expected */
+			BUG_ON(remaining != 0);
+			break;
+		}
+
+		mb_find_extent(e4b, 0, pos, ac->ac_g_ex.fe_len, &ex);
+		BUG_ON(ex.fe_len <= 0);
+		BUG_ON(remaining < ex.fe_len);
+
+		ext4_mb_measure_extent(ac, &ex, e4b);
+
+		/* continue right after the extent just measured */
+		pos += ex.fe_len;
+		remaining -= ex.fe_len;
+	}
+
+	ext4_mb_check_limits(ac, e4b, 1);
+}
+
+/*
+ * Special case for storage such as raid5: for stripe-sized requests look
+ * only at stripe-aligned positions and take the first one that has a full
+ * stripe free.
+ * XXX should do so at least for multiples of stripe size as well
+ */
+static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+				 struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	void *bitmap = EXT4_MB_BITMAP(e4b);
+	struct ext4_free_extent ex;
+	ext4_fsblk_t group_start;
+	ext4_fsblk_t aligned;
+	ext4_grpblk_t bit;
+	int got;
+
+	BUG_ON(sbi->s_stripe == 0);
+
+	/* round the group's first block up to a stripe boundary and turn
+	 * the result back into a bit offset inside the group */
+	group_start = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
+		+ le32_to_cpu(sbi->s_es->s_first_data_block);
+	aligned = group_start + sbi->s_stripe - 1;
+	do_div(aligned, sbi->s_stripe);
+	bit = (aligned * sbi->s_stripe) - group_start;
+
+	for (; bit < EXT4_BLOCKS_PER_GROUP(sb); bit += sbi->s_stripe) {
+		if (mb_test_bit(bit, bitmap))
+			continue;
+
+		got = mb_find_extent(e4b, 0, bit, sbi->s_stripe, &ex);
+		if (got < sbi->s_stripe)
+			continue;
+
+		ac->ac_found++;
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+		break;
+	}
+}
+
+/*
+ * Tell whether @group is worth scanning under criterion @cr (0..3):
+ *   0 - a free buddy chunk of at least the requested order exists and the
+ *       group's block bitmap is not flagged uninitialized
+ *   1 - the average free fragment is at least as long as the goal
+ *   2 - the group has at least as many free blocks as the goal
+ *   3 - any group with free blocks will do
+ * Groups without free blocks or fragments are never good.
+ *
+ * Returns 1 if the group qualifies, 0 otherwise.
+ */
+static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+				ext4_group_t group, int cr)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+	struct ext4_group_desc *desc;
+	unsigned free, fragments;
+	unsigned order, max_order;
+
+	BUG_ON(cr < 0 || cr >= 4);
+	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	free = grp->bb_free;
+	fragments = grp->bb_fragments;
+	if (free == 0 || fragments == 0)
+		return 0;
+
+	switch (cr) {
+	case 0:
+		BUG_ON(ac->ac_2order == 0);
+		/* If this group is uninitialized, skip it initially */
+		desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
+		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
+			return 0;
+
+		max_order = ac->ac_sb->s_blocksize_bits + 1;
+		for (order = ac->ac_2order; order <= max_order; order++)
+			if (grp->bb_counters[order] > 0)
+				return 1;
+		return 0;
+	case 1:
+		return (free / fragments) >= ac->ac_g_ex.fe_len;
+	case 2:
+		return free >= ac->ac_g_ex.fe_len;
+	case 3:
+		return 1;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Main block search: try the goal first, then scan the groups with
+ * progressively weaker criteria (cr 0..3) until an extent is allocated.
+ * If scanning stops without an allocation but a best candidate was
+ * remembered, that candidate is tried; if it is gone, the scan restarts at
+ * cr 3 with EXT4_MB_HINT_FIRST set so the first free extent is taken.
+ *
+ * The outcome is reported through ac->ac_status and ac->ac_b_ex.
+ * Returns 0, or an error from ext4_mb_find_by_goal() or
+ * ext4_mb_load_buddy().
+ */
+static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+{
+	ext4_group_t group;
+	ext4_group_t i;
+	int cr;
+	int err = 0;
+	int bsbits;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	struct ext4_buddy e4b;
+	loff_t size, isize;
+
+	sb = ac->ac_sb;
+	sbi = EXT4_SB(sb);
+	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+	/* first, try the goal */
+	err = ext4_mb_find_by_goal(ac, &e4b);
+	if (err || ac->ac_status == AC_STATUS_FOUND)
+		goto out;
+
+	/* the caller accepts nothing but the goal */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		goto out;
+
+	/*
+	 * ac->ac2_order is set only if the fe_len is a power of 2
+	 * if ac2_order is set we also set criteria to 0 so that we
+	 * try exact allocation using buddy.
+	 */
+	i = fls(ac->ac_g_ex.fe_len);
+	ac->ac_2order = 0;
+	/*
+	 * We search using buddy data only if the order of the request
+	 * is greater than equal to the sbi_s_mb_order2_reqs
+	 * You can tune it via /proc/fs/ext4/<partition>/order2_req
+	 */
+	if (i >= sbi->s_mb_order2_reqs) {
+		/*
+		 * This should tell if fe_len is exactly power of 2
+		 */
+		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+			ac->ac_2order = i - 1;
+	}
+
+	bsbits = ac->ac_sb->s_blocksize_bits;
+	/* if stream allocation is enabled, use global goal */
+	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+	isize = i_size_read(ac->ac_inode) >> bsbits;
+	if (size < isize)
+		size = isize;
+
+	if (size < sbi->s_mb_stream_request &&
+			(ac->ac_flags & EXT4_MB_HINT_DATA)) {
+		/* TBD: may be hot point */
+		spin_lock(&sbi->s_md_lock);
+		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
+		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
+		spin_unlock(&sbi->s_md_lock);
+	}
+
+	/* searching for the right group start from the goal value specified */
+	group = ac->ac_g_ex.fe_group;
+
+	/* Let's just scan groups to find more-less suitable blocks */
+	cr = ac->ac_2order ? 0 : 1;
+	/*
+	 * cr == 0 try to get exact allocation,
+	 * cr == 3  try to get anything
+	 */
+repeat:
+	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+		ac->ac_criteria = cr;
+		/* visit every group once, wrapping around after the last */
+		for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+			struct ext4_group_info *grp;
+			struct ext4_group_desc *desc;
+
+			if (group == EXT4_SB(sb)->s_groups_count)
+				group = 0;
+
+			/* quick check to skip empty groups */
+			grp = ext4_get_group_info(ac->ac_sb, group);
+			if (grp->bb_free == 0)
+				continue;
+
+			/*
+			 * if the group is already init we check whether it is
+			 * a good group and if not we don't load the buddy
+			 */
+			if (EXT4_MB_GRP_NEED_INIT(grp)) {
+				/*
+				 * we need full data about the group
+				 * to make a good selection
+				 */
+				err = ext4_mb_load_buddy(sb, group, &e4b);
+				if (err)
+					goto out;
+				ext4_mb_release_desc(&e4b);
+			}
+
+			/*
+			 * If the particular group doesn't satisfy our
+			 * criteria we continue with the next group
+			 */
+			if (!ext4_mb_good_group(ac, group, cr))
+				continue;
+
+			err = ext4_mb_load_buddy(sb, group, &e4b);
+			if (err)
+				goto out;
+
+			/* check again, this time with the group locked */
+			ext4_lock_group(sb, group);
+			if (!ext4_mb_good_group(ac, group, cr)) {
+				/* someone did allocation from this group */
+				ext4_unlock_group(sb, group);
+				ext4_mb_release_desc(&e4b);
+				continue;
+			}
+
+			ac->ac_groups_scanned++;
+			desc = ext4_get_group_desc(sb, group, NULL);
+			/* pick the scanner: buddy-only, stripe-aligned, or
+			 * the full bitmap walk */
+			if (cr == 0 || (desc->bg_flags &
+					cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
+					ac->ac_2order != 0))
+				ext4_mb_simple_scan_group(ac, &e4b);
+			else if (cr == 1 &&
+					ac->ac_g_ex.fe_len == sbi->s_stripe)
+				ext4_mb_scan_aligned(ac, &e4b);
+			else
+				ext4_mb_complex_scan_group(ac, &e4b);
+
+			ext4_unlock_group(sb, group);
+			ext4_mb_release_desc(&e4b);
+
+			if (ac->ac_status != AC_STATUS_CONTINUE)
+				break;
+		}
+	}
+
+	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+		/*
+		 * We've been searching too long. Let's try to allocate
+		 * the best chunk we've found so far
+		 */
+
+		/* NOTE(review): the return value is ignored here; only
+		 * ac_status is checked below */
+		ext4_mb_try_best_found(ac, &e4b);
+		if (ac->ac_status != AC_STATUS_FOUND) {
+			/*
+			 * Someone more lucky has already allocated it.
+			 * The only thing we can do is just take first
+			 * found block(s)
+			printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
+			 */
+			ac->ac_b_ex.fe_group = 0;
+			ac->ac_b_ex.fe_start = 0;
+			ac->ac_b_ex.fe_len = 0;
+			ac->ac_status = AC_STATUS_CONTINUE;
+			ac->ac_flags |= EXT4_MB_HINT_FIRST;
+			cr = 3;
+			atomic_inc(&sbi->s_mb_lost_chunks);
+			goto repeat;
+		}
+	}
+out:
+	return err;
+}
+
+#ifdef EXT4_MB_HISTORY
+/*
+ * State of one open of the allocation history proc file: a private
+ * snapshot of the history ring taken at open time.
+ */
+struct ext4_mb_proc_session {
+	struct ext4_mb_history *history;	/* copy of sbi->s_mb_history */
+	struct super_block *sb;		/* filesystem the file belongs to */
+	int start;	/* index iteration starts from: s_mb_history_cur % max */
+	int max;	/* number of entries in history[] */
+};
+
+/*
+ * Starting at @hs, return the next history entry whose original request
+ * length is non-zero, wrapping around the end of the ring.  Returns NULL
+ * once the walk gets back to the session's start index; @first != 0 allows
+ * @hs itself to be that start entry.
+ */
+static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
+					struct ext4_mb_history *hs,
+					int first)
+{
+	struct ext4_mb_history *base = s->history;
+	struct ext4_mb_history *limit = s->history + s->max;
+	struct ext4_mb_history *origin = s->history + s->start;
+
+	if (hs == limit)
+		hs = base;
+	if (hs == origin && !first)
+		return NULL;
+
+	for (;;) {
+		if (hs->orig.fe_len != 0)
+			return hs;
+
+		/* unused slot: advance, wrapping at the end of the ring */
+		if (++hs == limit)
+			hs = base;
+		if (hs == origin)
+			return NULL;
+	}
+}
+
+/*
+ * seq_file ->start: position 0 is the header token; position N (N > 0) is
+ * the N-th non-empty history entry counted from the session start.
+ * Returns NULL when there are fewer entries than that.
+ */
+static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ext4_mb_proc_session *s = seq->private;
+	struct ext4_mb_history *hs;
+	int skip = *pos;
+
+	if (skip == 0)
+		return SEQ_START_TOKEN;
+
+	hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
+	/* step forward until the requested position or the end is reached */
+	while (hs != NULL && --skip != 0)
+		hs = ext4_mb_history_skip_empty(s, hs + 1, 0);
+
+	return hs;
+}
+
+/*
+ * seq_file ->next: advance *pos and return the non-empty history entry
+ * that follows @v, or NULL at the end.  After the header token the walk
+ * begins at the session's start index.
+ */
+static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
+				      loff_t *pos)
+{
+	struct ext4_mb_proc_session *s = seq->private;
+	struct ext4_mb_history *from;
+	int first;
+
+	(*pos)++;
+
+	if (v == SEQ_START_TOKEN) {
+		from = s->history + s->start;
+		first = 1;
+	} else {
+		from = (struct ext4_mb_history *)v + 1;
+		first = 0;
+	}
+
+	return ext4_mb_history_skip_empty(s, from, first);
+}
+
+/*
+ * seq_file ->show: print the column header for the start token, otherwise
+ * one line describing the history entry @v according to its operation
+ * type (alloc, prealloc, discard or free).  Entries with any other
+ * operation type print nothing.  Always returns 0.
+ *
+ * The extent descriptions are formatted into small stack buffers with
+ * snprintf(), so an unusually long "group/start/len@logical" string is
+ * truncated instead of overrunning the buffer.
+ */
+static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
+{
+	char buf[25], buf2[25], buf3[25], *fmt;
+	struct ext4_mb_history *hs = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
+				"%-5s %-2s %-5s %-5s %-5s %-6s\n",
+			  "pid", "inode", "original", "goal", "result", "found",
+			   "grps", "cr", "flags", "merge", "tail", "broken");
+		return 0;
+	}
+
+	if (hs->op == EXT4_MB_HISTORY_ALLOC) {
+		fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
+			"%-5u %-5s %-5u %-6u\n";
+		snprintf(buf2, sizeof(buf2), "%lu/%d/%u@%u",
+			hs->result.fe_group,
+			hs->result.fe_start, hs->result.fe_len,
+			hs->result.fe_logical);
+		snprintf(buf, sizeof(buf), "%lu/%d/%u@%u",
+			hs->orig.fe_group,
+			hs->orig.fe_start, hs->orig.fe_len,
+			hs->orig.fe_logical);
+		snprintf(buf3, sizeof(buf3), "%lu/%d/%u@%u",
+			hs->goal.fe_group,
+			hs->goal.fe_start, hs->goal.fe_len,
+			hs->goal.fe_logical);
+		seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
+				hs->found, hs->groups, hs->cr, hs->flags,
+				hs->merged ? "M" : "", hs->tail,
+				hs->buddy ? 1 << hs->buddy : 0);
+	} else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
+		fmt = "%-5u %-8u %-23s %-23s %-23s\n";
+		snprintf(buf2, sizeof(buf2), "%lu/%d/%u@%u",
+			hs->result.fe_group,
+			hs->result.fe_start, hs->result.fe_len,
+			hs->result.fe_logical);
+		snprintf(buf, sizeof(buf), "%lu/%d/%u@%u",
+			hs->orig.fe_group,
+			hs->orig.fe_start, hs->orig.fe_len,
+			hs->orig.fe_logical);
+		/* preallocation entries have no goal column */
+		seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
+	} else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
+		snprintf(buf2, sizeof(buf2), "%lu/%d/%u",
+			hs->result.fe_group,
+			hs->result.fe_start, hs->result.fe_len);
+		seq_printf(seq, "%-5u %-8u %-23s discard\n",
+				hs->pid, hs->ino, buf2);
+	} else if (hs->op == EXT4_MB_HISTORY_FREE) {
+		snprintf(buf2, sizeof(buf2), "%lu/%d/%u",
+			hs->result.fe_group,
+			hs->result.fe_start, hs->result.fe_len);
+		seq_printf(seq, "%-5u %-8u %-23s free\n",
+				hs->pid, hs->ino, buf2);
+	}
+	return 0;
+}
+
+/* seq_file ->stop: intentionally empty, this function does no work. */
+static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
+{
+}
+
+/* seq_file iterator for the allocation history, installed by
+ * ext4_mb_seq_history_open() */
+static struct seq_operations ext4_mb_seq_history_ops = {
+	.start  = ext4_mb_seq_history_start,
+	.next   = ext4_mb_seq_history_next,
+	.stop   = ext4_mb_seq_history_stop,
+	.show   = ext4_mb_seq_history_show,
+};
+
+/*
+ * open() for the history proc file: take a private snapshot of the
+ * filesystem's allocation history under s_mb_history_lock and attach it to
+ * a new seq_file.
+ *
+ * Returns 0 on success, -ENOMEM if the session or its history copy cannot
+ * be allocated, or the error from seq_open().
+ */
+static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = PDE(inode)->data;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_mb_proc_session *s;
+	struct seq_file *m;
+	int size;
+	int rc;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL)
+		return -ENOMEM;
+	s->sb = sb;
+
+	size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
+	s->history = kmalloc(size, GFP_KERNEL);
+	if (s->history == NULL) {
+		rc = -ENOMEM;
+		goto free_session;
+	}
+
+	/* copy the ring while holding its lock so the snapshot is coherent */
+	spin_lock(&sbi->s_mb_history_lock);
+	memcpy(s->history, sbi->s_mb_history, size);
+	s->max = sbi->s_mb_history_max;
+	s->start = sbi->s_mb_history_cur % s->max;
+	spin_unlock(&sbi->s_mb_history_lock);
+
+	rc = seq_open(file, &ext4_mb_seq_history_ops);
+	if (rc != 0)
+		goto free_history;
+
+	/* hand the snapshot to the seq_file; release() frees it */
+	m = (struct seq_file *)file->private_data;
+	m->private = s;
+	return rc;
+
+free_history:
+	kfree(s->history);
+free_session:
+	kfree(s);
+	return rc;
+}
+
+/*
+ * Release handler for "mb_history": free the snapshot stored in
+ * seq_file->private, then let seq_release() finish the teardown.
+ */
+static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ext4_mb_proc_session *session = m->private;
+
+	kfree(session->history);
+	kfree(session);
+
+	return seq_release(inode, file);
+}
+
+/*
+ * Write handler for "mb_history": parse an integer from user space and
+ * store it in s_mb_history_filter.
+ *
+ * Returns @count on success, -EOVERFLOW if the input does not fit in the
+ * local buffer, -EFAULT if it cannot be copied from user space, and
+ * -ERANGE if the parsed value is negative.
+ */
+static ssize_t ext4_mb_seq_history_write(struct file *file,
+				const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct ext4_mb_proc_session *s = seq->private;
+	struct super_block *sb = s->sb;
+	char str[32];
+	int value;
+
+	if (count >= sizeof(str)) {
+		printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
+				"mb_history", (int)sizeof(str));
+		return -EOVERFLOW;
+	}
+
+	if (copy_from_user(str, buffer, count))
+		return -EFAULT;
+	/*
+	 * The user data carries no terminator; count < sizeof(str) was
+	 * checked above, so this store is in bounds.
+	 */
+	str[count] = '\0';
+
+	value = simple_strtol(str, NULL, 0);
+	if (value < 0)
+		return -ERANGE;
+	EXT4_SB(sb)->s_mb_history_filter = value;
+
+	return count;
+}
+
+/*
+ * File operations for the "mb_history" proc file: seq_file based reads
+ * plus a write handler that updates the history filter.
+ */
+static struct file_operations ext4_mb_seq_history_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_mb_seq_history_open,
+	.read		= seq_read,
+	.write		= ext4_mb_seq_history_write,
+	.llseek		= seq_lseek,
+	.release	= ext4_mb_seq_history_release,
+};
+
+/*
+ * seq_file ->start for "mb_groups".  Returns NULL when *pos is outside
+ * [0, s_groups_count); otherwise returns the position plus one cast to a
+ * pointer (the ->show callback subtracts the one again).
+ */
+static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t cookie;
+
+	if (*pos < 0 || *pos >= EXT4_SB(sb)->s_groups_count)
+		return NULL;
+
+	cookie = *pos + 1;
+	return (void *) cookie;
+}
+
+/*
+ * seq_file ->next for "mb_groups": advance *pos by one and return the
+ * new position plus one as an opaque pointer, or NULL once *pos leaves
+ * [0, s_groups_count).
+ */
+static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t cookie;
+
+	*pos += 1;
+	if (*pos < 0 || *pos >= EXT4_SB(sb)->s_groups_count)
+		return NULL;
+
+	cookie = *pos + 1;
+	return (void *) cookie;
+}
+
+/*
+ * seq_file ->show for "mb_groups": print one line of buddy statistics
+ * for the group encoded in @v (group number + 1), preceded by a header
+ * line when the group is 0.  A group whose buddy cannot be loaded is
+ * reported as "I/O error".  Always returns 0.
+ */
+static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = seq->private;
+	long group = (long) v;
+	int i;
+	int err;
+	struct ext4_buddy e4b;
+	struct sg {
+		struct ext4_group_info info;
+		unsigned short counters[16];
+	} sg;
+
+	group--;
+	if (group == 0)
+		seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
+				"[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+				  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+			   "group", "free", "frags", "first",
+			   "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+			   "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+		sizeof(struct ext4_group_info);
+	/*
+	 * The snapshot only has room for 16 counters; with large block
+	 * sizes the computed length could exceed it, so never copy more
+	 * than the on-stack structure can hold.  Only orders 0..13 are
+	 * printed below, so nothing that is displayed gets truncated.
+	 */
+	if (i > (int) sizeof(sg))
+		i = sizeof(sg);
+	err = ext4_mb_load_buddy(sb, group, &e4b);
+	if (err) {
+		seq_printf(seq, "#%-5lu: I/O error\n", group);
+		return 0;
+	}
+	/* copy the group info under the group lock for a consistent view */
+	ext4_lock_group(sb, group);
+	memcpy(&sg, ext4_get_group_info(sb, group), i);
+	ext4_unlock_group(sb, group);
+	ext4_mb_release_desc(&e4b);
+
+	seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+			sg.info.bb_fragments, sg.info.bb_first_free);
+	for (i = 0; i <= 13; i++)
+		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+				sg.info.bb_counters[i] : 0);
+	seq_printf(seq, " ]\n");
+
+	return 0;
+}
+
+static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
+{
+	/* intentionally empty: start/next acquire nothing to release */
+}
+
+/* seq_file iterator callbacks used by the "mb_groups" proc file */
+static struct seq_operations ext4_mb_seq_groups_ops = {
+	.start  = ext4_mb_seq_groups_start,
+	.next   = ext4_mb_seq_groups_next,
+	.stop   = ext4_mb_seq_groups_stop,
+	.show   = ext4_mb_seq_groups_show,
+};
+
+/*
+ * Open handler for "mb_groups": set up the seq_file and store the
+ * superblock taken from the proc entry in seq_file->private.
+ * Returns 0 on success or the error from seq_open().
+ */
+static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = PDE(inode)->data;
+	struct seq_file *m;
+	int rc;
+
+	rc = seq_open(file, &ext4_mb_seq_groups_ops);
+	if (rc != 0)
+		return rc;
+
+	m = (struct seq_file *)file->private_data;
+	m->private = sb;
+	return 0;
+}
+
+/* File operations for the read-only, seq_file based "mb_groups" proc file */
+static struct file_operations ext4_mb_seq_groups_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_mb_seq_groups_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Undo ext4_mb_history_init(): remove the "mb_groups" and "mb_history"
+ * proc entries and free the history ring.
+ */
+static void ext4_mb_history_release(struct super_block *sb)
+{
+	struct ext4_sb_info *info = EXT4_SB(sb);
+
+	remove_proc_entry("mb_groups", info->s_mb_proc);
+	remove_proc_entry("mb_history", info->s_mb_proc);
+
+	kfree(info->s_mb_history);
+}
+
+/*
+ * Register the "mb_history" and "mb_groups" proc files (when the per
+ * device proc directory exists) and allocate a zeroed history ring of
+ * 1000 entries.  Allocation failure is tolerated: s_mb_history is left
+ * NULL and history recording is simply not used.
+ */
+static void ext4_mb_history_init(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct proc_dir_entry *entry;
+	int bytes;
+
+	if (sbi->s_mb_proc != NULL) {
+		entry = create_proc_entry("mb_history", S_IRUGO,
+					  sbi->s_mb_proc);
+		if (entry) {
+			entry->proc_fops = &ext4_mb_seq_history_fops;
+			entry->data = sb;
+		}
+		entry = create_proc_entry("mb_groups", S_IRUGO,
+					  sbi->s_mb_proc);
+		if (entry) {
+			entry->proc_fops = &ext4_mb_seq_groups_fops;
+			entry->data = sb;
+		}
+	}
+
+	sbi->s_mb_history_max = 1000;
+	sbi->s_mb_history_cur = 0;
+	spin_lock_init(&sbi->s_mb_history_lock);
+	bytes = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
+	/* if we can't allocate history, then we simply won't use it */
+	sbi->s_mb_history = kzalloc(bytes, GFP_KERNEL);
+}
+
+/*
+ * Append a record describing allocation context @ac to the history ring.
+ * Does nothing when the ring was not allocated or when ac->ac_op is not
+ * selected by s_mb_history_filter.  The ring slot is written under
+ * s_mb_history_lock and the cursor wraps to 0 at s_mb_history_max.
+ */
+static void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_mb_history rec;
+
+	if (unlikely(sbi->s_mb_history == NULL) ||
+	    !(ac->ac_op & sbi->s_mb_history_filter))
+		return;
+
+	rec.op = ac->ac_op;
+	rec.pid = current->pid;
+	rec.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
+	rec.orig = ac->ac_o_ex;
+	rec.flags = ac->ac_flags;
+	rec.found = ac->ac_found;
+	rec.groups = ac->ac_groups_scanned;
+	rec.cr = ac->ac_criteria;
+	rec.tail = ac->ac_tail;
+	rec.buddy = ac->ac_buddy;
+	rec.merged = 0;
+
+	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
+		/* allocations also record the goal and the final extent */
+		rec.goal = ac->ac_g_ex;
+		rec.result = ac->ac_f_ex;
+		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+		    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+			rec.merged = 1;
+	} else {
+		rec.result = ac->ac_b_ex;
+	}
+
+	spin_lock(&sbi->s_mb_history_lock);
+	memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &rec, sizeof(rec));
+	sbi->s_mb_history_cur++;
+	if (sbi->s_mb_history_cur >= sbi->s_mb_history_max)
+		sbi->s_mb_history_cur = 0;
+	spin_unlock(&sbi->s_mb_history_lock);
+}
+
+#else
+#define ext4_mb_history_release(sb)
+#define ext4_mb_history_init(sb)
+#endif
+
+/*
+ * Allocate the in-core buddy bookkeeping for @sb: the two-level
+ * s_group_info table, one zeroed ext4_group_info per block group, and
+ * the s_buddy_cache inode.
+ *
+ * Returns 0 on success or -ENOMEM on any failure, in which case
+ * everything allocated here has been released again.
+ */
+static int ext4_mb_init_backend(struct super_block *sb)
+{
+	ext4_group_t i;
+	int j, len, metalen;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int num_meta_group_infos =
+		(sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+			EXT4_DESC_PER_BLOCK_BITS(sb);
+	struct ext4_group_info **meta_group_info;
+
+	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
+	 * So a two level scheme suffices for now. */
+	sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
+				    num_meta_group_infos, GFP_KERNEL);
+	if (sbi->s_group_info == NULL) {
+		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
+		return -ENOMEM;
+	}
+	sbi->s_buddy_cache = new_inode(sb);
+	if (sbi->s_buddy_cache == NULL) {
+		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+		goto err_freesgi;
+	}
+	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+
+	metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
+	for (i = 0; i < num_meta_group_infos; i++) {
+		/* the last second-level array may be only partially used */
+		if ((i + 1) == num_meta_group_infos)
+			metalen = sizeof(*meta_group_info) *
+				(sbi->s_groups_count -
+					(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
+		meta_group_info = kmalloc(metalen, GFP_KERNEL);
+		if (meta_group_info == NULL) {
+			printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+			       "buddy group\n");
+			goto err_freemeta;
+		}
+		sbi->s_group_info[i] = meta_group_info;
+	}
+
+	/*
+	 * calculate needed size. if change bb_counters size,
+	 * don't forget about ext4_mb_generate_buddy()
+	 */
+	len = sizeof(struct ext4_group_info);
+	len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		struct ext4_group_desc *desc;
+
+		meta_group_info =
+			sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+		j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+		meta_group_info[j] = kzalloc(len, GFP_KERNEL);
+		if (meta_group_info[j] == NULL) {
+			printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+			/* groups [0, i) are released at err_freebuddy */
+			goto err_freebuddy;
+		}
+		desc = ext4_get_group_desc(sb, i, NULL);
+		if (desc == NULL) {
+			printk(KERN_ERR
+				"EXT4-fs: can't read descriptor %lu\n", i);
+			/* release group i here; err_freebuddy does [0, i) */
+			kfree(meta_group_info[j]);
+			goto err_freebuddy;
+		}
+		/* kzalloc() already zeroed the structure */
+		set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+			&(meta_group_info[j]->bb_state));
+
+		/*
+		 * initialize bb_free to be able to skip
+		 * empty groups without initialization
+		 */
+		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+			meta_group_info[j]->bb_free =
+				ext4_free_blocks_after_init(sb, i, desc);
+		} else {
+			meta_group_info[j]->bb_free =
+				le16_to_cpu(desc->bg_free_blocks_count);
+		}
+
+		INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+		{
+			struct buffer_head *bh;
+			meta_group_info[j]->bb_bitmap =
+				kmalloc(sb->s_blocksize, GFP_KERNEL);
+			BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
+			bh = read_block_bitmap(sb, i);
+			BUG_ON(bh == NULL);
+			memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
+					sb->s_blocksize);
+			put_bh(bh);
+		}
+#endif
+
+	}
+
+	return 0;
+
+err_freebuddy:
+	/*
+	 * Free groups [0, i).  The post-decrement test terminates even
+	 * when ext4_group_t is an unsigned type, where "i >= 0" would be
+	 * always true and the index would wrap around.
+	 */
+	while (i-- > 0)
+		kfree(ext4_get_group_info(sb, i));
+	i = num_meta_group_infos;
+err_freemeta:
+	/* free second-level arrays [0, i) */
+	while (i-- > 0)
+		kfree(sbi->s_group_info[i]);
+	iput(sbi->s_buddy_cache);
+err_freesgi:
+	kfree(sbi->s_group_info);
+	return -ENOMEM;
+}
+
+/*
+ * Set up the multiblock allocator for @sb.
+ *
+ * Does nothing (returns 0) when the MBALLOC mount option is off.
+ * Otherwise builds the per-order offset/max tables, initializes the
+ * buddy backend, default tunables, per-CPU locality groups and the proc
+ * interface.
+ *
+ * Returns 0 on success or a negative errno; on failure the MBALLOC
+ * option is cleared.
+ */
+int ext4_mb_init(struct super_block *sb, int needs_recovery)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned i;
+	unsigned offset;
+	unsigned max;
+
+	if (!test_opt(sb, MBALLOC))
+		return 0;
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+
+	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_offsets == NULL) {
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		return -ENOMEM;
+	}
+	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_maxs == NULL) {
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		/* s_mb_maxs is NULL here; the table to release is offsets */
+		kfree(sbi->s_mb_offsets);
+		return -ENOMEM;
+	}
+
+	/* order 0 is regular bitmap */
+	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+	sbi->s_mb_offsets[0] = 0;
+
+	/* orders 1..blocksize_bits+1: each holds half as many bits */
+	i = 1;
+	offset = 0;
+	max = sb->s_blocksize << 2;
+	do {
+		sbi->s_mb_offsets[i] = offset;
+		sbi->s_mb_maxs[i] = max;
+		offset += 1 << (sb->s_blocksize_bits - i);
+		max = max >> 1;
+		i++;
+	} while (i <= sb->s_blocksize_bits + 1);
+
+	/* init file for buddy data */
+	i = ext4_mb_init_backend(sb);
+	if (i) {
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+		return i;
+	}
+
+	spin_lock_init(&sbi->s_md_lock);
+	INIT_LIST_HEAD(&sbi->s_active_transaction);
+	INIT_LIST_HEAD(&sbi->s_closed_transaction);
+	INIT_LIST_HEAD(&sbi->s_committed_transaction);
+	spin_lock_init(&sbi->s_bal_lock);
+
+	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+	sbi->s_mb_stats = MB_DEFAULT_STATS;
+	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
+	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+
+	i = sizeof(struct ext4_locality_group) * NR_CPUS;
+	sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_locality_groups == NULL) {
+		/*
+		 * NOTE(review): the structures set up by
+		 * ext4_mb_init_backend() are not released on this path -
+		 * confirm whether a backend teardown is needed here.
+		 */
+		clear_opt(sbi->s_mount_opt, MBALLOC);
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+		return -ENOMEM;
+	}
+	for (i = 0; i < NR_CPUS; i++) {
+		struct ext4_locality_group *lg;
+		lg = &sbi->s_locality_groups[i];
+		mutex_init(&lg->lg_mutex);
+		INIT_LIST_HEAD(&lg->lg_prealloc_list);
+		spin_lock_init(&lg->lg_prealloc_lock);
+	}
+
+	ext4_mb_init_per_dev_proc(sb);
+	ext4_mb_history_init(sb);
+
+	printk("EXT4-fs: mballoc enabled\n");
+	return 0;
+}
+
+/*
+ * Unlink and free every preallocation still attached to @grp and report
+ * how many were left via mb_debug().
+ *
+ * Needs to be called with the ext4 group lock held (ext4_lock_group).
+ */
+static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+{
+	struct ext4_prealloc_space *pa;
+	struct list_head *cur, *tmp;
+	int count = 0;
+
+	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+		list_del(&pa->pa_group_list);
+		count++;
+		/*
+		 * ext4_prealloc_space objects have a dedicated slab cache
+		 * (ext4_pspace_cachep, created in init_ext4_mballoc());
+		 * hand them back to it instead of using kfree().
+		 */
+		kmem_cache_free(ext4_pspace_cachep, pa);
+	}
+	if (count)
+		mb_debug("mballoc: %u PAs left\n", count);
+
+}
+
+/*
+ * Tear down the multiblock allocator state of @sb.
+ *
+ * Does nothing when the MBALLOC mount option is off.  Otherwise moves
+ * all pending freed-block records to the committed list and releases
+ * them, frees the per-group info, the offset/max tables, the buddy cache
+ * inode and the locality groups, prints statistics when s_mb_stats is
+ * set, and removes the proc interface.  Always returns 0.
+ */
+int ext4_mb_release(struct super_block *sb)
+{
+	ext4_group_t i;
+	int num_meta_group_infos;
+	struct ext4_group_info *grinfo;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!test_opt(sb, MBALLOC))
+		return 0;
+
+	/* release freed, non-committed blocks */
+	spin_lock(&sbi->s_md_lock);
+	list_splice_init(&sbi->s_closed_transaction,
+			&sbi->s_committed_transaction);
+	list_splice_init(&sbi->s_active_transaction,
+			&sbi->s_committed_transaction);
+	spin_unlock(&sbi->s_md_lock);
+	ext4_mb_free_committed_blocks(sb);
+
+	if (sbi->s_group_info) {
+		for (i = 0; i < sbi->s_groups_count; i++) {
+			grinfo = ext4_get_group_info(sb, i);
+#ifdef DOUBLE_CHECK
+			kfree(grinfo->bb_bitmap);
+#endif
+			ext4_lock_group(sb, i);
+			ext4_mb_cleanup_pa(grinfo);
+			ext4_unlock_group(sb, i);
+			kfree(grinfo);
+		}
+		num_meta_group_infos = (sbi->s_groups_count +
+				EXT4_DESC_PER_BLOCK(sb) - 1) >>
+			EXT4_DESC_PER_BLOCK_BITS(sb);
+		for (i = 0; i < num_meta_group_infos; i++)
+			kfree(sbi->s_group_info[i]);
+		kfree(sbi->s_group_info);
+	}
+	kfree(sbi->s_mb_offsets);
+	kfree(sbi->s_mb_maxs);
+	/* iput() accepts NULL, so no explicit check is needed */
+	iput(sbi->s_buddy_cache);
+	if (sbi->s_mb_stats) {
+		printk(KERN_INFO
+		       "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
+				atomic_read(&sbi->s_bal_allocated),
+				atomic_read(&sbi->s_bal_reqs),
+				atomic_read(&sbi->s_bal_success));
+		printk(KERN_INFO
+		      "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
+				"%u 2^N hits, %u breaks, %u lost\n",
+				atomic_read(&sbi->s_bal_ex_scanned),
+				atomic_read(&sbi->s_bal_goals),
+				atomic_read(&sbi->s_bal_2orders),
+				atomic_read(&sbi->s_bal_breaks),
+				atomic_read(&sbi->s_mb_lost_chunks));
+		/* reporting must not modify the counter it prints */
+		printk(KERN_INFO
+		       "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
+				sbi->s_mb_buddies_generated,
+				sbi->s_mb_generation_time);
+		printk(KERN_INFO
+		       "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
+				atomic_read(&sbi->s_mb_preallocated),
+				atomic_read(&sbi->s_mb_discarded));
+	}
+
+	kfree(sbi->s_locality_groups);
+
+	ext4_mb_history_release(sb);
+	ext4_mb_destroy_per_dev_proc(sb);
+
+	return 0;
+}
+
+/*
+ * Release every record queued on sbi->s_committed_transaction.
+ *
+ * Records are detached one at a time under s_md_lock.  For each record
+ * the group's buddy is loaded, every listed block is passed to
+ * mb_free_blocks() under the group lock, the extra page references are
+ * dropped and the record is freed.  A failure to load the buddy or to
+ * free a block is treated as fatal (BUG_ON).
+ */
+static void ext4_mb_free_committed_blocks(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int err;
+	int i;
+	int count = 0;
+	int count2 = 0;
+	struct ext4_free_metadata *md;
+	struct ext4_buddy e4b;
+
+	/* unlocked fast path; the list is re-checked under the lock below */
+	if (list_empty(&sbi->s_committed_transaction))
+		return;
+
+	/* there is committed blocks to be freed yet */
+	do {
+		/* get next array of blocks */
+		md = NULL;
+		spin_lock(&sbi->s_md_lock);
+		if (!list_empty(&sbi->s_committed_transaction)) {
+			md = list_entry(sbi->s_committed_transaction.next,
+					struct ext4_free_metadata, list);
+			list_del(&md->list);
+		}
+		spin_unlock(&sbi->s_md_lock);
+
+		if (md == NULL)
+			break;
+
+		mb_debug("gonna free %u blocks in group %lu (0x%p):",
+				md->num, md->group, md);
+
+		err = ext4_mb_load_buddy(sb, md->group, &e4b);
+		/* we expect to find existing buddy because it's pinned */
+		BUG_ON(err != 0);
+
+		/* there are blocks to put in buddy to make them really free */
+		count += md->num;
+		count2++;
+		ext4_lock_group(sb, md->group);
+		for (i = 0; i < md->num; i++) {
+			mb_debug(" %u", md->blocks[i]);
+			err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+			BUG_ON(err != 0);
+		}
+		mb_debug("\n");
+		ext4_unlock_group(sb, md->group);
+
+		/* balance refcounts from ext4_mb_free_metadata() */
+		page_cache_release(e4b.bd_buddy_page);
+		page_cache_release(e4b.bd_bitmap_page);
+
+		kfree(md);
+		ext4_mb_release_desc(&e4b);
+
+	/* md is only compared here, never dereferenced after kfree() */
+	} while (md);
+
+	mb_debug("freed %u blocks in %u structures\n", count, count2);
+}
+
+/*
+ * Names used for the mballoc proc interface: EXT4_ROOT is the directory
+ * created under proc_root_fs, the others are the per-device tunable
+ * files created below it.
+ */
+#define EXT4_ROOT			"ext4"
+#define EXT4_MB_STATS_NAME		"stats"
+#define EXT4_MB_MAX_TO_SCAN_NAME	"max_to_scan"
+#define EXT4_MB_MIN_TO_SCAN_NAME	"min_to_scan"
+#define EXT4_MB_ORDER2_REQ		"order2_req"
+#define EXT4_MB_STREAM_REQ		"stream_req"
+#define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
+
+
+
+/*
+ * MB_PROC_VALUE_READ(name) - generate ext4_mb_read_<name>(), a proc read
+ * callback that prints sbi->s_mb_<name> as a decimal number followed by
+ * a newline.  Data is only produced for offset 0; *eof is always set.
+ */
+#define MB_PROC_VALUE_READ(name)				\
+static int ext4_mb_read_##name(char *page, char **start,	\
+		off_t off, int count, int *eof, void *data)	\
+{								\
+	struct ext4_sb_info *sbi = data;			\
+	int len;						\
+	*eof = 1;						\
+	if (off != 0)						\
+		return 0;					\
+	len = sprintf(page, "%ld\n", sbi->s_mb_##name);		\
+	*start = page;						\
+	return len;						\
+}
+
+/*
+ * MB_PROC_VALUE_WRITE(name) - generate ext4_mb_write_<name>(), a proc
+ * write callback that parses a strictly positive integer from user space
+ * and stores it in sbi->s_mb_<name>.
+ *
+ * The generated function returns @cnt on success, -EINVAL if the input
+ * does not fit in the local buffer, -EFAULT if it cannot be copied and
+ * -ERANGE if the value is zero or negative.  The copied bytes are
+ * NUL-terminated before parsing (cnt < sizeof(str) is checked first).
+ */
+#define MB_PROC_VALUE_WRITE(name)				\
+static int ext4_mb_write_##name(struct file *file,		\
+		const char __user *buf, unsigned long cnt, void *data)	\
+{								\
+	struct ext4_sb_info *sbi = data;			\
+	char str[32];						\
+	long value;						\
+	if (cnt >= sizeof(str))					\
+		return -EINVAL;					\
+	if (copy_from_user(str, buf, cnt))			\
+		return -EFAULT;					\
+	str[cnt] = '\0';					\
+	value = simple_strtol(str, NULL, 0);			\
+	if (value <= 0)						\
+		return -ERANGE;					\
+	sbi->s_mb_##name = value;				\
+	return cnt;						\
+}
+
+/* instantiate the read/write proc callbacks for each mballoc tunable */
+MB_PROC_VALUE_READ(stats);
+MB_PROC_VALUE_WRITE(stats);
+MB_PROC_VALUE_READ(max_to_scan);
+MB_PROC_VALUE_WRITE(max_to_scan);
+MB_PROC_VALUE_READ(min_to_scan);
+MB_PROC_VALUE_WRITE(min_to_scan);
+MB_PROC_VALUE_READ(order2_reqs);
+MB_PROC_VALUE_WRITE(order2_reqs);
+MB_PROC_VALUE_READ(stream_request);
+MB_PROC_VALUE_WRITE(stream_request);
+MB_PROC_VALUE_READ(group_prealloc);
+MB_PROC_VALUE_WRITE(group_prealloc);
+
+/*
+ * MB_PROC_HANDLER(name, var) - create proc entry @name under
+ * sbi->s_mb_proc and wire it to ext4_mb_read_<var>/ext4_mb_write_<var>.
+ * Relies on `proc`, `mode` and `sbi` locals and an `err_out` label in
+ * the expanding function; jumps to err_out if the entry cannot be
+ * created.
+ */
+#define	MB_PROC_HANDLER(name, var)					\
+do {									\
+	proc = create_proc_entry(name, mode, sbi->s_mb_proc);		\
+	if (proc == NULL) {						\
+		printk(KERN_ERR "EXT4-fs: can't to create %s\n", name);	\
+		goto err_out;						\
+	}								\
+	proc->data = sbi;						\
+	proc->read_proc  = ext4_mb_read_##var ;				\
+	proc->write_proc = ext4_mb_write_##var;				\
+} while (0)
+
+/*
+ * Create the per-device proc directory (named after the block device)
+ * under proc_root_ext4 and populate it with the mballoc tunables.
+ *
+ * Returns 0 on success, -EINVAL when the ext4 proc root does not exist,
+ * or -ENOMEM when the directory or one of its entries cannot be created.
+ * On any failure sbi->s_mb_proc is left NULL.
+ */
+static int ext4_mb_init_per_dev_proc(struct super_block *sb)
+{
+	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct proc_dir_entry *proc;
+	char devname[64];
+
+	sbi->s_mb_proc = NULL;
+
+	/*
+	 * init_ext4_mballoc() only logs a failure to create the ext4 proc
+	 * root; without it there is no parent to attach to.
+	 */
+	if (proc_root_ext4 == NULL)
+		return -EINVAL;
+
+	/* bdevname() fills devname itself; no self-overlapping copy */
+	bdevname(sb->s_bdev, devname);
+	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
+	if (sbi->s_mb_proc == NULL) {
+		/* do not create the entries under a NULL parent */
+		printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+		return -ENOMEM;
+	}
+
+	MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
+	MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
+	MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
+	MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
+	MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
+	MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
+
+	return 0;
+
+err_out:
+	printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
+	remove_proc_entry(devname, proc_root_ext4);
+	sbi->s_mb_proc = NULL;
+
+	return -ENOMEM;
+}
+
+/*
+ * Remove the mballoc tunable entries and the per-device proc directory.
+ * Returns -EINVAL when no per-device directory was set up, 0 otherwise.
+ */
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char devname[64];
+
+	if (sbi->s_mb_proc == NULL)
+		return -EINVAL;
+
+	/* bdevname() fills devname itself; no self-overlapping copy */
+	bdevname(sb->s_bdev, devname);
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
+	remove_proc_entry(devname, proc_root_ext4);
+
+	return 0;
+}
+
+/*
+ * Module-init part of mballoc: create the slab cache for
+ * ext4_prealloc_space objects and, with CONFIG_PROC_FS, the ext4 proc
+ * root directory.  Only a cache creation failure is fatal (-ENOMEM); a
+ * missing proc directory is merely logged.
+ */
+int __init init_ext4_mballoc(void)
+{
+	ext4_pspace_cachep = kmem_cache_create("ext4_prealloc_space",
+					sizeof(struct ext4_prealloc_space),
+					0, SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!ext4_pspace_cachep)
+		return -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+	proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
+	if (!proc_root_ext4)
+		printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
+#endif
+
+	return 0;
+}
+
+/*
+ * Module-exit counterpart of init_ext4_mballoc(): destroy the
+ * preallocation slab cache and, with CONFIG_PROC_FS, remove the ext4
+ * proc root directory.
+ */
+void exit_ext4_mballoc(void)
+{
+	/* XXX: synchronize_rcu(); */
+	kmem_cache_destroy(ext4_pspace_cachep);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(EXT4_ROOT, proc_root_fs);
+#endif
+}
+
+
+/*
+ * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
+ *
+ * Sets the bits for the chosen extent in the on-disk block bitmap,
+ * updates the group descriptor's free block count (clearing
+ * EXT4_BG_BLOCK_UNINIT first if needed) and checksum, adjusts the
+ * superblock free-blocks counter and journals both buffers.
+ *
+ * Returns 0 if success or error code
+ */
+static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+				handle_t *handle)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_super_block *es;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *gdp_bh;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	ext4_fsblk_t block;
+	int err;
+
+	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+	BUG_ON(ac->ac_b_ex.fe_len <= 0);
+
+	sb = ac->ac_sb;
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+
+	err = -EIO;
+	bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+	if (!bitmap_bh)
+		goto out_err;
+
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto out_err;
+
+	err = -EIO;
+	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+	if (!gdp)
+		goto out_err;
+
+	/* gdp is only valid from here on; do not log it any earlier */
+	ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+			gdp->bg_free_blocks_count);
+
+	err = ext4_journal_get_write_access(handle, gdp_bh);
+	if (err)
+		goto out_err;
+
+	block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
+		+ ac->ac_b_ex.fe_start
+		+ le32_to_cpu(es->s_first_data_block);
+
+	/* the allocation must never land on the group's own metadata */
+	if (block == ext4_block_bitmap(sb, gdp) ||
+			block == ext4_inode_bitmap(sb, gdp) ||
+			in_range(block, ext4_inode_table(sb, gdp),
+				EXT4_SB(sb)->s_itb_per_group)) {
+
+		ext4_error(sb, __FUNCTION__,
+			   "Allocating block in system zone - block = %llu",
+			   block);
+	}
+#ifdef AGGRESSIVE_CHECK
+	{
+		int i;
+		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
+			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
+						bitmap_bh->b_data));
+		}
+	}
+#endif
+	mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
+				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+
+	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+		gdp->bg_free_blocks_count =
+			cpu_to_le16(ext4_free_blocks_after_init(sb,
+						ac->ac_b_ex.fe_group,
+						gdp));
+	}
+	gdp->bg_free_blocks_count =
+		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
+				- ac->ac_b_ex.fe_len);
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	if (err)
+		goto out_err;
+	err = ext4_journal_dirty_metadata(handle, gdp_bh);
+
+out_err:
+	sb->s_dirt = 1;
+	/* bitmap_bh is NULL if reading the bitmap failed; brelse copes */
+	brelse(bitmap_bh);
+	return err;
+}
+
+/*
+ * Normalize a request that is served from a locality group.
+ *
+ * The goal length becomes s_stripe when that is non-zero, otherwise
+ * s_mb_group_prealloc (tunable through
+ * /proc/fs/ext4/<partition>/group_prealloc).  The context must already
+ * have a locality group attached.
+ *
+ * XXX: should we try to preallocate more than the group has now?
+ */
+static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	BUG_ON(ac->ac_lg == NULL);
+
+	if (!sbi->s_stripe)
+		ac->ac_g_ex.fe_len = sbi->s_mb_group_prealloc;
+	else
+		ac->ac_g_ex.fe_len = sbi->s_stripe;
+
+	mb_debug("#%u: goal %lu blocks for locality group\n",
+		current->pid, ac->ac_g_ex.fe_len);
+}
+
+/*
+ * Normalization means making request better in terms of
+ * size and alignment
+ *
+ * Only data requests without the GOAL_ONLY or NOPREALLOC hints are
+ * normalized; GROUP_ALLOC requests are delegated to
+ * ext4_mb_normalize_group_request().  For the rest, the goal extent
+ * (ac->ac_g_ex) is derived from a predicted file size, trimmed so that
+ * it covers neither the neighbouring allocated blocks described by @ar
+ * nor any live preallocation of the inode, and EXT4_MB_HINT_TRY_GOAL is
+ * set when the goal can be merged with a neighbour.
+ */
+static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+				struct ext4_allocation_request *ar)
+{
+	int bsbits, max;
+	ext4_lblk_t end;
+	struct list_head *cur;
+	loff_t size, orig_size, start_off;
+	ext4_lblk_t start, orig_start;
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+
+	/* do normalize only data requests, metadata requests
+	   do not need preallocation */
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return;
+
+	/* sometime caller may want exact blocks */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
+	/* caller may indicate that preallocation isn't
+	 * required (it's a tail, for example) */
+	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
+		return;
+
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
+		ext4_mb_normalize_group_request(ac);
+		return ;
+	}
+
+	bsbits = ac->ac_sb->s_blocksize_bits;
+
+	/* first, let's learn actual file size
+	 * given current request is allocated */
+	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+	size = size << bsbits;
+	if (size < i_size_read(ac->ac_inode))
+		size = i_size_read(ac->ac_inode);
+
+	/* max available blocks in a free group */
+	max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
+				EXT4_SB(ac->ac_sb)->s_itb_per_group;
+
+/* true when req fits in size, or when a group cannot hold more than size */
+#define NRL_CHECK_SIZE(req, size, max,bits)	\
+		(req <= (size) || max <= ((size) >> bits))
+
+	/* first, try to predict filesize */
+	/* XXX: should this table be tunable? */
+	start_off = 0;
+	if (size <= 16 * 1024) {
+		size = 16 * 1024;
+	} else if (size <= 32 * 1024) {
+		size = 32 * 1024;
+	} else if (size <= 64 * 1024) {
+		size = 64 * 1024;
+	} else if (size <= 128 * 1024) {
+		size = 128 * 1024;
+	} else if (size <= 256 * 1024) {
+		size = 256 * 1024;
+	} else if (size <= 512 * 1024) {
+		size = 512 * 1024;
+	} else if (size <= 1024 * 1024) {
+		size = 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
+		/* larger files: align the start to the chunk being used */
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+						(20 - bsbits)) << 20;
+		size = 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+							(22 - bsbits)) << 22;
+		size = 4 * 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+					(8<<20)>>bsbits, max, bsbits)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+							(23 - bsbits)) << 23;
+		size = 8 * 1024 * 1024;
+	} else {
+		/* nothing matched: keep the request exactly as issued */
+		start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+		size	  = ac->ac_o_ex.fe_len << bsbits;
+	}
+	/* from here on size and start are in blocks, not bytes */
+	orig_size = size = size >> bsbits;
+	orig_start = start = start_off >> bsbits;
+
+	/* don't cover already allocated blocks in selected range */
+	if (ar->pleft && start <= ar->lleft) {
+		size -= ar->lleft + 1 - start;
+		start = ar->lleft + 1;
+	}
+	if (ar->pright && start + size - 1 >= ar->lright)
+		size -= start + size - ar->lright;
+
+	end = start + size;
+
+	/* check we don't cross already preallocated blocks */
+	rcu_read_lock();
+	list_for_each_rcu(cur, &ei->i_prealloc_list) {
+		struct ext4_prealloc_space *pa;
+		unsigned long pa_end;
+
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+
+		/* cheap unlocked test first, repeated under pa_lock below */
+		if (pa->pa_deleted)
+			continue;
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+
+		pa_end = pa->pa_lstart + pa->pa_len;
+
+		/* PA must not overlap original request */
+		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
+			ac->ac_o_ex.fe_logical < pa->pa_lstart));
+
+		/* skip PA normalized request doesn't overlap with */
+		if (pa->pa_lstart >= end) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (pa_end <= start) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+
+		/* PA lies before the request: move the start past it */
+		if (pa_end <= ac->ac_o_ex.fe_logical) {
+			BUG_ON(pa_end < start);
+			start = pa_end;
+		}
+
+		/* PA lies after the request: stop the range in front of it */
+		if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+			BUG_ON(pa->pa_lstart > end);
+			end = pa->pa_lstart;
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+	size = end - start;
+
+	/* XXX: extra loop to check we really don't overlap preallocations */
+	rcu_read_lock();
+	list_for_each_rcu(cur, &ei->i_prealloc_list) {
+		struct ext4_prealloc_space *pa;
+		unsigned long pa_end;
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted == 0) {
+			pa_end = pa->pa_lstart + pa->pa_len;
+			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+
+	/*
+	 * NOTE(review): for a positive size the two conditions below look
+	 * mutually exclusive, so this check may never fire; "||" may have
+	 * been intended - confirm.
+	 */
+	if (start + size <= ac->ac_o_ex.fe_logical &&
+			start > ac->ac_o_ex.fe_logical) {
+		printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
+			(unsigned long) start, (unsigned long) size,
+			(unsigned long) ac->ac_o_ex.fe_logical);
+	}
+	BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
+			start > ac->ac_o_ex.fe_logical);
+	BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+	/* now prepare goal request */
+
+	/* XXX: is it better to align blocks WRT to logical
+	 * placement or satisfy big request as is */
+	ac->ac_g_ex.fe_logical = start;
+	ac->ac_g_ex.fe_len = size;
+
+	/* define goal start in order to merge */
+	if (ar->pright && (ar->lright == (start + size))) {
+		/* merge to the right */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
+						&ac->ac_f_ex.fe_group,
+						&ac->ac_f_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+	if (ar->pleft && (ar->lleft + 1 == start)) {
+		/* merge to the left */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
+						&ac->ac_f_ex.fe_group,
+						&ac->ac_f_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+
+	mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+		(unsigned) orig_size, (unsigned) start);
+}
+
+/*
+ * Account a finished allocation in the per-superblock mballoc counters
+ * (only when s_mb_stats is set and the goal was longer than one block),
+ * then record it in the allocation history.
+ */
+static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	if (!sbi->s_mb_stats || ac->ac_g_ex.fe_len <= 1)
+		goto history;
+
+	atomic_inc(&sbi->s_bal_reqs);
+	atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+	if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+		atomic_inc(&sbi->s_bal_success);
+	atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+	/* a goal hit: the best extent starts exactly where the goal did */
+	if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+	    ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+		atomic_inc(&sbi->s_bal_goals);
+	if (ac->ac_found > sbi->s_mb_max_to_scan)
+		atomic_inc(&sbi->s_bal_breaks);
+
+history:
+	ext4_mb_store_history(ac);
+}
+
+/*
+ * Satisfy the original request of @ac from the inode preallocation @pa.
+ *
+ * The physical start is the block of @pa that corresponds to the
+ * requested logical block; the length is capped at the end of @pa.  The
+ * result is stored in ac->ac_b_ex, the context is marked
+ * AC_STATUS_FOUND and the used length is subtracted from pa->pa_free.
+ */
+static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
+				struct ext4_prealloc_space *pa)
+{
+	ext4_fsblk_t first;
+	ext4_fsblk_t limit;
+	int nr;
+
+	/* found preallocated blocks, use them */
+	first = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+	limit = min(pa->pa_pstart + pa->pa_len, first + ac->ac_o_ex.fe_len);
+	nr = limit - first;
+
+	ext4_get_group_no_and_offset(ac->ac_sb, first, &ac->ac_b_ex.fe_group,
+					&ac->ac_b_ex.fe_start);
+	ac->ac_b_ex.fe_len = nr;
+	ac->ac_status = AC_STATUS_FOUND;
+	ac->ac_pa = pa;
+
+	/* the chosen range must lie inside the PA and within its free part */
+	BUG_ON(first < pa->pa_pstart);
+	BUG_ON(first + nr > pa->pa_pstart + pa->pa_len);
+	BUG_ON(pa->pa_free < nr);
+	pa->pa_free -= nr;
+
+	mb_debug("use %llu/%lu from inode pa %p\n", first, nr, pa);
+}
+
+/*
+ * Satisfy the original request of @ac from the locality group
+ * preallocation @pa: the result extent starts at pa->pa_pstart and has
+ * the originally requested length.  @pa itself is not modified here.
+ */
+static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
+				struct ext4_prealloc_space *pa)
+{
+	unsigned nr = ac->ac_o_ex.fe_len;
+
+	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
+					&ac->ac_b_ex.fe_group,
+					&ac->ac_b_ex.fe_start);
+	ac->ac_b_ex.fe_len = nr;
+	ac->ac_status = AC_STATUS_FOUND;
+	ac->ac_pa = pa;
+
+	/* we don't correct pa_pstart or pa_plen here to avoid
+	 * possible race when the group is being loaded concurrently
+	 * instead we correct pa later, after blocks are marked
+	 * in on-disk bitmap -- see ext4_mb_release_context() */
+	/*
+	 * FIXME!! but the other CPUs can look at this particular
+	 * pa and think that it has enough free blocks if we
+	 * don't update pa_free here, right?
+	 */
+	mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-nr, nr, pa);
+}
+
+/*
+ * try existing inode, then locality group, preallocations; returns 1 if used
+ */
+static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+{
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+	struct ext4_locality_group *lg;
+	struct ext4_prealloc_space *pa;
+	struct list_head *cur;
+
+	/* only data can be preallocated */
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return 0;
+
+	/* first, try per-file preallocation */
+	rcu_read_lock();
+	list_for_each_rcu(cur, &ei->i_prealloc_list) {
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+
+		/* all fields in this condition don't change,
+		 * so we can skip locking for them */
+		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
+			ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+			continue;
+
+		/* pa covers our logical block: use it unless deleted/empty */
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted == 0 && pa->pa_free) {
+			atomic_inc(&pa->pa_count);
+			ext4_mb_use_inode_pa(ac, pa);
+			spin_unlock(&pa->pa_lock);
+			ac->ac_criteria = 10;
+			rcu_read_unlock();
+			return 1;
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+
+	/* can we use group allocation? */
+	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
+		return 0;
+
+	/* inode may have no locality group for some reason */
+	lg = ac->ac_lg;
+	if (lg == NULL)
+		return 0;
+	/* locality group PAs are linked through pa_inode_list as well */
+	rcu_read_lock();
+	list_for_each_rcu(cur, &lg->lg_prealloc_list) {
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
+			atomic_inc(&pa->pa_count);
+			ext4_mb_use_group_pa(ac, pa);
+			spin_unlock(&pa->pa_lock);
+			ac->ac_criteria = 20;
+			rcu_read_unlock();
+			return 1;
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * goes through all preallocations in this group and marks their blocks used
+ * in the in-core @bitmap; the buddy must then be generated from this bitmap.
+ * Needs to be called with ext4 group lock held (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_prealloc_space *pa;
+	struct list_head *cur;
+	ext4_group_t groupnr;
+	ext4_grpblk_t start;
+	int preallocated = 0;
+	int count = 0;
+	int len;
+
+	/* all forms of preallocation discard load the group first,
+	 * so the only competing code is preallocation use: the list
+	 * walk needs no extra lock, pa_lock only guards pstart/len.
+	 * notice we do NOT ignore preallocations with pa_deleted
+	 * otherwise we could leave used blocks available for
+	 * allocation in buddy when concurrent ext4_mb_put_pa()
+	 * is dropping preallocation
+	 */
+	list_for_each(cur, &grp->bb_prealloc_list) {
+		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+		spin_lock(&pa->pa_lock);
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+					     &groupnr, &start);
+		len = pa->pa_len;
+		spin_unlock(&pa->pa_lock);
+		if (unlikely(len == 0))
+			continue;
+		BUG_ON(groupnr != group);
+		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+						bitmap, start, len);
+		preallocated += len;
+		count++;
+	}
+	mb_debug("prellocated %u for group %lu\n", preallocated, group);
+}
+
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+	/* RCU callback: free the PA descriptor that embeds @head */
+	kmem_cache_free(ext4_pspace_cachep,
+		container_of(head, struct ext4_prealloc_space, u.pa_rcu));
+}
+
+/*
+ * drops a reference to preallocated space descriptor; if this was the last
+ * reference and the space is consumed, the descriptor is unlinked and freed
+ */
+static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+			struct super_block *sb, struct ext4_prealloc_space *pa)
+{
+	ext4_fsblk_t grp_blk;
+	unsigned long grp;
+
+	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
+		return;
+
+	/* in this short window concurrent discard can set pa_deleted */
+	spin_lock(&pa->pa_lock);
+	if (pa->pa_deleted == 1) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+	pa->pa_deleted = 1;
+	spin_unlock(&pa->pa_lock);
+
+	/* a used-up linear pa has pa_pstart advanced just past its end, maybe
+	 * into the next group; an inode pa keeps pa_pstart inside its group */
+	grp_blk = pa->pa_linear ? pa->pa_pstart - 1 : pa->pa_pstart;
+	ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+
+	/* possible race:
+	 *
+	 *  P1 (buddy init)			P2 (regular allocation)
+	 *					find block B in PA
+	 *  copy on-disk bitmap to buddy
+	 *					mark B in on-disk bitmap
+	 *					drop PA from group
+	 *  mark all PAs in buddy
+	 *
+	 * thus, P1 initializes buddy with B available. to prevent this
+	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
+	 * against that pair */
+	ext4_lock_group(sb, grp);
+	list_del(&pa->pa_group_list);
+	ext4_unlock_group(sb, grp);
+
+	spin_lock(pa->pa_obj_lock);
+	list_del_rcu(&pa->pa_inode_list);
+	spin_unlock(pa->pa_obj_lock);
+
+	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+}
+
+/*
+ * wraps the found extent (larger than requested) into a new per-inode PA
+ */
+static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_prealloc_space *pa;
+	struct ext4_group_info *grp;
+	struct ext4_inode_info *ei;
+
+	/* preallocate only when found space is larger than requested */
+	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+	if (pa == NULL)
+		return -ENOMEM;
+
+	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+		int winl;
+		int wins;
+		int win;
+		int offs;
+
+		/* we can't allocate as much as normalizer wants.
+		 * so, found space must get proper lstart
+		 * to cover original request */
+		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
+		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
+
+		/* we're limited by original request in that its
+		 * logical block must be covered anyway;
+		 * winl is the window we can move our chunk within */
+		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+
+		/* also, we should cover whole original request */
+		wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+
+		/* the smallest one defines real window */
+		win = min(winl, wins);
+
+		offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+		if (offs && offs < win)
+			win = offs;
+
+		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
+		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+	}
+
+	/* using the pa below changes ac_b_ex, thus we store the actually
+	 * allocated blocks for history */
+	ac->ac_f_ex = ac->ac_b_ex;
+
+	pa->pa_lstart = ac->ac_b_ex.fe_logical;
+	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+	pa->pa_len = ac->ac_b_ex.fe_len;
+	pa->pa_free = pa->pa_len;
+	atomic_set(&pa->pa_count, 1);
+	spin_lock_init(&pa->pa_lock);
+	pa->pa_deleted = 0;
+	pa->pa_linear = 0;
+
+	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+
+	ext4_mb_use_inode_pa(ac, pa);
+	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+	ei = EXT4_I(ac->ac_inode);
+	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+
+	pa->pa_obj_lock = &ei->i_prealloc_lock;
+	pa->pa_inode = ac->ac_inode;
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+	spin_lock(pa->pa_obj_lock);
+	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+	spin_unlock(pa->pa_obj_lock);
+
+	return 0;
+}
+
+/*
+ * creates new preallocated space for the locality group the inode belongs to
+ */
+static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg;
+	struct ext4_prealloc_space *pa;
+	struct ext4_group_info *grp;
+
+	/* preallocate only when found space is larger than requested */
+	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+	BUG_ON(ext4_pspace_cachep == NULL);
+	pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+	if (pa == NULL)
+		return -ENOMEM;
+
+	/* using the pa below changes ac_b_ex, thus we store the actually
+	 * allocated blocks for history */
+	ac->ac_f_ex = ac->ac_b_ex;
+
+	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+	pa->pa_lstart = pa->pa_pstart;
+	pa->pa_len = ac->ac_b_ex.fe_len;
+	pa->pa_free = pa->pa_len;
+	atomic_set(&pa->pa_count, 1);
+	spin_lock_init(&pa->pa_lock);
+	pa->pa_deleted = 0;
+	pa->pa_linear = 1;
+
+	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+
+	ext4_mb_use_group_pa(ac, pa);
+	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+	lg = ac->ac_lg;
+	BUG_ON(lg == NULL);
+
+	pa->pa_obj_lock = &lg->lg_prealloc_lock;
+	pa->pa_inode = NULL;
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+	spin_lock(pa->pa_obj_lock);
+	list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
+	spin_unlock(pa->pa_obj_lock);
+
+	return 0;
+}
+
+static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+{
+	/*
+	 * a request served through a locality group gets a group PA,
+	 * any other request gets a PA private to the inode
+	 */
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+		return ext4_mb_new_group_pa(ac);
+	return ext4_mb_new_inode_pa(ac);
+}
+
+/*
+ * finds all blocks of @pa that are still unused in the on-disk bitmap and
+ * frees them in in-core bitmap and buddy.
+ * @pa must be unlinked from inode and group lists, so that
+ * nobody else can find/use it.
+ * the caller MUST hold group/inode locks.
+ * TODO: optimize the case when there are no in-core structures yet
+ */
+static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
+				struct buffer_head *bitmap_bh,
+				struct ext4_prealloc_space *pa)
+{
+	struct ext4_allocation_context ac;
+	struct super_block *sb = e4b->bd_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned long end;
+	unsigned long next;
+	ext4_group_t group;
+	ext4_grpblk_t bit;
+	sector_t start;
+	int err = 0;
+	int free = 0;
+
+	BUG_ON(pa->pa_deleted == 0);
+	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+	end = bit + pa->pa_len;
+
+	ac.ac_sb = sb;
+	ac.ac_inode = pa->pa_inode;
+	ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+	/* walk runs of zero bits inside pa's range of the on-disk bitmap */
+	while (bit < end) {
+		bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+		if (bit >= end)
+			break;
+		next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
+		if (next > end)
+			next = end;
+		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
+				le32_to_cpu(sbi->s_es->s_first_data_block);
+		mb_debug("    free preallocated %u/%u in group %u\n",
+				(unsigned) start, (unsigned) next - bit,
+				(unsigned) group);
+		free += next - bit;
+
+		ac.ac_b_ex.fe_group = group;
+		ac.ac_b_ex.fe_start = bit;
+		ac.ac_b_ex.fe_len = next - bit;
+		ac.ac_b_ex.fe_logical = 0;
+		ext4_mb_store_history(&ac);
+
+		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
+		bit = next + 1;
+	}
+	if (free != pa->pa_free) {
+		printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
+			pa, (unsigned long) pa->pa_lstart,
+			(unsigned long) pa->pa_pstart,
+			(unsigned long) pa->pa_len);
+		printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
+	}
+	BUG_ON(free != pa->pa_free);
+	atomic_add(free, &sbi->s_mb_discarded);
+
+	return err;
+}
+
+static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
+				struct ext4_prealloc_space *pa)
+{
+	struct super_block *sb = e4b->bd_sb;
+	struct ext4_allocation_context ac;
+	ext4_grpblk_t bit;
+	ext4_group_t group;
+
+	BUG_ON(pa->pa_deleted == 0);
+	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+	/* what is left of a linear pa is one unused run: free it in one go */
+	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
+	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+	/* leave a DISCARD record in the mballoc history */
+	ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+	ac.ac_sb = sb;
+	ac.ac_inode = NULL;
+	ac.ac_b_ex.fe_group = group;
+	ac.ac_b_ex.fe_start = bit;
+	ac.ac_b_ex.fe_len = pa->pa_len;
+	ac.ac_b_ex.fe_logical = 0;
+	ext4_mb_store_history(&ac);
+
+	return 0;
+}
+
+/*
+ * releases all preallocations in given group; returns the number of
+ * blocks freed
+ * first, we need to decide discard policy:
+ * - when do we discard
+ *   1) ENOSPC
+ * - how many do we discard
+ *   1) how many requested
+ */
+static int ext4_mb_discard_group_preallocations(struct super_block *sb,
+					ext4_group_t group, int needed)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_prealloc_space *pa, *tmp;
+	struct list_head list;
+	struct ext4_buddy e4b;
+	int err;
+	int busy = 0;
+	int free = 0;
+
+	mb_debug("discard preallocation for group %lu\n", group);
+
+	if (list_empty(&grp->bb_prealloc_list))
+		return 0;
+
+	bitmap_bh = read_block_bitmap(sb, group);
+	if (bitmap_bh == NULL) {
+		/* error handling here; e4b is not loaded yet at this
+		 * point, so there is no buddy descriptor to release */
+		BUG_ON(bitmap_bh == NULL);
+	}
+
+	err = ext4_mb_load_buddy(sb, group, &e4b);
+	BUG_ON(err != 0); /* error handling here */
+
+	if (needed == 0)
+		needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+
+	grp = ext4_get_group_info(sb, group);
+	INIT_LIST_HEAD(&list);
+
+repeat:
+	ext4_lock_group(sb, group);
+	list_for_each_entry_safe(pa, tmp,
+				&grp->bb_prealloc_list, pa_group_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			spin_unlock(&pa->pa_lock);
+			busy = 1;
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+
+		/* we can trust pa_free ... */
+		free += pa->pa_free;
+
+		spin_unlock(&pa->pa_lock);
+
+		list_del(&pa->pa_group_list);
+		list_add(&pa->u.pa_tmp_list, &list);
+	}
+
+	/* if we still need more blocks and some PAs were used, try again */
+	if (free < needed && busy) {
+		busy = 0;
+		ext4_unlock_group(sb, group);
+		/*
+		 * Yield the CPU here so that we don't get soft lockup
+		 * in non preempt case.
+		 */
+		yield();
+		goto repeat;
+	}
+
+	/* found anything to free? */
+	if (list_empty(&list)) {
+		BUG_ON(free != 0);
+		goto out;
+	}
+
+	/* now free all selected PAs */
+	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+
+		/* remove from object (inode or locality group) */
+		spin_lock(pa->pa_obj_lock);
+		list_del_rcu(&pa->pa_inode_list);
+		spin_unlock(pa->pa_obj_lock);
+
+		if (pa->pa_linear)
+			ext4_mb_release_group_pa(&e4b, pa);
+		else
+			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+
+out:
+	ext4_unlock_group(sb, group);
+	ext4_mb_release_desc(&e4b);
+	put_bh(bitmap_bh);
+	return free;
+}
+
+/*
+ * releases all unused preallocated blocks for given inode
+ *
+ * It's important to discard preallocations under i_data_sem:
+ * we don't want another block to be served from the prealloc
+ * space while we are discarding the inode prealloc space.
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ */
+void ext4_mb_discard_inode_preallocations(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_prealloc_space *pa, *tmp;
+	ext4_group_t group = 0;
+	struct list_head list;
+	struct ext4_buddy e4b;
+	int err;
+
+	if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+		return;
+	}
+
+	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+
+	INIT_LIST_HEAD(&list);
+
+repeat:
+	/* first, collect all pa's in the inode */
+	spin_lock(&ei->i_prealloc_lock);
+	while (!list_empty(&ei->i_prealloc_list)) {
+		pa = list_entry(ei->i_prealloc_list.next,
+				struct ext4_prealloc_space, pa_inode_list);
+		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/* this shouldn't happen often - nobody should
+			 * use preallocation while we're discarding it */
+			spin_unlock(&pa->pa_lock);
+			spin_unlock(&ei->i_prealloc_lock);
+			printk(KERN_ERR "uh-oh! used pa while discarding\n");
+			WARN_ON(1);
+			schedule_timeout_uninterruptible(HZ);
+			goto repeat;
+
+		}
+		if (pa->pa_deleted == 0) {
+			pa->pa_deleted = 1;
+			spin_unlock(&pa->pa_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			list_add(&pa->u.pa_tmp_list, &list);
+			continue;
+		}
+
+		/* someone is deleting pa right now */
+		spin_unlock(&pa->pa_lock);
+		spin_unlock(&ei->i_prealloc_lock);
+
+		/* we have to wait here because pa_deleted
+		 * doesn't mean pa is already unlinked from
+		 * the list. as we might be called from
+		 * ->clear_inode() the inode will get freed
+		 * and concurrent thread which is unlinking
+		 * pa from inode's list may access already
+		 * freed memory, bad-bad-bad */
+
+		/* XXX: if this happens too often, we can
+		 * add a flag to force wait only in case
+		 * of ->clear_inode(), but not in case of
+		 * regular truncate */
+		schedule_timeout_uninterruptible(HZ);
+		goto repeat;
+	}
+	spin_unlock(&ei->i_prealloc_lock);
+	/* then free each collected pa against its group's bitmap and buddy */
+	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+		BUG_ON(pa->pa_linear != 0);
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+
+		err = ext4_mb_load_buddy(sb, group, &e4b);
+		BUG_ON(err != 0); /* error handling here */
+
+		bitmap_bh = read_block_bitmap(sb, group);
+		if (bitmap_bh == NULL) {
+			/* error handling here */
+			ext4_mb_release_desc(&e4b);
+			BUG_ON(bitmap_bh == NULL);
+		}
+
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_release_desc(&e4b);
+		put_bh(bitmap_bh);
+
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+}
+
+/*
+ * meant to find all preallocated spaces and return freed blocks to them;
+ * if a preallocated space becomes full (no block is used from the space)
+ * the function would free the space in buddy.
+ * XXX: at the moment, truncate (which is the only way to free blocks)
+ * discards all preallocations, so this only asserts that none are left
+ */
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+					struct ext4_buddy *e4b,
+					sector_t block, int count)
+{
+	BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
+}
+#ifdef MB_DEBUG
+static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	ext4_group_t i;
+
+	printk(KERN_ERR "EXT4-fs: Can't allocate:"
+			" Allocation context details:\n");
+	printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
+			ac->ac_status, ac->ac_flags);
+	printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
+			"best %lu/%lu/%lu@%lu cr %d\n",
+			(unsigned long)ac->ac_o_ex.fe_group,
+			(unsigned long)ac->ac_o_ex.fe_start,
+			(unsigned long)ac->ac_o_ex.fe_len,
+			(unsigned long)ac->ac_o_ex.fe_logical,
+			(unsigned long)ac->ac_g_ex.fe_group,
+			(unsigned long)ac->ac_g_ex.fe_start,
+			(unsigned long)ac->ac_g_ex.fe_len,
+			(unsigned long)ac->ac_g_ex.fe_logical,
+			(unsigned long)ac->ac_b_ex.fe_group,
+			(unsigned long)ac->ac_b_ex.fe_start,
+			(unsigned long)ac->ac_b_ex.fe_len,
+			(unsigned long)ac->ac_b_ex.fe_logical,
+			(int)ac->ac_criteria);
+	printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
+		ac->ac_found);
+	printk(KERN_ERR "EXT4-fs: groups: \n");
+	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+		struct ext4_prealloc_space *pa;
+		ext4_grpblk_t start;
+		struct list_head *cur;
+		ext4_lock_group(sb, i);
+		list_for_each(cur, &grp->bb_prealloc_list) {
+			pa = list_entry(cur, struct ext4_prealloc_space,
+					pa_group_list);
+			spin_lock(&pa->pa_lock);
+			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+						     NULL, &start);
+			spin_unlock(&pa->pa_lock);
+			printk(KERN_ERR "PA:%lu:%d:%u \n", i,
+							start, pa->pa_len);
+		}
+		ext4_unlock_group(sb, i);
+
+		if (grp->bb_free == 0)
+			continue;
+		printk(KERN_ERR "%lu: %d/%d \n",
+		       i, grp->bb_free, grp->bb_fragments);
+	}
+	printk(KERN_ERR "\n");
+}
+#else /* !MB_DEBUG */
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+	return;
+}
+#endif /* MB_DEBUG */
+
+/*
+ * We use locality group preallocation for small files. The size of the
+ * file is determined by the current size or the resulting size after
+ * allocation, whichever is larger
+ *
+ * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ */
+static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int bsbits = ac->ac_sb->s_blocksize_bits;
+	loff_t size, isize;
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return;
+
+	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+	isize = i_size_read(ac->ac_inode) >> bsbits;
+	size = max(size, isize);
+
+	/* don't use group allocation for large files */
+	if (size >= sbi->s_mb_stream_request)
+		return;
+
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
+	BUG_ON(ac->ac_lg != NULL);
+	/*
+	 * locality group prealloc space are per cpu. The reason for having
+	 * per cpu locality group is to reduce the contention between block
+	 * request from multiple CPUs.
+	 */
+	ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
+	put_cpu();
+
+	/* we're going to use group allocation */
+	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
+
+	/* serialize allocations in the group until the context is released */
+	mutex_lock(&ac->ac_lg->lg_mutex);
+}
+
+static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
+				struct ext4_allocation_request *ar)
+{
+	struct super_block *sb = ar->inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_group_t group;
+	unsigned long len;
+	unsigned long goal;
+	ext4_grpblk_t block;
+
+	/* we can't allocate > group size */
+	len = ar->len;
+
+	/* just a dirty hack to filter too big requests */
+	if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
+		len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+
+	/* start searching from the goal; bad goals become first data block */
+	goal = ar->goal;
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+			goal >= ext4_blocks_count(es))
+		goal = le32_to_cpu(es->s_first_data_block);
+	ext4_get_group_no_and_offset(sb, goal, &group, &block);
+
+	/* set up allocation goals */
+	ac->ac_b_ex.fe_logical = ar->logical;
+	ac->ac_b_ex.fe_group = 0;
+	ac->ac_b_ex.fe_start = 0;
+	ac->ac_b_ex.fe_len = 0;
+	ac->ac_status = AC_STATUS_CONTINUE;
+	ac->ac_groups_scanned = 0;
+	ac->ac_ex_scanned = 0;
+	ac->ac_found = 0;
+	ac->ac_sb = sb;
+	ac->ac_inode = ar->inode;
+	ac->ac_o_ex.fe_logical = ar->logical;
+	ac->ac_o_ex.fe_group = group;
+	ac->ac_o_ex.fe_start = block;
+	ac->ac_o_ex.fe_len = len;
+	ac->ac_g_ex.fe_logical = ar->logical;
+	ac->ac_g_ex.fe_group = group;
+	ac->ac_g_ex.fe_start = block;
+	ac->ac_g_ex.fe_len = len;
+	ac->ac_f_ex.fe_len = 0;
+	ac->ac_flags = ar->flags;
+	ac->ac_2order = 0;
+	ac->ac_criteria = 0;
+	ac->ac_pa = NULL;
+	ac->ac_bitmap_page = NULL;
+	ac->ac_buddy_page = NULL;
+	ac->ac_lg = NULL;
+
+	/* we have to define context: will we work with a file or
+	 * locality group. this is a policy, actually */
+	ext4_mb_group_or_file(ac);
+
+	mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+			"left: %u/%u, right %u/%u to %swritable\n",
+			(unsigned) ar->len, (unsigned) ar->logical,
+			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
+			(unsigned) ar->lleft, (unsigned) ar->pleft,
+			(unsigned) ar->lright, (unsigned) ar->pright,
+			atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+	return 0;
+
+}
+
+/*
+ * release all resources we used in allocation and collect statistics
+ */
+static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+{
+	if (ac->ac_pa) {
+		if (ac->ac_pa->pa_linear) {
+			/* see comment in ext4_mb_use_group_pa() */
+			spin_lock(&ac->ac_pa->pa_lock);
+			ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
+			ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
+			ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
+			ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&ac->ac_pa->pa_lock);
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+	}
+	if (ac->ac_bitmap_page)
+		page_cache_release(ac->ac_bitmap_page);
+	if (ac->ac_buddy_page)
+		page_cache_release(ac->ac_buddy_page);
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+		mutex_unlock(&ac->ac_lg->lg_mutex);
+	ext4_mb_collect_stats(ac);
+	return 0;
+}
+
+static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
+{
+	ext4_group_t group = 0;
+	int freed = 0;
+	int ret;
+
+	/* walk the groups in order until @needed blocks have been discarded */
+	while (group < EXT4_SB(sb)->s_groups_count && needed > 0) {
+		ret = ext4_mb_discard_group_preallocations(sb, group++, needed);
+		freed += ret;
+		needed -= ret;
+	}
+	return freed;
+}
+
+/*
+ * Main entry point into mballoc to allocate blocks
+ * it tries to use preallocation first, then falls back
+ * to usual allocation
+ */
+ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+				 struct ext4_allocation_request *ar, int *errp)
+{
+	struct ext4_allocation_context ac;
+	struct ext4_sb_info *sbi;
+	struct super_block *sb;
+	ext4_fsblk_t block = 0;
+	int freed;
+	int inquota;
+
+	sb = ar->inode->i_sb;
+	sbi = EXT4_SB(sb);
+
+	if (!test_opt(sb, MBALLOC)) {
+		block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+					    &(ar->len), errp);
+		return block;
+	}
+	/* quota: retry with ever smaller requests, without preallocation */
+	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
+		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+		ar->len--;
+	}
+	if (ar->len == 0) {
+		*errp = -EDQUOT;
+		return 0;
+	}
+	inquota = ar->len;
+
+	ext4_mb_poll_new_transaction(sb, handle);
+
+	*errp = ext4_mb_initialize_context(&ac, ar);
+	if (*errp) {
+		ar->len = 0;
+		goto out;
+	}
+
+	ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
+	if (!ext4_mb_use_preallocated(&ac)) {
+
+		ac.ac_op = EXT4_MB_HISTORY_ALLOC;
+		ext4_mb_normalize_request(&ac, ar);
+
+repeat:
+		/* allocate space in core */
+		ext4_mb_regular_allocator(&ac);
+
+		/* as we've just preallocated more space than
+		 * user requested originally, we store allocated
+		 * space in a special descriptor */
+		if (ac.ac_status == AC_STATUS_FOUND &&
+				ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
+			ext4_mb_new_preallocation(&ac);
+	}
+
+	if (likely(ac.ac_status == AC_STATUS_FOUND)) {
+		ext4_mb_mark_diskspace_used(&ac, handle);
+		*errp = 0;
+		block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
+		ar->len = ac.ac_b_ex.fe_len;
+	} else {
+		freed  = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
+		if (freed)
+			goto repeat;
+		*errp = -ENOSPC;
+		ac.ac_b_ex.fe_len = 0;
+		ar->len = 0;
+		ext4_mb_show_ac(&ac);
+	}
+
+	ext4_mb_release_context(&ac);
+
+out:
+	if (ar->len < inquota)
+		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+
+	return block;
+}
+static void ext4_mb_poll_new_transaction(struct super_block *sb,
+						handle_t *handle)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
+		return;
+
+	/* new transaction! time to close last one and free blocks for
+	 * committed transaction. we know that only one transaction can be
+	 * active, so the previous transaction may still be being logged and
+	 * we know that the transaction before previous is already logged.
+	 * this means that now we may free blocks freed in all
+	 * transactions before previous one. hope I'm clear enough ... */
+
+	spin_lock(&sbi->s_md_lock);
+	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
+		mb_debug("new transaction %lu, old %lu\n",
+				(unsigned long) handle->h_transaction->t_tid,
+				(unsigned long) sbi->s_last_transaction);
+		list_splice_init(&sbi->s_closed_transaction,
+				&sbi->s_committed_transaction);
+		list_splice_init(&sbi->s_active_transaction,
+				&sbi->s_closed_transaction);
+		sbi->s_last_transaction = handle->h_transaction->t_tid;
+	}
+	spin_unlock(&sbi->s_md_lock);
+
+	ext4_mb_free_committed_blocks(sb);
+}
+
+static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+			  ext4_group_t group, ext4_grpblk_t block, int count)
+{
+	struct ext4_group_info *db = e4b->bd_info;
+	struct super_block *sb = e4b->bd_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_free_metadata *md;
+	int i;
+
+	BUG_ON(e4b->bd_bitmap_page == NULL);
+	BUG_ON(e4b->bd_buddy_page == NULL);
+	/* queue each block in the group's container for this transaction */
+	ext4_lock_group(sb, group);
+	for (i = 0; i < count; i++) {
+		md = db->bb_md_cur;
+		if (md && db->bb_tid != handle->h_transaction->t_tid) {
+			db->bb_md_cur = NULL;
+			md = NULL;
+		}
+
+		if (md == NULL) {
+			ext4_unlock_group(sb, group);
+			md = kmalloc(sizeof(*md), GFP_NOFS);
+			if (md == NULL)
+				return -ENOMEM;
+			md->num = 0;
+			md->group = group;
+
+			ext4_lock_group(sb, group);
+			if (db->bb_md_cur == NULL) {
+				spin_lock(&sbi->s_md_lock);
+				list_add(&md->list, &sbi->s_active_transaction);
+				spin_unlock(&sbi->s_md_lock);
+				/* protect buddy cache from being freed,
+				 * otherwise we'll refresh it from
+				 * on-disk bitmap and lose not-yet-available
+				 * blocks */
+				page_cache_get(e4b->bd_buddy_page);
+				page_cache_get(e4b->bd_bitmap_page);
+				db->bb_md_cur = md;
+				db->bb_tid = handle->h_transaction->t_tid;
+				mb_debug("new md 0x%p for group %lu\n",
+						md, md->group);
+			} else {
+				kfree(md);
+				md = db->bb_md_cur;
+			}
+		}
+
+		BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
+		md->blocks[md->num] = block + i;
+		md->num++;
+		if (md->num == EXT4_BB_MAX_BLOCKS) {
+			/* container is full; the next block starts a new one */
+			db->bb_md_cur = NULL;
+		}
+	}
+	ext4_unlock_group(sb, group);
+	return 0;
+}
+
+/*
+ * Main entry point into mballoc to free blocks
+ */
+void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
+			unsigned long block, unsigned long count,
+			int metadata, unsigned long *freed)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	struct ext4_allocation_context ac;
+	struct ext4_group_desc *gdp;
+	struct ext4_super_block *es;
+	unsigned long overflow;
+	ext4_grpblk_t bit;
+	struct buffer_head *gd_bh;
+	ext4_group_t block_group;
+	struct ext4_sb_info *sbi;
+	struct ext4_buddy e4b;
+	int err = 0;
+	int ret;
+
+	*freed = 0;
+
+	ext4_mb_poll_new_transaction(sb, handle);
+
+	sbi = EXT4_SB(sb);
+	es = EXT4_SB(sb)->s_es;
+	if (block < le32_to_cpu(es->s_first_data_block) ||
+	    block + count < block ||
+	    block + count > ext4_blocks_count(es)) {
+		ext4_error(sb, __FUNCTION__,
+			    "Freeing blocks not in datazone - "
+			    "block = %lu, count = %lu", block, count);
+		goto error_return;
+	}
+
+	ext4_debug("freeing block %lu\n", block);
+
+	ac.ac_op = EXT4_MB_HISTORY_FREE;
+	ac.ac_inode = inode;
+	ac.ac_sb = sb;
+
+do_more:
+	overflow = 0;
+	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+		overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+		count -= overflow;
+	}
+	bitmap_bh = read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
+	if (!gdp)
+		goto error_return;
+
+	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
+	    in_range(block, ext4_inode_table(sb, gdp),
+		      EXT4_SB(sb)->s_itb_per_group) ||
+	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
+		      EXT4_SB(sb)->s_itb_per_group)) {
+
+		ext4_error(sb, __FUNCTION__,
+			   "Freeing blocks in system zone - "
+			   "Block = %lu, count = %lu", block, count);
+	}
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
+
+#ifdef AGGRESSIVE_CHECK
+	{
+		int i;
+		for (i = 0; i < count; i++)
+			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
+	}
+#endif
+	mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+			bit, count);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+
+	ac.ac_b_ex.fe_group = block_group;
+	ac.ac_b_ex.fe_start = bit;
+	ac.ac_b_ex.fe_len = count;
+	ext4_mb_store_history(&ac);
+
+	if (metadata) {
+		/* blocks being freed are metadata. these blocks shouldn't
+		 * be used until this transaction is committed */
+		ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+	} else {
+		/* use ret so that a journaling error kept in err survives */
+		ext4_lock_group(sb, block_group);
+		ret = mb_free_blocks(inode, &e4b, bit, count);
+		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
+		ext4_unlock_group(sb, block_group);
+		BUG_ON(ret != 0);
+	}
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	gdp->bg_free_blocks_count =
+		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	percpu_counter_add(&sbi->s_freeblocks_counter, count);
+
+	ext4_mb_release_desc(&e4b);
+
+	*freed += count;
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext4_journal_dirty_metadata(handle, gd_bh);
+	if (!err)
+		err = ret;
+
+	if (overflow && !err) {
+		block += count;
+		count = overflow;
+		put_bh(bitmap_bh);
+		goto do_more;
+	}
+	sb->s_dirt = 1;
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, err);
+	return;
+}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index ec7cb567a7da..3ebc2332f52e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -236,10 +236,10 @@ static int free_dind_blocks(handle_t *handle,
 	for (i = 0; i < max_entries; i++) {
 		if (tmp_idata[i])
 			ext4_free_blocks(handle, inode,
-					le32_to_cpu(tmp_idata[i]), 1);
+					le32_to_cpu(tmp_idata[i]), 1, 1);
 	}
 	put_bh(bh);
-	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
+	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
 	return 0;
 }
 
@@ -267,7 +267,7 @@ static int free_tind_blocks(handle_t *handle,
 		}
 	}
 	put_bh(bh);
-	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
+	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
 	return 0;
 }
 
@@ -278,7 +278,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode)
 
 	if (ei->i_data[EXT4_IND_BLOCK])
 		ext4_free_blocks(handle, inode,
-				le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1);
+				le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
 
 	if (ei->i_data[EXT4_DIND_BLOCK]) {
 		retval = free_dind_blocks(handle, inode,
@@ -365,7 +365,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 		}
 	}
 	put_bh(bh);
-	ext4_free_blocks(handle, inode, block, 1);
+	ext4_free_blocks(handle, inode, block, 1, 1);
 	return retval;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 64fc7f111734..3a51ffc47790 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -503,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i;
 
+	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
 	jbd2_journal_destroy(sbi->s_journal);
@@ -569,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+	INIT_LIST_HEAD(&ei->i_prealloc_list);
+	spin_lock_init(&ei->i_prealloc_lock);
 	return &ei->vfs_inode;
 }
 
@@ -881,6 +884,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+	Opt_mballoc, Opt_nomballoc, Opt_stripe,
 };
 
 static match_table_t tokens = {
@@ -935,6 +939,9 @@ static match_table_t tokens = {
 	{Opt_extents, "extents"},
 	{Opt_noextents, "noextents"},
 	{Opt_i_version, "i_version"},
+	{Opt_mballoc, "mballoc"},
+	{Opt_nomballoc, "nomballoc"},
+	{Opt_stripe, "stripe=%u"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -1284,6 +1291,19 @@ clear_qf_name:
 			set_opt(sbi->s_mount_opt, I_VERSION);
 			sb->s_flags |= MS_I_VERSION;
 			break;
+		case Opt_mballoc:
+			set_opt(sbi->s_mount_opt, MBALLOC);
+			break;
+		case Opt_nomballoc:
+			clear_opt(sbi->s_mount_opt, MBALLOC);
+			break;
+		case Opt_stripe:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_stripe = option;
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1742,6 +1762,34 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 	return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
+/**
+ * ext4_get_stripe_size: Get the stripe size.
+ * @sbi: In memory super block info
+ *
+ * Candidates are tried in order: the stripe= mount option value, then
+ * the on-disk s_raid_stripe_width, then the on-disk s_raid_stride.
+ * The first one that is non-zero and not greater than the blocks per
+ * group is returned; an unset (zero) value falls through to the next.
+ * Returns 0 if no candidate is usable.
+ * Allocator needs it to be no more than blocks per group.
+ */
+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+	unsigned long stripe_width =
+			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+
+	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
+		return sbi->s_stripe;
+
+	if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
+		return stripe_width;
+
+	if (stride && stride <= sbi->s_blocks_per_group)
+		return stride;
+
+	return 0;
+}
 
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 				__releases(kernel_sem)
@@ -2091,6 +2139,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_rsv_window_head.rsv_goal_size = 0;
 	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
 
+	sbi->s_stripe = ext4_get_stripe_size(sbi);
+
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -2250,6 +2300,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		"writeback");
 
 	ext4_ext_init(sb);
+	ext4_mb_init(sb, needs_recovery);
 
 	lock_kernel();
 	return 0;
@@ -3232,9 +3283,15 @@ static struct file_system_type ext4dev_fs_type = {
 
 static int __init init_ext4_fs(void)
 {
-	int err = init_ext4_xattr();
+	int err;
+
+	err = init_ext4_mballoc();
 	if (err)
 		return err;
+
+	err = init_ext4_xattr();
+	if (err)
+		goto out2;
 	err = init_inodecache();
 	if (err)
 		goto out1;
@@ -3246,6 +3303,8 @@ out:
 	destroy_inodecache();
 out1:
 	exit_ext4_xattr();
+out2:
+	exit_ext4_mballoc();
 	return err;
 }
 
@@ -3254,6 +3313,7 @@ static void __exit exit_ext4_fs(void)
 	unregister_filesystem(&ext4dev_fs_type);
 	destroy_inodecache();
 	exit_ext4_xattr();
+	exit_ext4_mballoc();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86387302c2a9..d7962139c010 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		ea_bdebug(bh, "refcount now=0; freeing");
 		if (ce)
 			mb_cache_entry_free(ce);
-		ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
+		ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
 		get_bh(bh);
 		ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
 	} else {
@@ -821,7 +821,7 @@ inserted:
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
 getblk_failed:
-				ext4_free_blocks(handle, inode, block, 1);
+				ext4_free_blocks(handle, inode, block, 1, 1);
 				error = -EIO;
 				goto cleanup;
 			}
-- 
cgit v1.2.3


From 3dbd0ede4d5320bd4c3cb914fec0595135b6d9a1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:26 -0500
Subject: ext4: Enable the multiblock allocator by default

Enable the multiblock allocator by default.

Fix ext4_show_options() so that if it is not enabled, the nomballoc option
is included in /proc/mounts.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3a51ffc47790..b60c34038bb8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -736,6 +736,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
+	if (!test_opt(sb, MBALLOC))
+		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 
@@ -1903,6 +1905,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	 * User -o noextents to turn it off
 	 */
 	set_opt(sbi->s_mount_opt, EXTENTS);
+	/*
+	 * turn on mballoc feature by default in ext4 filesystem
+	 * User -o nomballoc to turn it off
+	 */
+	set_opt(sbi->s_mount_opt, MBALLOC);
 
 	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
 			    NULL, 0))
-- 
cgit v1.2.3


From cb45bbe44b09f35bb12d67ffa7ecff862608aeae Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Add stripe= option to /proc/mounts Add stripe= option to
 /proc/mounts for ext4 filesystems.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b60c34038bb8..00560cfa519d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -741,6 +741,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 
+	if (sbi->s_stripe)
+		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
 	/*
 	 * journal mode get enabled in different ways
 	 * So just print the value even if we didn't specify it
-- 
cgit v1.2.3


From ce40733ce93de402ed629762f0e912d9af187cef Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Check for return value from sb_set_blocksize

sb_set_blocksize validates whether the specified block size can be used by
the file system. Make sure we fail mounting the file system if the
blocksize specified cannot be used.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/super.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 00560cfa519d..055a0cd0168e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1812,7 +1812,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	unsigned long def_mount_opts;
 	struct inode *root;
 	int blocksize;
-	int hblock;
 	int db_count;
 	int i;
 	int needs_recovery;
@@ -1969,20 +1968,16 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	hblock = bdev_hardsect_size(sb->s_bdev);
 	if (sb->s_blocksize != blocksize) {
-		/*
-		 * Make sure the blocksize for the filesystem is larger
-		 * than the hardware sectorsize for the machine.
-		 */
-		if (blocksize < hblock) {
-			printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
-			       "device blocksize %d.\n", blocksize, hblock);
+
+		/* Validate the filesystem blocksize */
+		if (!sb_set_blocksize(sb, blocksize)) {
+			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+					blocksize);
 			goto failed_mount;
 		}
 
 		brelse (bh);
-		sb_set_blocksize(sb, blocksize);
 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
 		offset = do_div(logical_sb_block, blocksize);
 		bh = sb_bread(sb, logical_sb_block);
-- 
cgit v1.2.3


From dbf9d7da33f79302fb1e4d7c6b2f6598e8608e72 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: fix uniniatilized extent splitting error

Fix bug reported by Dmitry Monakhov caused by lost error code

    Testcase:

    blksize = 0x1000;
    fd = open(argv[1], O_RDWR|O_CREAT, 0700);
    unsigned long long sz = 0x10000000UL;
    /* allocating big blocks chunk */
    syscall(__NR_fallocate, fd, 0, 0UL, sz)

    /* grab all other available filesystem space */
    tfd = open("tmp", O_RDWR|O_CREAT|O_DIRECT, 0700);
    while( write(tfd, buf, 4096) > 0); /* loop until ENOSPC */
    fsync(fd); /* just in case */
    while (pos < sz) {
    	/* each seek+ write operation result in splits uninitialized extent
    	in three extents. Splitting may result in new extent allocation
    	which probably will fail because of ENOSPC*/

    	lseek(fd, blksize*2 -1, SEEK_CUR);
    	if ((ret = write(fd, 'a', 1)) != 1)
    		exit(1);
    	pos += blksize * 2;
    }

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0cffb59fff46..ce9aa5860569 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2373,9 +2373,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ret = ext4_ext_convert_to_initialized(handle, inode,
 								path, iblock,
 								max_blocks);
-			if (ret <= 0)
+			if (ret <= 0) {
+				err = ret;
 				goto out2;
-			else
+			} else
 				allocated = ret;
 			goto outnew;
 		}
-- 
cgit v1.2.3


From b939e3766ec19eb556cb784c2faace253c6e1560 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: ext4: Use the ext4_ext_actual_len() helper function

ext4 uses the high bit of the extent length to encode whether the extent
is initialized or not. The helper function ext4_ext_get_actual_len should
be used to get the actual length of the extent.

This addresses the kernel bug documented here:
     http://bugzilla.kernel.org/show_bug.cgi?id=9732

kernel BUG at fs/ext4/extents.c:1056!
....
Call Trace:
[<ffffffff88366073>] :ext4dev:ext4_ext_get_blocks+0x5ba/0x8c1
[<ffffffff81053c91>] lock_release_holdtime+0x27/0x49
[<ffffffff812748f6>] _spin_unlock+0x17/0x20
[<ffffffff883400a6>] :jbd2:start_this_handle+0x4e0/0x4fe
[<ffffffff88366564>] :ext4dev:ext4_fallocate+0x175/0x39a
[<ffffffff81053c91>] lock_release_holdtime+0x27/0x49
[<ffffffff81056480>] __lock_acquire+0x4e7/0xc4d
[<ffffffff81053c91>] lock_release_holdtime+0x27/0x49
[<ffffffff810a8de7>] sys_fallocate+0xe4/0x10d
[<ffffffff8100c043>] tracesys+0xd5/0xda

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ce9aa5860569..bc7081f1fbe8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1029,7 +1029,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 {
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
-	int depth;
+	int depth, ee_len;
 
 	BUG_ON(path == NULL);
 	depth = path->p_depth;
@@ -1043,6 +1043,7 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	 * first one in the file */
 
 	ex = path[depth].p_ext;
+	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
 		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
 		while (--depth >= 0) {
@@ -1052,10 +1053,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 		return 0;
 	}
 
-	BUG_ON(*logical < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len));
+	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
 
-	*logical = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1;
-	*phys = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - 1;
+	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
+	*phys = ext_pblock(ex) + ee_len - 1;
 	return 0;
 }
 
@@ -1075,7 +1076,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
 	ext4_fsblk_t block;
-	int depth;
+	int depth, ee_len;
 
 	BUG_ON(path == NULL);
 	depth = path->p_depth;
@@ -1089,6 +1090,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	 * first one in the file */
 
 	ex = path[depth].p_ext;
+	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
 		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
 		while (--depth >= 0) {
@@ -1100,7 +1102,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 		return 0;
 	}
 
-	BUG_ON(*logical < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len));
+	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
 
 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
 		/* next allocated block in this leaf */
@@ -1316,7 +1318,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	if (ext1_ee_len + ext2_ee_len > max_len)
 		return 0;
 #ifdef AGGRESSIVE_TEST
-	if (le16_to_cpu(ex1->ee_len) >= 4)
+	if (ext1_ee_len >= 4)
 		return 0;
 #endif
 
@@ -2313,7 +2315,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				   - le32_to_cpu(newex.ee_block)
 				   + ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
-			allocated = le16_to_cpu(newex.ee_len) -
+			allocated = ext4_ext_get_actual_len(&newex) -
 					(iblock - le32_to_cpu(newex.ee_block));
 			goto out;
 		} else {
@@ -2429,7 +2431,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newex.ee_len = cpu_to_le16(max_blocks);
 	err = ext4_ext_check_overlap(inode, &newex, path);
 	if (err)
-		allocated = le16_to_cpu(newex.ee_len);
+		allocated = ext4_ext_get_actual_len(&newex);
 	else
 		allocated = max_blocks;
 
@@ -2461,7 +2463,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 * but otherwise we'd need to call it every free() */
 		ext4_mb_discard_inode_preallocations(inode);
 		ext4_free_blocks(handle, inode, ext_pblock(&newex),
-					le16_to_cpu(newex.ee_len), 0);
+					ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
 	}
 
@@ -2470,7 +2472,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
-	allocated = le16_to_cpu(newex.ee_len);
+	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
 	__set_bit(BH_New, &bh_result->b_state);
 
-- 
cgit v1.2.3


From 7b7510662f4d05ddcc45d435769860e73e6aa20e Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: jbd2: add lockdep support

Ported from similar patch for the jbd layer.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f30802aeefae..70b3199e69dc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -241,6 +241,8 @@ out:
 	return ret;
 }
 
+static struct lock_class_key jbd2_handle_key;
+
 /* Allocate a new handle.  This should probably be in a slab... */
 static handle_t *new_handle(int nblocks)
 {
@@ -251,6 +253,9 @@ static handle_t *new_handle(int nblocks)
 	handle->h_buffer_credits = nblocks;
 	handle->h_ref = 1;
 
+	lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
+						&jbd2_handle_key, 0);
+
 	return handle;
 }
 
@@ -293,7 +298,11 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 		jbd2_free_handle(handle);
 		current->journal_info = NULL;
 		handle = ERR_PTR(err);
+		goto out;
 	}
+
+	lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+out:
 	return handle;
 }
 
@@ -1419,6 +1428,8 @@ int jbd2_journal_stop(handle_t *handle)
 		spin_unlock(&journal->j_state_lock);
 	}
 
+	lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+
 	jbd2_free_handle(handle);
 	return err;
 }
-- 
cgit v1.2.3


From 77160957e29e9413f7420e85fca37a47d4ffac7f Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: jbd2: Mark jbd2 slabs as SLAB_TEMPORARY

This patch marks slab allocations by jbd2 as short-lived in support of
Mel Gorman's "Group short-lived and reclaimable kernel allocations"
patch.  (Ported from similar changes made to fs/jbd/journal.c and
fs/jbd/revoke.c in Mel's patch.)

Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 4 ++--
 fs/jbd2/revoke.c  | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 59ba2494dcaf..96ba846992e9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1973,7 +1973,7 @@ static int journal_init_jbd2_journal_head_cache(void)
 	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
 				sizeof(struct journal_head),
 				0,		/* offset */
-				0,		/* flags */
+				SLAB_TEMPORARY,	/* flags */
 				NULL);		/* ctor */
 	retval = 0;
 	if (jbd2_journal_head_cache == 0) {
@@ -2269,7 +2269,7 @@ static int __init journal_init_handle_cache(void)
 	jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
 				sizeof(handle_t),
 				0,		/* offset */
-				0,		/* flags */
+				SLAB_TEMPORARY,	/* flags */
 				NULL);		/* ctor */
 	if (jbd2_handle_cache == NULL) {
 		printk(KERN_EMERG "JBD: failed to create handle cache\n");
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 3595fd432d5b..df36f42e19e1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -171,13 +171,15 @@ int __init jbd2_journal_init_revoke_caches(void)
 {
 	jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
 					   sizeof(struct jbd2_revoke_record_s),
-					   0, SLAB_HWCACHE_ALIGN, NULL);
+					   0,
+					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+					   NULL);
 	if (jbd2_revoke_record_cache == 0)
 		return -ENOMEM;
 
 	jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
 					   sizeof(struct jbd2_revoke_table_s),
-					   0, 0, NULL);
+					   0, SLAB_TEMPORARY, NULL);
 	if (jbd2_revoke_table_cache == 0) {
 		kmem_cache_destroy(jbd2_revoke_record_cache);
 		jbd2_revoke_record_cache = NULL;
-- 
cgit v1.2.3


From db857da3369cd4eb6a28be1cce89d33162caa4a0 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: jbd2: Use round-jiffies() function for the "5 second" ext4/jbd2
 wakeup

While "every 5 seconds" doesn't sound as a problem, there can be many
of these (and these timers do add up over all the kernel).  The "5
second" wakeup isn't really timing sensitive; in addition even with
rounding it'll still happen every 5 seconds (with the exception of the
very first time, which is likely to be rounded up to somewhere closer
to 6 seconds)

(Ported from similar JBD patch made by Arjan van de Ven to
fs/jbd/transaction.c)

Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 70b3199e69dc..0c8adaba0c0b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -54,7 +54,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = transaction->t_expires;
+	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
-- 
cgit v1.2.3


From 4019191be7316ed4a39e1c1c2b623baa7dc6c843 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Jan 2008 23:58:27 -0500
Subject: jbd2: sparse pointer use of zero as null

Get rid of sparse related warnings from places that use integer as NULL
pointer.  (Ported from upstream ext3/jbd changes.)

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 0c8adaba0c0b..b9b0b6f899b9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1182,7 +1182,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	}
 
 	/* That test should have eliminated the following case: */
-	J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
 
 	JBUFFER_TRACE(jh, "file as BJ_Metadata");
 	spin_lock(&journal->j_list_lock);
@@ -1532,7 +1532,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 
 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
 	if (jh->b_jlist != BJ_None)
-		J_ASSERT_JH(jh, transaction != 0);
+		J_ASSERT_JH(jh, transaction != NULL);
 
 	switch (jh->b_jlist) {
 	case BJ_None:
@@ -1601,11 +1601,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 	if (buffer_locked(bh) || buffer_dirty(bh))
 		goto out;
 
-	if (jh->b_next_transaction != 0)
+	if (jh->b_next_transaction != NULL)
 		goto out;
 
 	spin_lock(&journal->j_list_lock);
-	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+	if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
 		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
 			/* A written-back ordered data buffer */
 			JBUFFER_TRACE(jh, "release data");
@@ -1613,7 +1613,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 			jbd2_journal_remove_journal_head(bh);
 			__brelse(bh);
 		}
-	} else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+	} else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
 		/* written-back checkpointed metadata buffer */
 		if (jh->b_jlist == BJ_None) {
 			JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1973,7 +1973,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 
 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
 	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
-				jh->b_transaction == 0);
+				jh->b_transaction == NULL);
 
 	if (jh->b_transaction && jh->b_jlist == jlist)
 		return;
-- 
cgit v1.2.3


From 9e97198dbf318be7958b57900d05b37c7e09ad7c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 29 Jan 2008 21:05:57 +0100
Subject: splice: fix problem with atime not being updated

A bug report on nfsd states that since it was switched to use
splice instead of sendfile, the atime was no longer being updated
on the input file. do_generic_mapping_read() does this when accessing
the file; make splice do it for the direct splice handler.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 0a0b79b01d05..1577a7391d23 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1031,7 +1031,11 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 			goto out_release;
 	}
 
+done:
 	pipe->nrbufs = pipe->curbuf = 0;
+	if (bytes > 0)
+		file_accessed(in);
+
 	return bytes;
 
 out_release:
@@ -1047,16 +1051,11 @@ out_release:
 			buf->ops = NULL;
 		}
 	}
-	pipe->nrbufs = pipe->curbuf = 0;
-
-	/*
-	 * If we transferred some data, return the number of bytes:
-	 */
-	if (bytes > 0)
-		return bytes;
 
-	return ret;
+	if (!bytes)
+		bytes = ret;
 
+	goto done;
 }
 EXPORT_SYMBOL(splice_direct_to_actor);
 
-- 
cgit v1.2.3


From 6bd8fedaa16da1e24f38712ee759950d8c5f4f09 Mon Sep 17 00:00:00 2001
From: Lon Hohberger <lhh@redhat.com>
Date: Thu, 25 Oct 2007 18:51:54 -0400
Subject: dlm: bind connections from known local address when using TCP

A common problem occurs when multiple IP addresses within the same
subnet are assigned to the same NIC.  If we make a connection attempt to
another address on the same subnet as one of those addresses, the
connection attempt will not necessarily be routed from the address we
want.

In the case of the DLM, the other nodes will quickly drop the connection
attempt, causing problems.

This patch makes the DLM bind to the local address it acquired from the
cluster manager when using TCP prior to making a connection, obviating
the need for administrators to "fix" their systems or use clever routing
tricks.

Signed-off-by: Lon Hohberger <lhh@redhat.com>
Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e9923ca9c2d9..57728448f1b9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con)
 static void tcp_connect_to_sock(struct connection *con)
 {
 	int result = -EHOSTUNREACH;
-	struct sockaddr_storage saddr;
+	struct sockaddr_storage saddr, src_addr;
 	int addr_len;
 	struct socket *sock;
 
@@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con)
 	con->connect_action = tcp_connect_to_sock;
 	add_sock(sock, con);
 
+	/* Bind to our cluster-known address connecting to avoid
+	   routing problems */
+	memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+	make_sockaddr(&src_addr, 0, &addr_len);
+	result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
+				 addr_len);
+	if (result < 0) {
+		log_print("could not bind for connect: %d", result);
+		/* This *may* not indicate a critical error */
+	}
+
 	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
 	log_print("connecting to %d", con->nodeid);
-- 
cgit v1.2.3


From e028398da7615dd3a795505ddf7942506bbb49bd Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 3 Nov 2007 01:04:30 +0100
Subject: dlm: proper prototypes

This patch adds a proper prototype for some functions in
fs/dlm/dlm_internal.h

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h | 16 ++++++++++++++++
 fs/dlm/lock.c         |  1 -
 fs/dlm/lockspace.c    |  8 --------
 fs/dlm/main.c         | 10 ----------
 4 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d2fc2384c3be..ec61bbaf25df 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
 	return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
 }
 
+int dlm_netlink_init(void);
+void dlm_netlink_exit(void);
+void dlm_timeout_warn(struct dlm_lkb *lkb);
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
 #endif				/* __DLM_INTERNAL_DOT_H__ */
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 3915b8e14146..7bc6ad9299a2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 static int receive_extralen(struct dlm_message *ms);
 static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
 static void del_timeout(struct dlm_lkb *lkb);
-void dlm_timeout_warn(struct dlm_lkb *lkb);
 
 /*
  * Lock compatibilty matrix - thanks Steve
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5c108c49cb8c..a0de1cbc603d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -24,14 +24,6 @@
 #include "recover.h"
 #include "requestqueue.h"
 
-#ifdef CONFIG_DLM_DEBUG
-int dlm_create_debug_file(struct dlm_ls *ls);
-void dlm_delete_debug_file(struct dlm_ls *ls);
-#else
-static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
-static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
-#endif
-
 static int			ls_count;
 static struct mutex		ls_lock;
 static struct list_head		lslist;
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index eca2907f2386..58487fb95a4c 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -18,16 +18,6 @@
 #include "memory.h"
 #include "config.h"
 
-#ifdef CONFIG_DLM_DEBUG
-int dlm_register_debugfs(void);
-void dlm_unregister_debugfs(void);
-#else
-static inline int dlm_register_debugfs(void) { return 0; }
-static inline void dlm_unregister_debugfs(void) { }
-#endif
-int dlm_netlink_init(void);
-void dlm_netlink_exit(void);
-
 static int __init init_dlm(void)
 {
 	int error;
-- 
cgit v1.2.3


From 11b2498ba7c88343d91630d679c8f2aeb8d57c48 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Nov 2007 09:06:06 -0600
Subject: dlm: don't print common non-errors

Change log_error() to log_debug() for conditions that can occur in
large numbers in normal operation.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 7bc6ad9299a2..63fe74df97c2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -4258,7 +4258,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
 	put_rsb(r);
  out:
 	if (error)
-		log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+		log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
 	rl->rl_result = error;
 	return error;
 }
-- 
cgit v1.2.3


From 52bda2b5bab87c388848bbc0f4d28d04858d5a7d Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Nov 2007 09:06:49 -0600
Subject: dlm: use dlm prefix on alloc and free functions

The dlm functions in memory.c should use the dlm_ prefix.  Also, use
kzalloc/kfree directly for dlm_direntry's, removing the wrapper functions.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dir.c       | 10 +++++-----
 fs/dlm/lock.c      | 26 +++++++++++++-------------
 fs/dlm/lockspace.c |  8 ++++----
 fs/dlm/memory.c    | 32 ++++++++------------------------
 fs/dlm/memory.h    | 16 +++++++---------
 fs/dlm/recover.c   |  4 ++--
 6 files changed, 39 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 46754553fdcc..600bb1d1a9b6 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
 	spin_unlock(&ls->ls_recover_list_lock);
 
 	if (!found)
-		de = allocate_direntry(ls, len);
+		de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
 	return de;
 }
 
@@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls)
 		de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
 				list);
 		list_del(&de->list);
-		free_direntry(de);
+		kfree(de);
 	}
 	spin_unlock(&ls->ls_recover_list_lock);
 }
@@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
 	}
 
 	list_del(&de->list);
-	free_direntry(de);
+	kfree(de);
  out:
 	write_unlock(&ls->ls_dirtbl[bucket].lock);
 }
@@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 
 	write_unlock(&ls->ls_dirtbl[bucket].lock);
 
-	de = allocate_direntry(ls, namelen);
+	de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
 	if (!de)
 		return -ENOMEM;
 
@@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 	write_lock(&ls->ls_dirtbl[bucket].lock);
 	tmp = search_bucket(ls, name, namelen, bucket);
 	if (tmp) {
-		free_direntry(de);
+		kfree(de);
 		de = tmp;
 	} else {
 		list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 63fe74df97c2..ddb46281f34d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -334,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
 {
 	struct dlm_rsb *r;
 
-	r = allocate_rsb(ls, len);
+	r = dlm_allocate_rsb(ls, len);
 	if (!r)
 		return NULL;
 
@@ -477,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
 	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
 	if (!error) {
 		write_unlock(&ls->ls_rsbtbl[bucket].lock);
-		free_rsb(r);
+		dlm_free_rsb(r);
 		r = tmp;
 		goto out;
 	}
@@ -518,7 +518,7 @@ static void toss_rsb(struct kref *kref)
 	list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
 	r->res_toss_time = jiffies;
 	if (r->res_lvbptr) {
-		free_lvb(r->res_lvbptr);
+		dlm_free_lvb(r->res_lvbptr);
 		r->res_lvbptr = NULL;
 	}
 }
@@ -588,7 +588,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	uint32_t lkid = 0;
 	uint16_t bucket;
 
-	lkb = allocate_lkb(ls);
+	lkb = dlm_allocate_lkb(ls);
 	if (!lkb)
 		return -ENOMEM;
 
@@ -682,8 +682,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
 
 		/* for local/process lkbs, lvbptr points to caller's lksb */
 		if (lkb->lkb_lvbptr && is_master_copy(lkb))
-			free_lvb(lkb->lkb_lvbptr);
-		free_lkb(lkb);
+			dlm_free_lvb(lkb->lkb_lvbptr);
+		dlm_free_lkb(lkb);
 		return 1;
 	} else {
 		write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -987,7 +987,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
 
 			if (is_master(r))
 				dir_remove(r);
-			free_rsb(r);
+			dlm_free_rsb(r);
 			count++;
 		} else {
 			write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1170,7 +1170,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 			return;
 
 		if (!r->res_lvbptr)
-			r->res_lvbptr = allocate_lvb(r->res_ls);
+			r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
 
 		if (!r->res_lvbptr)
 			return;
@@ -1202,7 +1202,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		return;
 
 	if (!r->res_lvbptr)
-		r->res_lvbptr = allocate_lvb(r->res_ls);
+		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
 
 	if (!r->res_lvbptr)
 		return;
@@ -2985,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
 
 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
 		if (!lkb->lkb_lvbptr)
-			lkb->lkb_lvbptr = allocate_lvb(ls);
+			lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
 		if (!lkb->lkb_lvbptr)
 			return -ENOMEM;
 		len = receive_extralen(ms);
@@ -3009,7 +3009,7 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 
 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
 		/* lkb was just created so there won't be an lvb yet */
-		lkb->lkb_lvbptr = allocate_lvb(ls);
+		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
 		if (!lkb->lkb_lvbptr)
 			return -ENOMEM;
 	}
@@ -4183,7 +4183,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
 
 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
-		lkb->lkb_lvbptr = allocate_lvb(ls);
+		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
 		if (!lkb->lkb_lvbptr)
 			return -ENOMEM;
 		lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4341,7 +4341,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		}
 	}
 
-	/* After ua is attached to lkb it will be freed by free_lkb().
+	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
 	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
 	   lock and that lkb_astparam is the dlm_user_args structure. */
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a0de1cbc603d..b180fdc51085 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -676,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 			dlm_del_ast(lkb);
 
 			if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
-				free_lvb(lkb->lkb_lvbptr);
+				dlm_free_lvb(lkb->lkb_lvbptr);
 
-			free_lkb(lkb);
+			dlm_free_lkb(lkb);
 		}
 	}
 	dlm_astd_resume();
@@ -696,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 					 res_hashchain);
 
 			list_del(&rsb->res_hashchain);
-			free_rsb(rsb);
+			dlm_free_rsb(rsb);
 		}
 
 		head = &ls->ls_rsbtbl[i].toss;
@@ -704,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 			rsb = list_entry(head->next, struct dlm_rsb,
 					 res_hashchain);
 			list_del(&rsb->res_hashchain);
-			free_rsb(rsb);
+			dlm_free_rsb(rsb);
 		}
 	}
 
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ecf0e5cb2035..f7783867491a 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@ void dlm_memory_exit(void)
 		kmem_cache_destroy(lkb_cache);
 }
 
-char *allocate_lvb(struct dlm_ls *ls)
+char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
 	char *p;
 
@@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls)
 	return p;
 }
 
-void free_lvb(char *p)
+void dlm_free_lvb(char *p)
 {
 	kfree(p);
 }
@@ -51,7 +51,7 @@ void free_lvb(char *p)
 /* FIXME: have some minimal space built-in to rsb for the name and
    kmalloc a separate name if needed, like dentries are done */
 
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 {
 	struct dlm_rsb *r;
 
@@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
 	return r;
 }
 
-void free_rsb(struct dlm_rsb *r)
+void dlm_free_rsb(struct dlm_rsb *r)
 {
 	if (r->res_lvbptr)
-		free_lvb(r->res_lvbptr);
+		dlm_free_lvb(r->res_lvbptr);
 	kfree(r);
 }
 
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 
@@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
 	return lkb;
 }
 
-void free_lkb(struct dlm_lkb *lkb)
+void dlm_free_lkb(struct dlm_lkb *lkb)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		struct dlm_user_args *ua;
@@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb)
 	kmem_cache_free(lkb_cache, lkb);
 }
 
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
-{
-	struct dlm_direntry *de;
-
-	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
-		   printk("namelen = %d\n", namelen););
-
-	de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
-	return de;
-}
-
-void free_direntry(struct dlm_direntry *de)
-{
-	kfree(de);
-}
-
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 6ead158ccc5c..485fb29143bd 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -16,14 +16,12 @@
 
 int dlm_memory_init(void);
 void dlm_memory_exit(void);
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
-void free_rsb(struct dlm_rsb *r);
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
-void free_lkb(struct dlm_lkb *l);
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
-void free_direntry(struct dlm_direntry *de);
-char *allocate_lvb(struct dlm_ls *ls);
-void free_lvb(char *l);
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
+void dlm_free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
+void dlm_free_lkb(struct dlm_lkb *l);
+char *dlm_allocate_lvb(struct dlm_ls *ls);
+void dlm_free_lvb(char *l);
 
 #endif		/* __MEMORY_DOT_H__ */
 
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index c2cc7694cd16..2f9d9a30df97 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r)
 		goto out;
 
 	if (!r->res_lvbptr) {
-		r->res_lvbptr = allocate_lvb(r->res_ls);
+		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
 		if (!r->res_lvbptr)
 			goto out;
 	}
@@ -760,7 +760,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
 		list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
 					 res_hashchain) {
 			list_del(&r->res_hashchain);
-			free_rsb(r);
+			dlm_free_rsb(r);
 		}
 		write_unlock(&ls->ls_rsbtbl[i].lock);
 	}
-- 
cgit v1.2.3


From 39bd4177ddbeb4c86e854d3d5c4a6a26088e601e Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Wed, 9 Jan 2008 15:06:27 +0000
Subject: dlm: close othercons

This patch addresses a problem introduced with the last round of
lowcomms patches where the 'othercon' connections do not get freed when
the DLM shuts down.

This results in the error message
"slab error in kmem_cache_destroy(): cache `dlm_conn': Can't free all
objects"

and the DLM cannot be restarted without a system reboot.

See bz#428119

Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 57728448f1b9..7c1e5e5cccd8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1437,6 +1437,8 @@ void dlm_lowcomms_stop(void)
 		con = __nodeid2con(i, 0);
 		if (con) {
 			close_connection(con, true);
+			if (con->othercon)
+				kmem_cache_free(con_cache, con->othercon);
 			kmem_cache_free(con_cache, con);
 		}
 	}
-- 
cgit v1.2.3


From fccca7fc6aab4e6b519e2d606ef34632e4f50e33 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 26 Jan 2008 17:37:47 -0500
Subject: NFS: Fix a sillyrename race...

Ensure that readdir revalidates its data cache after blocking on
sillyrename.

Also fix a typo in nfs_do_call_unlink(): swap the ^= for an |=. The result
is the same, since we've already checked that the flag is unset, but it
makes the code more readable.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c    | 11 +++++------
 fs/nfs/unlink.c |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f697b5c74b7c..d9abdb1d6a2a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -537,12 +537,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	lock_kernel();
 
-	res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
-	if (res < 0) {
-		unlock_kernel();
-		return res;
-	}
-
 	/*
 	 * filp->f_pos points to the dirent entry number.
 	 * *desc->dir_cookie has the cookie for the next entry. We have
@@ -564,6 +558,10 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	desc->entry = &my_entry;
 
 	nfs_block_sillyrename(dentry);
+	res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
+	if (res < 0)
+		goto out;
+
 	while(!desc->entry->eof) {
 		res = readdir_search_pagecache(desc);
 
@@ -594,6 +592,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			break;
 		}
 	}
+out:
 	nfs_unblock_sillyrename(dentry);
 	unlock_kernel();
 	if (res > 0)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 233ad38161f9..c5fa6d8001f1 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -138,7 +138,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		spin_lock(&alias->d_lock);
 		if (!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
 			alias->d_fsdata = data;
-			alias->d_flags ^= DCACHE_NFSFS_RENAMED;
+			alias->d_flags |= DCACHE_NFSFS_RENAMED;
 			ret = 1;
 		}
 		spin_unlock(&alias->d_lock);
-- 
cgit v1.2.3


From 609005c319bc6062b95ed82e132884ed7e22cdb9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Jan 2008 19:42:59 -0500
Subject: NFS: Sillyrename: in the case of a race, check aliases are really
 positive

In nfs_do_call_unlink() we check that we haven't raced, and that lookup()
hasn't created an aliased dentry to our sillydeleted dentry. If somebody
has deleted the file on the server and the lookup() resulted in a negative
dentry, then ignore...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/unlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index c5fa6d8001f1..431981d0265f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -130,13 +130,15 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	alias = d_lookup(parent, &data->args.name);
 	if (alias != NULL) {
 		int ret = 0;
+
 		/*
 		 * Hey, we raced with lookup... See if we need to transfer
 		 * the sillyrename information to the aliased dentry.
 		 */
 		nfs_free_dname(data);
 		spin_lock(&alias->d_lock);
-		if (!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+		if (alias->d_inode != NULL &&
+		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
 			alias->d_fsdata = data;
 			alias->d_flags |= DCACHE_NFSFS_RENAMED;
 			ret = 1;
-- 
cgit v1.2.3


From d45b9d8baf41acb177abbbe6746b1dea094b8a28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Jan 2008 19:43:18 -0500
Subject: NFS: Handle -ENOENT errors in unlink()/rmdir()/rename()

If the server returns an ENOENT error, we still need to do a d_delete() in
order to ensure that the dentry is deleted.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d9abdb1d6a2a..06f26d40b4fe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1267,6 +1267,12 @@ out_err:
 	return error;
 }
 
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+	if (dentry->d_inode != NULL && !d_unhashed(dentry))
+		d_delete(dentry);
+}
+
 static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
@@ -1279,6 +1285,8 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* Ensure the VFS deletes this inode */
 	if (error == 0 && dentry->d_inode != NULL)
 		clear_nlink(dentry->d_inode);
+	else if (error == -ENOENT)
+		nfs_dentry_handle_enoent(dentry);
 	unlock_kernel();
 
 	return error;
@@ -1385,6 +1393,8 @@ static int nfs_safe_remove(struct dentry *dentry)
 		nfs_mark_for_revalidate(inode);
 	} else
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+	if (error == -ENOENT)
+		nfs_dentry_handle_enoent(dentry);
 out:
 	return error;
 }
@@ -1421,7 +1431,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	error = nfs_safe_remove(dentry);
-	if (!error) {
+	if (!error || error == -ENOENT) {
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	} else if (need_rehash)
 		d_rehash(dentry);
@@ -1634,7 +1644,8 @@ out:
 		d_move(old_dentry, new_dentry);
 		nfs_set_verifier(new_dentry,
 					nfs_save_change_attribute(new_dir));
-	}
+	} else if (error == -ENOENT)
+		nfs_dentry_handle_enoent(old_dentry);
 
 	/* new dentry created? */
 	if (dentry)
-- 
cgit v1.2.3


From 77f111929d024165e736e919187cff017279bebe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 28 Jan 2008 19:43:19 -0500
Subject: NFS: Ensure that we eject stale inodes as soon as possible

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06f26d40b4fe..32c666c612a1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -826,6 +826,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		dentry->d_flags);
 
+	/* Unhash any dentry with a stale inode */
+	if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
+		return 1;
+
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		/* Unhash it, so that ->d_iput() would be called */
 		return 1;
-- 
cgit v1.2.3


From 8b1f9ee56e21e505a3d5d3e33f823006d1abdbaf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Jan 2008 17:13:06 -0500
Subject: NFS: Optimise nfs_vm_page_mkwrite()

The current model locks the page twice for no good reason. Optimise by
inlining the parts of nfs_write_begin()/nfs_write_end() that we care about.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index b3bb89f7d5d2..4560fc2ddb4a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -392,35 +392,27 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	struct file *filp = vma->vm_file;
 	unsigned pagelen;
 	int ret = -EINVAL;
-	void *fsdata;
 	struct address_space *mapping;
-	loff_t offset;
 
 	lock_page(page);
 	mapping = page->mapping;
-	if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) {
-		unlock_page(page);
-		return -EINVAL;
-	}
+	if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+		goto out_unlock;
+
+	ret = 0;
 	pagelen = nfs_page_length(page);
-	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	unlock_page(page);
+	if (pagelen == 0)
+		goto out_unlock;
 
-	/*
-	 * we can use mapping after releasing the page lock, because:
-	 * we hold mmap_sem on the fault path, which should pin the vma
-	 * which should pin the file, which pins the dentry which should
-	 * hold a reference on inode.
-	 */
+	ret = nfs_flush_incompatible(filp, page);
+	if (ret != 0)
+		goto out_unlock;
 
-	if (pagelen) {
-		struct page *page2 = NULL;
-		ret = nfs_write_begin(filp, mapping, offset, pagelen,
-			       	0, &page2, &fsdata);
-		if (!ret)
-			ret = nfs_write_end(filp, mapping, offset, pagelen,
-				       	pagelen, page2, fsdata);
-	}
+	ret = nfs_updatepage(filp, page, 0, pagelen);
+	if (ret == 0)
+		ret = pagelen;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
 
-- 
cgit v1.2.3


From acee478afc6ff7e1b8852d9a4dca1ff36021414d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Jan 2008 17:13:07 -0500
Subject: NFS: Clean up the write request locking.

Ensure that we set/clear NFS_PAGE_TAG_LOCKED when the nfs_page is hashed.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c | 13 ++++++++-----
 fs/nfs/write.c    | 16 +++++++---------
 2 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 345bb9b4765b..3b3dbb94393d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -111,13 +111,14 @@ void nfs_unlock_request(struct nfs_page *req)
  * nfs_set_page_tag_locked - Tag a request as locked
  * @req:
  */
-static int nfs_set_page_tag_locked(struct nfs_page *req)
+int nfs_set_page_tag_locked(struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
 
-	if (!nfs_lock_request(req))
+	if (!nfs_lock_request_dontget(req))
 		return 0;
-	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+	if (req->wb_page != NULL)
+		radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
@@ -132,9 +133,10 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
 	if (req->wb_page != NULL) {
 		spin_lock(&inode->i_lock);
 		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+		nfs_unlock_request(req);
 		spin_unlock(&inode->i_lock);
-	}
-	nfs_unlock_request(req);
+	} else
+		nfs_unlock_request(req);
 }
 
 /**
@@ -421,6 +423,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
 				goto out;
 			idx_start = req->wb_index + 1;
 			if (nfs_set_page_tag_locked(req)) {
+				kref_get(&req->wb_kref);
 				nfs_list_remove_request(req);
 				radix_tree_tag_clear(&nfsi->nfs_page_tree,
 						req->wb_index, tag);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 51cc1bd6a116..092e79c6d962 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -196,7 +196,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	}
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
-	nfs_unlock_request(req);
+	nfs_clear_page_tag_locked(req);
 	return 0;
 }
 
@@ -252,7 +252,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 				struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
 	int ret;
 
@@ -263,10 +262,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 			spin_unlock(&inode->i_lock);
 			return 0;
 		}
-		if (nfs_lock_request_dontget(req))
+		if (nfs_set_page_tag_locked(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
-		 *	 then the call to nfs_lock_request_dontget() will always
+		 *	 then the call to nfs_set_page_tag_locked() will always
 		 *	 succeed provided that someone hasn't already marked the
 		 *	 request as dirty (in which case we don't care).
 		 */
@@ -280,7 +279,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 	if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
 		/* This request is marked for commit */
 		spin_unlock(&inode->i_lock);
-		nfs_unlock_request(req);
+		nfs_clear_page_tag_locked(req);
 		nfs_pageio_complete(pgio);
 		return 0;
 	}
@@ -288,8 +287,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		spin_unlock(&inode->i_lock);
 		BUG();
 	}
-	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
-			NFS_PAGE_TAG_LOCKED);
 	spin_unlock(&inode->i_lock);
 	nfs_pageio_add_request(pgio, req);
 	return 0;
@@ -381,6 +378,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	set_page_private(req->wb_page, (unsigned long)req);
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
+	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 0;
 }
 
@@ -596,7 +594,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 		spin_lock(&inode->i_lock);
 		req = nfs_page_find_request_locked(page);
 		if (req) {
-			if (!nfs_lock_request_dontget(req)) {
+			if (!nfs_set_page_tag_locked(req)) {
 				int error;
 
 				spin_unlock(&inode->i_lock);
@@ -646,7 +644,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
 	    || req->wb_page != page
 	    || !nfs_dirty_request(req)
 	    || offset > rqend || end < req->wb_offset) {
-		nfs_unlock_request(req);
+		nfs_clear_page_tag_locked(req);
 		return ERR_PTR(-EBUSY);
 	}
 
-- 
cgit v1.2.3


From 2f74c0a05612b9c2014b5b67833dba9b9f523948 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 8 Jan 2008 17:56:07 -0500
Subject: NFSv4: Clean up the OPEN/CLOSE serialisation code

Reduce the time spent locking the rpc_sequence structure by queuing the
nfs_seqid only when we are ready to take the lock (when calling
nfs_wait_on_sequence).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c  | 30 +++++++++++-------------------
 fs/nfs/nfs4state.c | 32 ++++++++++++++++----------------
 2 files changed, 27 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9e2e1c7291db..a51a7537f3f6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3331,15 +3331,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
-	if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) {
-		p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
-		if (p->arg.open_seqid == NULL)
-			goto out_free;
-
-	}
+	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+	if (p->arg.open_seqid == NULL)
+		goto out_free;
 	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
 	if (p->arg.lock_seqid == NULL)
-		goto out_free;
+		goto out_free_seqid;
 	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_id.id;
@@ -3348,9 +3345,9 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->ctx = get_nfs_open_context(ctx);
 	memcpy(&p->fl, fl, sizeof(p->fl));
 	return p;
+out_free_seqid:
+	nfs_free_seqid(p->arg.open_seqid);
 out_free:
-	if (p->arg.open_seqid != NULL)
-		nfs_free_seqid(p->arg.open_seqid);
 	kfree(p);
 	return NULL;
 }
@@ -3368,20 +3365,16 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	};
 
 	dprintk("%s: begin!\n", __FUNCTION__);
+	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
+		return;
 	/* Do we need to do an open_to_lock_owner? */
 	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
 		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
 			return;
 		data->arg.open_stateid = &state->stateid;
 		data->arg.new_lock_owner = 1;
-		/* Retest in case we raced... */
-		if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED))
-			goto do_rpc;
-	}
-	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
-		return;
-	data->arg.new_lock_owner = 0;
-do_rpc:	
+	} else
+		data->arg.new_lock_owner = 0;
 	data->timestamp = jiffies;
 	rpc_call_setup(task, &msg, 0);
 	dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
@@ -3419,6 +3412,7 @@ static void nfs4_lock_release(void *calldata)
 	struct nfs4_lockdata *data = calldata;
 
 	dprintk("%s: begin!\n", __FUNCTION__);
+	nfs_free_seqid(data->arg.open_seqid);
 	if (data->cancelled != 0) {
 		struct rpc_task *task;
 		task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
@@ -3428,8 +3422,6 @@ static void nfs4_lock_release(void *calldata)
 		dprintk("%s: cancelling lock!\n", __FUNCTION__);
 	} else
 		nfs_free_seqid(data->arg.lock_seqid);
-	if (data->arg.open_seqid != NULL)
-		nfs_free_seqid(data->arg.open_seqid);
 	nfs4_put_lock_state(data->lsp);
 	put_nfs_open_context(data->ctx);
 	kfree(data);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5a39c6f78acf..bf94c6e0a503 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -644,27 +644,26 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 
 struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
 {
-	struct rpc_sequence *sequence = counter->sequence;
 	struct nfs_seqid *new;
 
 	new = kmalloc(sizeof(*new), GFP_KERNEL);
 	if (new != NULL) {
 		new->sequence = counter;
-		spin_lock(&sequence->lock);
-		list_add_tail(&new->list, &sequence->list);
-		spin_unlock(&sequence->lock);
+		INIT_LIST_HEAD(&new->list);
 	}
 	return new;
 }
 
 void nfs_free_seqid(struct nfs_seqid *seqid)
 {
-	struct rpc_sequence *sequence = seqid->sequence->sequence;
+	if (!list_empty(&seqid->list)) {
+		struct rpc_sequence *sequence = seqid->sequence->sequence;
 
-	spin_lock(&sequence->lock);
-	list_del(&seqid->list);
-	spin_unlock(&sequence->lock);
-	rpc_wake_up(&sequence->wait);
+		spin_lock(&sequence->lock);
+		list_del(&seqid->list);
+		spin_unlock(&sequence->lock);
+		rpc_wake_up(&sequence->wait);
+	}
 	kfree(seqid);
 }
 
@@ -675,6 +674,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
  */
 static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 {
+	BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid);
 	switch (status) {
 		case 0:
 			break;
@@ -726,15 +726,15 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
 	struct rpc_sequence *sequence = seqid->sequence->sequence;
 	int status = 0;
 
-	if (sequence->list.next == &seqid->list)
-		goto out;
 	spin_lock(&sequence->lock);
-	if (sequence->list.next != &seqid->list) {
-		rpc_sleep_on(&sequence->wait, task, NULL, NULL);
-		status = -EAGAIN;
-	}
+	if (list_empty(&seqid->list))
+		list_add_tail(&seqid->list, &sequence->list);
+	if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
+		goto unlock;
+	rpc_sleep_on(&sequence->wait, task, NULL, NULL);
+	status = -EAGAIN;
+unlock:
 	spin_unlock(&sequence->lock);
-out:
 	return status;
 }
 
-- 
cgit v1.2.3


From ef818a28fac9bd214e676986d8301db0582b92a9 Mon Sep 17 00:00:00 2001
From: Steve Dickson <SteveD@redhat.com>
Date: Thu, 8 Nov 2007 04:05:04 -0500
Subject: NFS: Stop sillyname renames and unmounts from racing

Added an active/deactive mechanism to the nfs_server structure
allowing async operations to hold off umount until the
operations are done.

Signed-off-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   |  3 +++
 fs/nfs/internal.h |  2 ++
 fs/nfs/super.c    | 24 ++++++++++++++++++++++++
 fs/nfs/unlink.c   |  4 ++++
 4 files changed, 33 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a6f625497612..c3740f5ab978 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -729,6 +729,9 @@ static struct nfs_server *nfs_alloc_server(void)
 	INIT_LIST_HEAD(&server->client_link);
 	INIT_LIST_HEAD(&server->master_link);
 
+	init_waitqueue_head(&server->active_wq);
+	atomic_set(&server->active, 0);
+
 	server->io_stats = nfs_alloc_iostats();
 	if (!server->io_stats) {
 		kfree(server);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f3acf48412be..75793794aefe 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -160,6 +160,8 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
+extern void nfs_sb_active(struct nfs_server *server);
+extern void nfs_sb_deactive(struct nfs_server *server);
 
 /* namespace.c */
 extern char *nfs_path(const char *base,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b0c72a072ff..fda1635dd133 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -202,6 +202,7 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
 static int nfs_xdev_get_sb(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
+static void nfs_put_super(struct super_block *);
 
 static struct file_system_type nfs_fs_type = {
 	.owner		= THIS_MODULE,
@@ -223,6 +224,7 @@ static const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
+	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -325,6 +327,28 @@ void __exit unregister_nfs_fs(void)
 	unregister_filesystem(&nfs_fs_type);
 }
 
+void nfs_sb_active(struct nfs_server *server)
+{
+	atomic_inc(&server->active);	/* balanced by nfs_sb_deactive() */
+}
+
+void nfs_sb_deactive(struct nfs_server *server)
+{
+	if (atomic_dec_and_test(&server->active))	/* count hit zero */
+		wake_up(&server->active_wq);	/* wake nfs_put_super() waiter */
+}
+
+static void nfs_put_super(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	/*
+	 * Sleep until server->active reaches zero, i.e. until every
+	 * nfs_sb_active() has been balanced by nfs_sb_deactive(), and
+	 * only then let the unmount continue.
+	 */
+	wait_event(server->active_wq, atomic_read(&server->active) == 0);
+}
+
 /*
  * Deliver file system statistics to userspace
  */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 431981d0265f..8e5428e0b86f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,8 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 
+#include "internal.h"
+
 struct nfs_unlinkdata {
 	struct hlist_node list;
 	struct nfs_removeargs args;
@@ -113,6 +115,7 @@ static void nfs_async_unlink_release(void *calldata)
 	struct nfs_unlinkdata	*data = calldata;
 
 	nfs_dec_sillycount(data->dir);
+	nfs_sb_deactive(NFS_SERVER(data->dir));
 	nfs_free_unlinkdata(data);
 }
 
@@ -153,6 +156,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		nfs_dec_sillycount(dir);
 		return 0;
 	}
+	nfs_sb_active(NFS_SERVER(dir));
 	data->args.fh = NFS_FH(dir);
 	nfs_fattr_init(&data->res.dir_attr);
 
-- 
cgit v1.2.3


From 84115e1cd4a3614c4e566d4cce31381dce3dbef9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:39:59 -0400
Subject: SUNRPC: Cleanup of rpc_task initialisation

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 36 ++++++++++++++++++++++++++++--------
 fs/nfs/read.c   | 15 ++++++++++-----
 fs/nfs/write.c  | 30 +++++++++++++++++++-----------
 3 files changed, 57 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c9d16b4f80c..f9f5fc13dc7d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,6 +272,11 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.callback_ops = &nfs_read_direct_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
@@ -322,8 +327,8 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.eof = 0;
 		data->res.count = bytes;
 
-		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-				&nfs_read_direct_ops, data);
+		task_setup_data.callback_data = data;
+		rpc_init_task(&data->task, &task_setup_data);
 		NFS_PROTO(inode)->read_setup(data);
 
 		data->task.tk_cookie = (unsigned long) inode;
@@ -431,6 +436,11 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	struct inode *inode = dreq->inode;
 	struct list_head *p;
 	struct nfs_write_data *data;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.callback_ops = &nfs_write_direct_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 
 	dreq->count = 0;
 	get_dreq(dreq);
@@ -451,8 +461,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 		 * Reuse data->task; data->args should not have changed
 		 * since the original request was sent.
 		 */
-		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-				&nfs_write_direct_ops, data);
+		task_setup_data.callback_data = data;
+		rpc_init_task(&data->task, &task_setup_data);
 		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
 
 		data->task.tk_priority = RPC_PRIORITY_NORMAL;
@@ -504,6 +514,12 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_write_data *data = dreq->commit_data;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(dreq->inode),
+		.callback_ops = &nfs_commit_direct_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
 
 	data->inode = dreq->inode;
 	data->cred = dreq->ctx->cred;
@@ -515,8 +531,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
 
-	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
-				&nfs_commit_direct_ops, data);
+	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(data->inode)->commit_setup(data, 0);
 
 	data->task.tk_priority = RPC_PRIORITY_NORMAL;
@@ -641,6 +656,11 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 	struct inode *inode = ctx->path.dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.callback_ops = &nfs_write_direct_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int pgbase;
 	int result;
@@ -694,8 +714,8 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.count = bytes;
 		data->res.verf = &data->verf;
 
-		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-				&nfs_write_direct_ops, data);
+		task_setup_data.callback_data = data;
+		rpc_init_task(&data->task, &task_setup_data);
 		NFS_PROTO(inode)->write_setup(data, sync);
 
 		data->task.tk_priority = RPC_PRIORITY_NORMAL;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4587a86adaac..c7f0d5ebd451 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -160,11 +160,17 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		const struct rpc_call_ops *call_ops,
 		unsigned int count, unsigned int offset)
 {
-	struct inode		*inode;
-	int flags;
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.callback_ops = call_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC | swap_flags,
+	};
 
 	data->req	  = req;
-	data->inode	  = inode = req->wb_context->path.dentry->d_inode;
+	data->inode	  = inode;
 	data->cred	  = req->wb_context->cred;
 
 	data->args.fh     = NFS_FH(inode);
@@ -180,8 +186,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	nfs_fattr_init(&data->fattr);
 
 	/* Set up the initial task struct. */
-	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
-	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
+	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(inode)->read_setup(data);
 
 	data->task.tk_cookie = (unsigned long)inode;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 092e79c6d962..c4376606f106 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -773,8 +773,14 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		unsigned int count, unsigned int offset,
 		int how)
 {
-	struct inode		*inode;
-	int flags;
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.callback_ops = call_ops,
+		.callback_data = data,
+		.flags = flags,
+	};
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -796,8 +802,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	nfs_fattr_init(&data->fattr);
 
 	/* Set up the initial task struct.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
+	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(inode)->write_setup(data, how);
 
 	data->task.tk_priority = flush_task_priority(how);
@@ -1144,16 +1149,20 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 		struct nfs_write_data *data,
 		int how)
 {
-	struct nfs_page		*first;
-	struct inode		*inode;
-	int flags;
+	struct nfs_page *first = nfs_list_entry(head->next);
+	struct inode *inode = first->wb_context->path.dentry->d_inode;
+	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(inode),
+		.callback_ops = &nfs_commit_ops,
+		.callback_data = data,
+		.flags = flags,
+	};
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
 	list_splice_init(head, &data->pages);
-	first = nfs_list_entry(data->pages.next);
-	inode = first->wb_context->path.dentry->d_inode;
 
 	data->inode	  = inode;
 	data->cred	  = first->wb_context->cred;
@@ -1168,8 +1177,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	nfs_fattr_init(&data->fattr);
 
 	/* Set up the initial task struct.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, &nfs_commit_ops, data);
+	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(inode)->commit_setup(data, how);
 
 	data->task.tk_priority = flush_task_priority(how);
-- 
cgit v1.2.3


From c970aa85e71bd581726c42df843f6f129db275ac Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:39:59 -0400
Subject: SUNRPC: Clean up rpc_run_task

Make it use the new task initialiser structure instead of acting as a
wrapper.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 49 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/nfs/unlink.c   |  9 ++++++++-
 2 files changed, 50 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a51a7537f3f6..ff2c5f83ce87 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -779,12 +779,18 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 {
 	struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
 	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.callback_ops = &nfs4_open_confirm_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
 	int status;
 
 	kref_get(&data->kref);
 	data->rpc_done = 0;
 	data->rpc_status = 0;
-	task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data);
+	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
@@ -908,13 +914,19 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	struct nfs_openargs *o_arg = &data->o_arg;
 	struct nfs_openres *o_res = &data->o_res;
 	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.callback_ops = &nfs4_open_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
 	int status;
 
 	kref_get(&data->kref);
 	data->rpc_done = 0;
 	data->rpc_status = 0;
 	data->cancelled = 0;
-	task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data);
+	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
@@ -1309,6 +1321,11 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	struct nfs4_closedata *calldata;
 	struct nfs4_state_owner *sp = state->owner;
 	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.callback_ops = &nfs4_close_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 	int status = -ENOMEM;
 
 	calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
@@ -1328,7 +1345,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->path.mnt = mntget(path->mnt);
 	calldata->path.dentry = dget(path->dentry);
 
-	task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_close_ops, calldata);
+	task_setup_data.callback_data = calldata;
+	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = 0;
@@ -3027,6 +3045,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	struct nfs4_delegreturndata *data;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.callback_ops = &nfs4_delegreturn_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 	int status;
 
 	data = kmalloc(sizeof(*data), GFP_KERNEL);
@@ -3043,7 +3066,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	data->timestamp = jiffies;
 	data->rpc_status = 0;
 
-	task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data);
+	task_setup_data.callback_data = data;
+	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
@@ -3260,6 +3284,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		struct nfs_seqid *seqid)
 {
 	struct nfs4_unlockdata *data;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+		.callback_ops = &nfs4_locku_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 
 	/* Ensure this is an unlock - when canceling a lock, the
 	 * canceled lock is passed in, and it won't be an unlock.
@@ -3272,7 +3301,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data);
+	task_setup_data.callback_data = data;
+	return rpc_run_task(&task_setup_data);
 }
 
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
@@ -3438,6 +3468,11 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 {
 	struct nfs4_lockdata *data;
 	struct rpc_task *task;
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = NFS_CLIENT(state->inode),
+		.callback_ops = &nfs4_lock_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
 	int ret;
 
 	dprintk("%s: begin!\n", __FUNCTION__);
@@ -3449,8 +3484,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		data->arg.block = 1;
 	if (reclaim != 0)
 		data->arg.reclaim = 1;
-	task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC,
-			&nfs4_lock_ops, data);
+	task_setup_data.callback_data = data;
+	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 	ret = nfs4_wait_for_completion_rpc_task(task);
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8e5428e0b86f..6660d9a53345 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -127,6 +127,11 @@ static const struct rpc_call_ops nfs_unlink_ops = {
 
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
 {
+	struct rpc_task_setup task_setup_data = {
+		.callback_ops = &nfs_unlink_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
 	struct rpc_task *task;
 	struct dentry *alias;
 
@@ -160,7 +165,9 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	data->args.fh = NFS_FH(dir);
 	nfs_fattr_init(&data->res.dir_attr);
 
-	task = rpc_run_task(NFS_CLIENT(dir), RPC_TASK_ASYNC, &nfs_unlink_ops, data);
+	task_setup_data.rpc_client = NFS_CLIENT(dir);
+
+	task = rpc_run_task(&task_setup_data);
 	if (!IS_ERR(task))
 		rpc_put_task(task);
 	return 1;
-- 
cgit v1.2.3


From 3ff7576ddac06c3d07089e241b40826d24bbf1ac Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:40:00 -0400
Subject: SUNRPC: Clean up the initialisation of priority queue scheduling
 info.

We want the default scheduling priority (priority == 0) to remain
RPC_PRIORITY_NORMAL.

Also ensure that the priority wait queue scheduling is per process id
instead of sometimes being per thread, and sometimes being per inode.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 10 ----------
 fs/nfs/read.c   |  2 --
 fs/nfs/write.c  | 12 +++++-------
 3 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f9f5fc13dc7d..5bcc764e501a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -331,8 +331,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		rpc_init_task(&data->task, &task_setup_data);
 		NFS_PROTO(inode)->read_setup(data);
 
-		data->task.tk_cookie = (unsigned long) inode;
-
 		rpc_execute(&data->task);
 
 		dprintk("NFS: %5u initiated direct read call "
@@ -465,9 +463,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 		rpc_init_task(&data->task, &task_setup_data);
 		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
 
-		data->task.tk_priority = RPC_PRIORITY_NORMAL;
-		data->task.tk_cookie = (unsigned long) inode;
-
 		/*
 		 * We're called via an RPC callback, so BKL is already held.
 		 */
@@ -534,8 +529,6 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(data->inode)->commit_setup(data, 0);
 
-	data->task.tk_priority = RPC_PRIORITY_NORMAL;
-	data->task.tk_cookie = (unsigned long)data->inode;
 	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
 	dreq->commit_data = NULL;
 
@@ -718,9 +711,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		rpc_init_task(&data->task, &task_setup_data);
 		NFS_PROTO(inode)->write_setup(data, sync);
 
-		data->task.tk_priority = RPC_PRIORITY_NORMAL;
-		data->task.tk_cookie = (unsigned long) inode;
-
 		rpc_execute(&data->task);
 
 		dprintk("NFS: %5u initiated direct write call "
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c7f0d5ebd451..8f1eb08ccffa 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -189,8 +189,6 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(inode)->read_setup(data);
 
-	data->task.tk_cookie = (unsigned long)inode;
-
 	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 			data->task.tk_pid,
 			inode->i_sb->s_id,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c4376606f106..8d90e90ccd47 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -753,7 +753,7 @@ static void nfs_writepage_release(struct nfs_page *req)
 	nfs_clear_page_tag_locked(req);
 }
 
-static inline int flush_task_priority(int how)
+static int flush_task_priority(int how)
 {
 	switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
 		case FLUSH_HIGHPRI:
@@ -775,11 +775,13 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+	int priority = flush_task_priority(how);
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.flags = flags,
+		.priority = priority,
 	};
 
 	/* Set up the RPC argument and reply structs
@@ -805,9 +807,6 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(inode)->write_setup(data, how);
 
-	data->task.tk_priority = flush_task_priority(how);
-	data->task.tk_cookie = (unsigned long)inode;
-
 	dprintk("NFS: %5u initiated write call "
 		"(req %s/%Ld, %u bytes @ offset %Lu)\n",
 		data->task.tk_pid,
@@ -1152,11 +1151,13 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
 	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+	int priority = flush_task_priority(how);
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.flags = flags,
+		.priority = priority,
 	};
 
 	/* Set up the RPC argument and reply structs
@@ -1180,9 +1181,6 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	rpc_init_task(&data->task, &task_setup_data);
 	NFS_PROTO(inode)->commit_setup(data, how);
 
-	data->task.tk_priority = flush_task_priority(how);
-	data->task.tk_cookie = (unsigned long)inode;
-	
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 }
 
-- 
cgit v1.2.3


From bdc7f021f3a1fade77adf3c2d7f65690566fddfe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:40:00 -0400
Subject: NFS: Clean up the (commit|read|write)_setup() callback routines

Move the common code for setting up the nfs_write_data and nfs_read_data
structures into fs/nfs/read.c, fs/nfs/write.c and fs/nfs/direct.c.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c   | 45 ++++++++++++++++++++++++++++++++++++---------
 fs/nfs/nfs3proc.c | 41 ++++++-----------------------------------
 fs/nfs/nfs4proc.c | 49 ++++++++-----------------------------------------
 fs/nfs/proc.c     | 26 +++++---------------------
 fs/nfs/read.c     | 38 ++++++++++++++++++++------------------
 fs/nfs/write.c    | 51 +++++++++++++++++++++++++++++++++++----------------
 6 files changed, 110 insertions(+), 140 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 5bcc764e501a..244d1bd7002c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,8 +272,12 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
+	struct rpc_message msg = {
+		.rpc_cred = ctx->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs_read_direct_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
@@ -316,7 +320,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 
 		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
-		data->cred = ctx->cred;
+		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
@@ -326,10 +330,12 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
 		data->res.count = bytes;
+		msg.rpc_argp = &data->args;
+		msg.rpc_resp = &data->res;
 
 		task_setup_data.callback_data = data;
+		NFS_PROTO(inode)->read_setup(data, &msg);
 		rpc_init_task(&data->task, &task_setup_data);
-		NFS_PROTO(inode)->read_setup(data);
 
 		rpc_execute(&data->task);
 
@@ -434,6 +440,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	struct inode *inode = dreq->inode;
 	struct list_head *p;
 	struct nfs_write_data *data;
+	struct rpc_message msg = {
+		.rpc_cred = dreq->ctx->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs_write_direct_ops,
@@ -448,6 +457,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 		get_dreq(dreq);
 
+		/* Use stable writes */
+		data->args.stable = NFS_FILE_SYNC;
+
 		/*
 		 * Reset data->res.
 		 */
@@ -460,8 +472,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 		 * since the original request was sent.
 		 */
 		task_setup_data.callback_data = data;
+		msg.rpc_argp = &data->args;
+		msg.rpc_resp = &data->res;
+		NFS_PROTO(inode)->write_setup(data, &msg);
 		rpc_init_task(&data->task, &task_setup_data);
-		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
 
 		/*
 		 * We're called via an RPC callback, so BKL is already held.
@@ -509,15 +523,21 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_write_data *data = dreq->commit_data;
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = dreq->ctx->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(dreq->inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs_commit_direct_ops,
 		.callback_data = data,
 		.flags = RPC_TASK_ASYNC,
 	};
 
 	data->inode = dreq->inode;
-	data->cred = dreq->ctx->cred;
+	data->cred = msg.rpc_cred;
 
 	data->args.fh = NFS_FH(data->inode);
 	data->args.offset = 0;
@@ -526,8 +546,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
 
+	NFS_PROTO(data->inode)->commit_setup(data, &msg);
 	rpc_init_task(&data->task, &task_setup_data);
-	NFS_PROTO(data->inode)->commit_setup(data, 0);
 
 	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
 	dreq->commit_data = NULL;
@@ -649,8 +669,12 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 	struct inode *inode = ctx->path.dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
+	struct rpc_message msg = {
+		.rpc_cred = ctx->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs_write_direct_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
@@ -696,20 +720,23 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 
 		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
-		data->cred = ctx->cred;
+		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
 		data->args.count = bytes;
+		data->args.stable = sync;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
 		data->res.verf = &data->verf;
 
 		task_setup_data.callback_data = data;
+		msg.rpc_argp = &data->args;
+		msg.rpc_resp = &data->res;
+		NFS_PROTO(inode)->write_setup(data, &msg);
 		rpc_init_task(&data->task, &task_setup_data);
-		NFS_PROTO(inode)->write_setup(data, sync);
 
 		rpc_execute(&data->task);
 
@@ -782,7 +809,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	int sync = 0;
+	int sync = NFS_UNSTABLE;
 
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
@@ -790,7 +817,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	nfs_alloc_commit_data(dreq);
 
 	if (dreq->commit_data == NULL || count < wsize)
-		sync = FLUSH_STABLE;
+		sync = NFS_FILE_SYNC;
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4cdc2361a669..e68580ebaa47 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -732,16 +732,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 }
 
-static void nfs3_proc_read_setup(struct nfs_read_data *data)
+static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-	struct rpc_message	msg = {
-		.rpc_proc	= &nfs3_procedures[NFS3PROC_READ],
-		.rpc_argp	= &data->args,
-		.rpc_resp	= &data->res,
-		.rpc_cred	= data->cred,
-	};
-
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
 }
 
 static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -753,24 +746,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs3_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct rpc_message	msg = {
-		.rpc_proc	= &nfs3_procedures[NFS3PROC_WRITE],
-		.rpc_argp	= &data->args,
-		.rpc_resp	= &data->res,
-		.rpc_cred	= data->cred,
-	};
-
-	data->args.stable = NFS_UNSTABLE;
-	if (how & FLUSH_STABLE) {
-		data->args.stable = NFS_FILE_SYNC;
-		if (NFS_I(data->inode)->ncommit)
-			data->args.stable = NFS_DATA_SYNC;
-	}
-
-	/* Finalize the task. */
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
 }
 
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -781,16 +759,9 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct rpc_message	msg = {
-		.rpc_proc	= &nfs3_procedures[NFS3PROC_COMMIT],
-		.rpc_argp	= &data->args,
-		.rpc_resp	= &data->res,
-		.rpc_cred	= data->cred,
-	};
-
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
 }
 
 static int
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff2c5f83ce87..7c0baf23abdc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2432,18 +2432,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 }
 
-static void nfs4_proc_read_setup(struct nfs_read_data *data)
+static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ],
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
-	};
-
 	data->timestamp   = jiffies;
-
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 }
 
 static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2461,33 +2453,15 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs4_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE],
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
-	};
-	struct inode *inode = data->inode;
-	struct nfs_server *server = NFS_SERVER(inode);
-	int stable;
-	
-	if (how & FLUSH_STABLE) {
-		if (!NFS_I(inode)->ncommit)
-			stable = NFS_FILE_SYNC;
-		else
-			stable = NFS_DATA_SYNC;
-	} else
-		stable = NFS_UNSTABLE;
-	data->args.stable = stable;
+	struct nfs_server *server = NFS_SERVER(data->inode);
+
 	data->args.bitmask = server->attr_bitmask;
 	data->res.server = server;
-
 	data->timestamp   = jiffies;
 
-	/* Finalize the task. */
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
 }
 
 static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2502,20 +2476,13 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
-	};	
 	struct nfs_server *server = NFS_SERVER(data->inode);
 	
 	data->args.bitmask = server->attr_bitmask;
 	data->res.server = server;
-
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
 
 /*
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4f80d88e9fee..c9f46a24e75c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -565,16 +565,9 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 }
 
-static void nfs_proc_read_setup(struct nfs_read_data *data)
+static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-	struct rpc_message	msg = {
-		.rpc_proc	= &nfs_procedures[NFSPROC_READ],
-		.rpc_argp	= &data->args,
-		.rpc_resp	= &data->res,
-		.rpc_cred	= data->cred,
-	};
-
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -584,24 +577,15 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
-static void nfs_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-	struct rpc_message	msg = {
-		.rpc_proc	= &nfs_procedures[NFSPROC_WRITE],
-		.rpc_argp	= &data->args,
-		.rpc_resp	= &data->res,
-		.rpc_cred	= data->cred,
-	};
-
 	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
 	data->args.stable = NFS_FILE_SYNC;
-
-	/* Finalize the task. */
-	rpc_call_setup(&data->task, &msg, 0);
+	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
 static void
-nfs_proc_commit_setup(struct nfs_write_data *data, int how)
+nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
 	BUG();
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8f1eb08ccffa..e9dbdc8eafe6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -153,6 +153,16 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
+static void nfs_execute_read(struct nfs_read_data *data)
+{
+	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
+	sigset_t oldset;
+
+	rpc_clnt_sigmask(clnt, &oldset);
+	rpc_execute(&data->task);
+	rpc_clnt_sigunmask(clnt, &oldset);
+}
+
 /*
  * Set up the NFS read request struct
  */
@@ -162,8 +172,14 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = req->wb_context->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.flags = RPC_TASK_ASYNC | swap_flags,
@@ -171,7 +187,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 
 	data->req	  = req;
 	data->inode	  = inode;
-	data->cred	  = req->wb_context->cred;
+	data->cred	  = msg.rpc_cred;
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -186,8 +202,8 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	nfs_fattr_init(&data->fattr);
 
 	/* Set up the initial task struct. */
+	NFS_PROTO(inode)->read_setup(data, &msg);
 	rpc_init_task(&data->task, &task_setup_data);
-	NFS_PROTO(inode)->read_setup(data);
 
 	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 			data->task.tk_pid,
@@ -195,6 +211,8 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 			(long long)NFS_FILEID(inode),
 			count,
 			(unsigned long long)data->args.offset);
+
+	nfs_execute_read(data);
 }
 
 static void
@@ -210,19 +228,6 @@ nfs_async_read_error(struct list_head *head)
 	}
 }
 
-/*
- * Start an async read operation
- */
-static void nfs_execute_read(struct nfs_read_data *data)
-{
-	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-	sigset_t oldset;
-
-	rpc_clnt_sigmask(clnt, &oldset);
-	rpc_execute(&data->task);
-	rpc_clnt_sigunmask(clnt, &oldset);
-}
-
 /*
  * Generate multiple requests to fill a single page.
  *
@@ -277,7 +282,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 				  rsize, offset);
 		offset += rsize;
 		nbytes -= rsize;
-		nfs_execute_read(data);
 	} while (nbytes != 0);
 
 	return 0;
@@ -315,8 +319,6 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 	req = nfs_list_entry(data->pages.next);
 
 	nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-
-	nfs_execute_read(data);
 	return 0;
 out_bad:
 	nfs_async_read_error(head);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8d90e90ccd47..9a69469274ae 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -764,6 +764,16 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
+static void nfs_execute_write(struct nfs_write_data *data)
+{
+	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
+	sigset_t oldset;
+
+	rpc_clnt_sigmask(clnt, &oldset);
+	rpc_execute(&data->task);
+	rpc_clnt_sigunmask(clnt, &oldset);
+}
+
 /*
  * Set up the argument/result storage required for the RPC call.
  */
@@ -776,8 +786,14 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = req->wb_context->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.flags = flags,
@@ -789,7 +805,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 
 	data->req = req;
 	data->inode = inode = req->wb_context->path.dentry->d_inode;
-	data->cred = req->wb_context->cred;
+	data->cred = msg.rpc_cred;
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -797,6 +813,12 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	data->args.pages  = data->pagevec;
 	data->args.count  = count;
 	data->args.context = req->wb_context;
+	data->args.stable  = NFS_UNSTABLE;
+	if (how & FLUSH_STABLE) {
+		data->args.stable = NFS_DATA_SYNC;
+		if (!NFS_I(inode)->ncommit)
+			data->args.stable = NFS_FILE_SYNC;
+	}
 
 	data->res.fattr   = &data->fattr;
 	data->res.count   = count;
@@ -804,8 +826,8 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	nfs_fattr_init(&data->fattr);
 
 	/* Set up the initial task struct.  */
+	NFS_PROTO(inode)->write_setup(data, &msg);
 	rpc_init_task(&data->task, &task_setup_data);
-	NFS_PROTO(inode)->write_setup(data, how);
 
 	dprintk("NFS: %5u initiated write call "
 		"(req %s/%Ld, %u bytes @ offset %Lu)\n",
@@ -814,16 +836,8 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		(long long)NFS_FILEID(inode),
 		count,
 		(unsigned long long)data->args.offset);
-}
 
-static void nfs_execute_write(struct nfs_write_data *data)
-{
-	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-	sigset_t oldset;
-
-	rpc_clnt_sigmask(clnt, &oldset);
-	rpc_execute(&data->task);
-	rpc_clnt_sigunmask(clnt, &oldset);
+	nfs_execute_write(data);
 }
 
 /*
@@ -870,7 +884,6 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 				   wsize, offset, how);
 		offset += wsize;
 		nbytes -= wsize;
-		nfs_execute_write(data);
 	} while (nbytes != 0);
 
 	return 0;
@@ -918,7 +931,6 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 	/* Set up the argument struct */
 	nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
 
-	nfs_execute_write(data);
 	return 0;
  out_bad:
 	while (!list_empty(head)) {
@@ -1152,8 +1164,14 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
 	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = first->wb_context->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.flags = flags,
@@ -1166,7 +1184,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	list_splice_init(head, &data->pages);
 
 	data->inode	  = inode;
-	data->cred	  = first->wb_context->cred;
+	data->cred	  = msg.rpc_cred;
 
 	data->args.fh     = NFS_FH(data->inode);
 	/* Note: we always request a commit of the entire inode */
@@ -1178,10 +1196,12 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	nfs_fattr_init(&data->fattr);
 
 	/* Set up the initial task struct.  */
+	NFS_PROTO(inode)->commit_setup(data, &msg);
 	rpc_init_task(&data->task, &task_setup_data);
-	NFS_PROTO(inode)->commit_setup(data, how);
 
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+	nfs_execute_write(data);
 }
 
 /*
@@ -1201,7 +1221,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 	/* Set up the argument struct */
 	nfs_commit_rpcsetup(head, data, how);
 
-	nfs_execute_write(data);
 	return 0;
  out_bad:
 	while (!list_empty(head)) {
-- 
cgit v1.2.3


From 5138fde01161cd7976fdc51f6a17da73adaa6baf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 14 Jul 2007 15:40:01 -0400
Subject: NFS/SUNRPC: Convert all users of rpc_call_setup()

Replace use of rpc_call_setup() with rpc_init_task(), and in cases where we
need to initialise task->tk_action, with rpc_call_start().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 118 ++++++++++++++++++++++++------------------------------
 fs/nfs/unlink.c   |  28 ++++---------
 2 files changed, 60 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7c0baf23abdc..826b445b8c70 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -718,19 +718,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 	return err;
 }
 
-static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
-{
-	struct nfs4_opendata *data = calldata;
-	struct  rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
-		.rpc_argp = &data->c_arg,
-		.rpc_resp = &data->c_res,
-		.rpc_cred = data->owner->so_cred,
-	};
-	data->timestamp = jiffies;
-	rpc_call_setup(task, &msg, 0);
-}
-
 static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_opendata *data = calldata;
@@ -767,7 +754,6 @@ out_free:
 }
 
 static const struct rpc_call_ops nfs4_open_confirm_ops = {
-	.rpc_call_prepare = nfs4_open_confirm_prepare,
 	.rpc_call_done = nfs4_open_confirm_done,
 	.rpc_release = nfs4_open_confirm_release,
 };
@@ -779,8 +765,15 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 {
 	struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
 	struct rpc_task *task;
+	struct  rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+		.rpc_argp = &data->c_arg,
+		.rpc_resp = &data->c_res,
+		.rpc_cred = data->owner->so_cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = server->client,
+		.rpc_message = &msg,
 		.callback_ops = &nfs4_open_confirm_ops,
 		.callback_data = data,
 		.flags = RPC_TASK_ASYNC,
@@ -790,6 +783,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 	kref_get(&data->kref);
 	data->rpc_done = 0;
 	data->rpc_status = 0;
+	data->timestamp = jiffies;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -807,13 +801,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_opendata *data = calldata;
 	struct nfs4_state_owner *sp = data->owner;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
-		.rpc_argp = &data->o_arg,
-		.rpc_resp = &data->o_res,
-		.rpc_cred = sp->so_cred,
-	};
-	
+
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
 		return;
 	/*
@@ -838,11 +826,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->o_arg.id = sp->so_owner_id.id;
 	data->o_arg.clientid = sp->so_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
-		msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_start(task);
 	return;
 out_no_action:
 	task->tk_action = NULL;
@@ -914,8 +902,15 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	struct nfs_openargs *o_arg = &data->o_arg;
 	struct nfs_openres *o_res = &data->o_res;
 	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
+		.rpc_argp = o_arg,
+		.rpc_resp = o_res,
+		.rpc_cred = data->owner->so_cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = server->client,
+		.rpc_message = &msg,
 		.callback_ops = &nfs4_open_ops,
 		.callback_data = data,
 		.flags = RPC_TASK_ASYNC,
@@ -1256,12 +1251,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
-		.rpc_argp = &calldata->arg,
-		.rpc_resp = &calldata->res,
-		.rpc_cred = state->owner->so_cred,
-	};
 	int clear_rd, clear_wr, clear_rdwr;
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
@@ -1288,14 +1277,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	}
 	nfs_fattr_init(calldata->res.fattr);
 	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
-		msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
 		calldata->arg.open_flags = FMODE_READ;
 	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
-		msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
 		calldata->arg.open_flags = FMODE_WRITE;
 	}
 	calldata->timestamp = jiffies;
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_close_ops = {
@@ -1321,8 +1310,13 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	struct nfs4_closedata *calldata;
 	struct nfs4_state_owner *sp = state->owner;
 	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+		.rpc_cred = state->owner->so_cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = server->client,
+		.rpc_message = &msg,
 		.callback_ops = &nfs4_close_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
@@ -1345,6 +1339,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->path.mnt = mntget(path->mnt);
 	calldata->path.dentry = dget(path->dentry);
 
+	msg.rpc_argp = &calldata->arg,
+	msg.rpc_resp = &calldata->res,
 	task_setup_data.callback_data = calldata;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
@@ -2966,25 +2962,11 @@ struct nfs4_delegreturndata {
 	struct nfs4_delegreturnres res;
 	struct nfs_fh fh;
 	nfs4_stateid stateid;
-	struct rpc_cred *cred;
 	unsigned long timestamp;
 	struct nfs_fattr fattr;
 	int rpc_status;
 };
 
-static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata)
-{
-	struct nfs4_delegreturndata *data = calldata;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
-	};
-	nfs_fattr_init(data->res.fattr);
-	rpc_call_setup(task, &msg, 0);
-}
-
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
@@ -2995,14 +2977,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 
 static void nfs4_delegreturn_release(void *calldata)
 {
-	struct nfs4_delegreturndata *data = calldata;
-
-	put_rpccred(data->cred);
 	kfree(calldata);
 }
 
 static const struct rpc_call_ops nfs4_delegreturn_ops = {
-	.rpc_call_prepare = nfs4_delegreturn_prepare,
 	.rpc_call_done = nfs4_delegreturn_done,
 	.rpc_release = nfs4_delegreturn_release,
 };
@@ -3012,8 +2990,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	struct nfs4_delegreturndata *data;
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
+		.rpc_cred = cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = server->client,
+		.rpc_message = &msg,
 		.callback_ops = &nfs4_delegreturn_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
@@ -3029,11 +3012,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	memcpy(&data->stateid, stateid, sizeof(data->stateid));
 	data->res.fattr = &data->fattr;
 	data->res.server = server;
-	data->cred = get_rpccred(cred);
+	nfs_fattr_init(data->res.fattr);
 	data->timestamp = jiffies;
 	data->rpc_status = 0;
 
 	task_setup_data.callback_data = data;
+	msg.rpc_argp = &data->args,
+	msg.rpc_resp = &data->res,
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -3221,12 +3206,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs4_unlockdata *calldata = data;
-	struct rpc_message msg = {
-		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
-		.rpc_argp       = &calldata->arg,
-		.rpc_resp       = &calldata->res,
-		.rpc_cred	= calldata->lsp->ls_state->owner->so_cred,
-	};
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
@@ -3236,7 +3215,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 		return;
 	}
 	calldata->timestamp = jiffies;
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -3251,8 +3230,13 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		struct nfs_seqid *seqid)
 {
 	struct nfs4_unlockdata *data;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
+		.rpc_cred = ctx->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs4_locku_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
@@ -3268,6 +3252,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	msg.rpc_argp = &data->arg,
+	msg.rpc_resp = &data->res,
 	task_setup_data.callback_data = data;
 	return rpc_run_task(&task_setup_data);
 }
@@ -3353,13 +3339,6 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_lockdata *data = calldata;
 	struct nfs4_state *state = data->lsp->ls_state;
-	struct nfs4_state_owner *sp = state->owner;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
-		.rpc_argp = &data->arg,
-		.rpc_resp = &data->res,
-		.rpc_cred = sp->so_cred,
-	};
 
 	dprintk("%s: begin!\n", __FUNCTION__);
 	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
@@ -3373,7 +3352,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	} else
 		data->arg.new_lock_owner = 0;
 	data->timestamp = jiffies;
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_start(task);
 	dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
 }
 
@@ -3435,8 +3414,13 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 {
 	struct nfs4_lockdata *data;
 	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
+		.rpc_cred = state->owner->so_cred,
+	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(state->inode),
+		.rpc_message = &msg,
 		.callback_ops = &nfs4_lock_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
@@ -3451,6 +3435,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		data->arg.block = 1;
 	if (reclaim != 0)
 		data->arg.reclaim = 1;
+	msg.rpc_argp = &data->arg,
+	msg.rpc_resp = &data->res,
 	task_setup_data.callback_data = data;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6660d9a53345..757415363422 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -70,24 +70,6 @@ static void nfs_dec_sillycount(struct inode *dir)
 		wake_up(&nfsi->waitqueue);
 }
 
-/**
- * nfs_async_unlink_init - Initialize the RPC info
- * task: rpc_task of the sillydelete
- */
-static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
-{
-	struct nfs_unlinkdata *data = calldata;
-	struct inode *dir = data->dir;
-	struct rpc_message msg = {
-		.rpc_argp = &data->args,
-		.rpc_resp = &data->res,
-		.rpc_cred = data->cred,
-	};
-
-	NFS_PROTO(dir)->unlink_setup(&msg, dir);
-	rpc_call_setup(task, &msg, 0);
-}
-
 /**
  * nfs_async_unlink_done - Sillydelete post-processing
  * @task: rpc_task of the sillydelete
@@ -120,14 +102,19 @@ static void nfs_async_unlink_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
-	.rpc_call_prepare = nfs_async_unlink_init,
 	.rpc_call_done = nfs_async_unlink_done,
 	.rpc_release = nfs_async_unlink_release,
 };
 
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
 {
+	struct rpc_message msg = {
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
 	struct rpc_task_setup task_setup_data = {
+		.rpc_message = &msg,
 		.callback_ops = &nfs_unlink_ops,
 		.callback_data = data,
 		.flags = RPC_TASK_ASYNC,
@@ -165,8 +152,9 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	data->args.fh = NFS_FH(dir);
 	nfs_fattr_init(&data->res.dir_attr);
 
-	task_setup_data.rpc_client = NFS_CLIENT(dir);
+	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 
+	task_setup_data.rpc_client = NFS_CLIENT(dir);
 	task = rpc_run_task(&task_setup_data);
 	if (!IS_ERR(task))
 		rpc_put_task(task);
-- 
cgit v1.2.3


From 0773769191d943358a8392fa86abd756d004c4b6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 Oct 2007 18:42:54 -0400
Subject: NFS/SUNRPC: Convert users of rpc_init_task+rpc_execute to
 rpc_run_task()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 28 ++++++++++++++++++++--------
 fs/nfs/read.c   | 17 +++++------------
 fs/nfs/write.c  | 24 ++++++++++--------------
 3 files changed, 35 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 244d1bd7002c..eadd87f7159f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,6 +272,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
 	size_t rsize = NFS_SERVER(inode)->rsize;
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_cred = ctx->cred,
 	};
@@ -333,11 +334,13 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		msg.rpc_argp = &data->args;
 		msg.rpc_resp = &data->res;
 
+		task_setup_data.task = &data->task;
 		task_setup_data.callback_data = data;
 		NFS_PROTO(inode)->read_setup(data, &msg);
-		rpc_init_task(&data->task, &task_setup_data);
 
-		rpc_execute(&data->task);
+		task = rpc_run_task(&task_setup_data);
+		if (!IS_ERR(task))
+			rpc_put_task(task);
 
 		dprintk("NFS: %5u initiated direct read call "
 			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -440,6 +443,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	struct inode *inode = dreq->inode;
 	struct list_head *p;
 	struct nfs_write_data *data;
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_cred = dreq->ctx->cred,
 	};
@@ -471,16 +475,18 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 		 * Reuse data->task; data->args should not have changed
 		 * since the original request was sent.
 		 */
+		task_setup_data.task = &data->task;
 		task_setup_data.callback_data = data;
 		msg.rpc_argp = &data->args;
 		msg.rpc_resp = &data->res;
 		NFS_PROTO(inode)->write_setup(data, &msg);
-		rpc_init_task(&data->task, &task_setup_data);
 
 		/*
 		 * We're called via an RPC callback, so BKL is already held.
 		 */
-		rpc_execute(&data->task);
+		task = rpc_run_task(&task_setup_data);
+		if (!IS_ERR(task))
+			rpc_put_task(task);
 
 		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 				data->task.tk_pid,
@@ -523,12 +529,14 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_write_data *data = dreq->commit_data;
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_cred = dreq->ctx->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
 		.rpc_client = NFS_CLIENT(dreq->inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs_commit_direct_ops,
@@ -547,14 +555,15 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->res.verf = &data->verf;
 
 	NFS_PROTO(data->inode)->commit_setup(data, &msg);
-	rpc_init_task(&data->task, &task_setup_data);
 
 	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
 	dreq->commit_data = NULL;
 
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
-	rpc_execute(&data->task);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task(task);
 }
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@ -669,6 +678,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 	struct inode *inode = ctx->path.dentry->d_inode;
 	unsigned long user_addr = (unsigned long)iov->iov_base;
 	size_t count = iov->iov_len;
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_cred = ctx->cred,
 	};
@@ -732,13 +742,15 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.count = bytes;
 		data->res.verf = &data->verf;
 
+		task_setup_data.task = &data->task;
 		task_setup_data.callback_data = data;
 		msg.rpc_argp = &data->args;
 		msg.rpc_resp = &data->res;
 		NFS_PROTO(inode)->write_setup(data, &msg);
-		rpc_init_task(&data->task, &task_setup_data);
 
-		rpc_execute(&data->task);
+		task = rpc_run_task(&task_setup_data);
+		if (!IS_ERR(task))
+			rpc_put_task(task);
 
 		dprintk("NFS: %5u initiated direct write call "
 			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e9dbdc8eafe6..efc121c494fe 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -153,16 +153,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
-static void nfs_execute_read(struct nfs_read_data *data)
-{
-	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-	sigset_t oldset;
-
-	rpc_clnt_sigmask(clnt, &oldset);
-	rpc_execute(&data->task);
-	rpc_clnt_sigunmask(clnt, &oldset);
-}
-
 /*
  * Set up the NFS read request struct
  */
@@ -172,12 +162,14 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_cred = req->wb_context->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
 		.rpc_client = NFS_CLIENT(inode),
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
@@ -203,7 +195,6 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 
 	/* Set up the initial task struct. */
 	NFS_PROTO(inode)->read_setup(data, &msg);
-	rpc_init_task(&data->task, &task_setup_data);
 
 	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 			data->task.tk_pid,
@@ -212,7 +203,9 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 			count,
 			(unsigned long long)data->args.offset);
 
-	nfs_execute_read(data);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task(task);
 }
 
 static void
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9a69469274ae..fbd64f2fa7f9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -764,16 +764,6 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-static void nfs_execute_write(struct nfs_write_data *data)
-{
-	struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-	sigset_t oldset;
-
-	rpc_clnt_sigmask(clnt, &oldset);
-	rpc_execute(&data->task);
-	rpc_clnt_sigunmask(clnt, &oldset);
-}
-
 /*
  * Set up the argument/result storage required for the RPC call.
  */
@@ -786,6 +776,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
@@ -793,6 +784,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = NFS_CLIENT(inode),
+		.task = &data->task,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
@@ -827,7 +819,6 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 
 	/* Set up the initial task struct.  */
 	NFS_PROTO(inode)->write_setup(data, &msg);
-	rpc_init_task(&data->task, &task_setup_data);
 
 	dprintk("NFS: %5u initiated write call "
 		"(req %s/%Ld, %u bytes @ offset %Lu)\n",
@@ -837,7 +828,9 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 		count,
 		(unsigned long long)data->args.offset);
 
-	nfs_execute_write(data);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task(task);
 }
 
 /*
@@ -1164,12 +1157,14 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
 	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
+	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_cred = first->wb_context->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
 		.rpc_client = NFS_CLIENT(inode),
 		.rpc_message = &msg,
 		.callback_ops = &nfs_commit_ops,
@@ -1197,11 +1192,12 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 
 	/* Set up the initial task struct.  */
 	NFS_PROTO(inode)->commit_setup(data, &msg);
-	rpc_init_task(&data->task, &task_setup_data);
 
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
-	nfs_execute_write(data);
+	task = rpc_run_task(&task_setup_data);
+	if (!IS_ERR(task))
+		rpc_put_task(task);
 }
 
 /*
-- 
cgit v1.2.3


From 8a8c74bf94fcdec058062d331b3d9777910778ab Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:31:47 -0400
Subject: NFS: Ensure nfs_wcc_update_inode always converts file size to loff_t

The nfs_wcc_update_inode() function omits logic to convert the type of
the NFS on-the-wire value of a file's size (__u64) to the type of file
size value stored in struct inode (loff_t, which is signed).

Everywhere else in the NFS client I checked already correctly converts the
file size type.

This affects only very large files.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index db5d96dc6107..cd0e57f3a00f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -814,8 +814,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			if (S_ISDIR(inode->i_mode))
 				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 		}
-		if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
-			inode->i_size = fattr->size;
+		if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
+		    nfsi->npages == 0)
+			inode->i_size = nfs_size_to_loff_t(fattr->size);
 	}
 }
 
-- 
cgit v1.2.3


From 6232dbbcffc617a5a47596b2ec347b24dc2dd2fd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:31:52 -0400
Subject: NFS: Use unsigned intermediates for manipulating header lengths
 (NFSv2 XDR)

Clean up: prevent length underflow and mixed sign comparisons when
unmarshalling NFS version 2 read, readdir, and readlink replies.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs2xdr.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 668ab96c7b59..1f7ea675e0c5 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -262,7 +262,9 @@ static int
 nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
-	int	status, count, recvd, hdrlen;
+	size_t hdrlen;
+	u32 count, recvd;
+	int status;
 
 	if ((status = ntohl(*p++)))
 		return -nfs_stat_to_errno(status);
@@ -273,7 +275,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
 		dprintk("NFS: READ reply header overflowed:"
-				"length %d > %Zu\n", hdrlen, iov->iov_len);
+				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
 		dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -283,11 +285,11 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
-			"count %d > recvd %d\n", count, recvd);
+			"count %u > recvd %u\n", count, recvd);
 		count = recvd;
 	}
 
-	dprintk("RPC:      readres OK count %d\n", count);
+	dprintk("RPC:      readres OK count %u\n", count);
 	if (count < res->count)
 		res->count = count;
 
@@ -423,9 +425,10 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
 	struct page **page;
-	int hdrlen, recvd;
+	size_t hdrlen;
+	unsigned int pglen, recvd;
+	u32 len;
 	int status, nr;
-	unsigned int len, pglen;
 	__be32 *end, *entry, *kaddr;
 
 	if ((status = ntohl(*p++)))
@@ -434,7 +437,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
 		dprintk("NFS: READDIR reply header overflowed:"
-				"length %d > %Zu\n", hdrlen, iov->iov_len);
+				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
 		dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -576,7 +579,8 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
-	int hdrlen, len, recvd;
+	size_t hdrlen;
+	u32 len, recvd;
 	char	*kaddr;
 	int	status;
 
@@ -584,14 +588,14 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 		return -nfs_stat_to_errno(status);
 	/* Convert length of symlink */
 	len = ntohl(*p++);
-	if (len >= rcvbuf->page_len || len <= 0) {
+	if (len >= rcvbuf->page_len) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
 		dprintk("NFS: READLINK reply header overflowed:"
-				"length %d > %Zu\n", hdrlen, iov->iov_len);
+				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
 		dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
-- 
cgit v1.2.3


From c957c526ef86e472359dadb4204dab8a503b687d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:31:57 -0400
Subject: NFS: Use unsigned intermediates for manipulating header lengths
 (NFSv3 XDR)

Clean up: prevent length underflow and mixed sign comparisons when
unmarshalling NFS version 3 read, readdir, and readlink replies.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3xdr.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 616d3267b7e7..3917e2fa4e40 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -506,9 +506,9 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
 	struct page **page;
-	int hdrlen, recvd;
+	size_t hdrlen;
+	u32 len, recvd, pglen;
 	int status, nr;
-	unsigned int len, pglen;
 	__be32 *entry, *end, *kaddr;
 
 	status = ntohl(*p++);
@@ -527,7 +527,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
 		dprintk("NFS: READDIR reply header overflowed:"
-				"length %d > %Zu\n", hdrlen, iov->iov_len);
+				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
 		dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -549,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 		len = ntohl(*p++);		/* string length */
 		p += XDR_QUADLEN(len) + 2;	/* name + cookie */
 		if (len > NFS3_MAXNAMLEN) {
-			dprintk("NFS: giant filename in readdir (len %x)!\n",
+			dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
 						len);
 			goto err_unmap;
 		}
@@ -570,7 +570,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
 				len = ntohl(*p++);
 				if (len > NFS3_FHSIZE) {
 					dprintk("NFS: giant filehandle in "
-						"readdir (len %x)!\n", len);
+						"readdir (len 0x%x)!\n", len);
 					goto err_unmap;
 				}
 				p += XDR_QUADLEN(len);
@@ -815,7 +815,8 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
-	int hdrlen, len, recvd;
+	size_t hdrlen;
+	u32 len, recvd;
 	char	*kaddr;
 	int	status;
 
@@ -827,7 +828,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 
 	/* Convert length of symlink */
 	len = ntohl(*p++);
-	if (len >= rcvbuf->page_len || len <= 0) {
+	if (len >= rcvbuf->page_len) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
 	}
@@ -835,7 +836,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
 		dprintk("NFS: READLINK reply header overflowed:"
-				"length %d > %Zu\n", hdrlen, iov->iov_len);
+				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
 		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
 		dprintk("NFS: READLINK header is short. "
@@ -863,7 +864,9 @@ static int
 nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
-	int	status, count, ocount, recvd, hdrlen;
+	size_t hdrlen;
+	u32 count, ocount, recvd;
+	int status;
 
 	status = ntohl(*p++);
 	p = xdr_decode_post_op_attr(p, res->fattr);
@@ -871,7 +874,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	if (status != 0)
 		return -nfs_stat_to_errno(status);
 
-	/* Decode reply could and EOF flag. NFSv3 is somewhat redundant
+	/* Decode reply count and EOF flag. NFSv3 is somewhat redundant
 	 * in that it puts the count both in the res struct and in the
 	 * opaque data count. */
 	count    = ntohl(*p++);
@@ -886,7 +889,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	if (iov->iov_len < hdrlen) {
 		dprintk("NFS: READ reply header overflowed:"
-				"length %d > %Zu\n", hdrlen, iov->iov_len);
+				"length %Zu > %Zu\n", hdrlen, iov->iov_len);
        		return -errno_NFSERR_IO;
 	} else if (iov->iov_len != hdrlen) {
 		dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -896,7 +899,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
 		dprintk("NFS: server cheating in read reply: "
-			"count %d > recvd %d\n", count, recvd);
+			"count %u > recvd %u\n", count, recvd);
 		count = recvd;
 		res->eof = 0;
 	}
-- 
cgit v1.2.3


From bcecff77a9c743ff67fdddeabc30ef76a6877886 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:03 -0400
Subject: NFS: Use unsigned intermediates for manipulating header lengths
 (NFSv4 XDR)

Clean up: prevent length underflow and mixed sign comparison when
unmarshalling NFS version 4 getacl, readdir, and readlink replies.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 51dd3804866f..2e1fe171bf73 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3476,10 +3476,11 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	struct xdr_buf	*rcvbuf = &req->rq_rcv_buf;
 	struct page	*page = *rcvbuf->pages;
 	struct kvec	*iov = rcvbuf->head;
-	unsigned int	nr, pglen = rcvbuf->page_len;
+	size_t		hdrlen;
+	u32		recvd, pglen = rcvbuf->page_len;
 	__be32		*end, *entry, *p, *kaddr;
-	uint32_t	len, attrlen, xlen;
-	int 		hdrlen, recvd, status;
+	unsigned int	nr;
+	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
 	if (status)
@@ -3503,6 +3504,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	end = p + ((pglen + readdir->pgbase) >> 2);
 	entry = p;
 	for (nr = 0; *p++; nr++) {
+		u32 len, attrlen, xlen;
 		if (end - p < 3)
 			goto short_pkt;
 		dprintk("cookie = %Lu, ", *((unsigned long long *)p));
@@ -3551,7 +3553,8 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
-	int hdrlen, len, recvd;
+	size_t hdrlen;
+	u32 len, recvd;
 	__be32 *p;
 	char *kaddr;
 	int status;
@@ -3646,7 +3649,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
-		int hdrlen, recvd;
+		size_t hdrlen;
+		u32 recvd;
 
 		/* We ignore &savep and don't do consistency checks on
 		 * the attr length.  Let userspace figure it out.... */
-- 
cgit v1.2.3


From 464ad6b1ade186b53a1dae863361853326b85694 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:08 -0400
Subject: NFS: Change sign of some loop indices in nfs4xdr.c

Nit: Eliminate some mixed sign comparisons in loop indices.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2e1fe171bf73..eae46f008da7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2515,14 +2515,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 {
-	int n;
+	u32 n;
 	__be32 *p;
 	int status = 0;
 
 	READ_BUF(4);
 	READ32(n);
-	if (n < 0)
-		goto out_eio;
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2579,13 +2577,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		goto out_eio;
 	res->nlocations = 0;
 	while (res->nlocations < n) {
-		int m;
+		u32 m;
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
 		READ_BUF(4);
 		READ32(m);
-		if (m <= 0)
-			goto out_eio;
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __FUNCTION__);
@@ -2598,8 +2594,12 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 			if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
 				loc->nservers++;
 			else {
-				int i;
-				dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+				unsigned int i;
+				dprintk("%s: using first %u of %u servers "
+					"returned for location %u\n",
+						__FUNCTION__,
+						NFS4_FS_LOCATION_MAXSERVERS,
+						m, res->nlocations);
 				for (i = loc->nservers; i < m; i++) {
 					unsigned int len;
 					char *data;
-- 
cgit v1.2.3


From 28c494c5c8d425e15b7b82571e4df6d6bc34594d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:13 -0400
Subject: NFS: Prevent nfs_getattr() hang during heavy write workloads

POSIX requires that ctime and mtime, as reported by the stat(2) call,
reflect the activity of the most recent write(2).  To that end, nfs_getattr()
flushes pending dirty writes to a file before doing a GETATTR to allow the
NFS server to set the file's size, ctime, and mtime properly.

However, nfs_getattr() can be starved when a constant stream of application
writes to a file prevents nfs_wb_nocommit() from completing.  This usually
results in hangs of programs doing a stat against an NFS file that is being
written.  "ls -l" is a common victim of this behavior.

To prevent starvation, hold the file's i_mutex in nfs_getattr() to
freeze application writes temporarily so the client can more quickly obtain
clean values for a file's size, mtime, and ctime.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cd0e57f3a00f..cc3a09db41a9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -461,9 +461,18 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err;
 
-	/* Flush out writes to the server in order to update c/mtime */
-	if (S_ISREG(inode->i_mode))
+	/*
+	 * Flush out writes to the server in order to update c/mtime.
+	 *
+	 * Hold the i_mutex to suspend application writes temporarily;
+	 * this prevents long-running writing applications from blocking
+	 * nfs_wb_nocommit.
+	 */
+	if (S_ISREG(inode->i_mode)) {
+		mutex_lock(&inode->i_mutex);
 		nfs_wb_nocommit(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
 
 	/*
 	 * We may force a getattr if the user cares about atime.
-- 
cgit v1.2.3


From 0eb2574121ef0ffbebe5335c66c227d1b987fa25 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:19 -0400
Subject: NFS: Ensure that NFS version 4 mounts use NFS_PORT if nfsport wasn't
 set

Text-based mount option parsing introduced a minor regression in the
behavior of NFS version 4 mounts.  NFS version 4 is not supposed to require
a running rpcbind service on the server in order for a mount to succeed.

In other words, if the mount options don't specify a port number, the port
number is supposed to default to 2049.  For earlier versions of NFS, the
default port number was zero in order to cause the RPC client to autobind
to the server's NFS service.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fda1635dd133..7d84d94fa827 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1643,6 +1643,8 @@ static int nfs4_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
+		if (args->nfs_server.address.sin_port == 0)
+			args->nfs_server.address.sin_port = htons(NFS_PORT);
 		if (!nfs_verify_server_address((struct sockaddr *)
 						&args->nfs_server.address))
 			return -EINVAL;
-- 
cgit v1.2.3


From ad879cef8554e20f9b5ca356c878712eb671228c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:24 -0400
Subject: NFS: Remove support for the 'nfsprog' option

Remove the mount option that allows users to specify an alternate NFS
program number.  The client hasn't supported setting an alternate NFS
program number for a very long time.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h |  1 -
 fs/nfs/super.c    | 14 +-------------
 2 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 75793794aefe..a78a09b40d1b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -43,7 +43,6 @@ struct nfs_parsed_mount_data {
 	struct {
 		struct sockaddr_in	address;
 		char			*hostname;
-		unsigned int		program;
 		unsigned int		version;
 		unsigned short		port;
 		int			protocol;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7d84d94fa827..1a18ca390ddf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -84,7 +84,7 @@ enum {
 	Opt_namelen,
 	Opt_mountport,
 	Opt_mountprog, Opt_mountvers,
-	Opt_nfsprog, Opt_nfsvers,
+	Opt_nfsvers,
 
 	/* Mount options that take string arguments */
 	Opt_sec, Opt_proto, Opt_mountproto,
@@ -139,7 +139,6 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountport, "mountport=%u" },
 	{ Opt_mountprog, "mountprog=%u" },
 	{ Opt_mountvers, "mountvers=%u" },
-	{ Opt_nfsprog, "nfsprog=%u" },
 	{ Opt_nfsvers, "nfsvers=%u" },
 	{ Opt_nfsvers, "vers=%u" },
 
@@ -801,13 +800,6 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			mnt->mount_server.version = option;
 			break;
-		case Opt_nfsprog:
-			if (match_int(args, &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			mnt->nfs_server.program = option;
-			break;
 		case Opt_nfsvers:
 			if (match_int(args, &option))
 				return 0;
@@ -1067,9 +1059,6 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
  *
  * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
  *   mountproto=tcp after mountproto=udp, and so on
- *
- * XXX: as far as I can tell, changing the NFS program number is not
- *      supported in the NFS client.
  */
 static int nfs_validate_mount_data(void *options,
 				   struct nfs_parsed_mount_data *args,
@@ -1095,7 +1084,6 @@ static int nfs_validate_mount_data(void *options,
 	args->mount_server.protocol = XPRT_TRANSPORT_UDP;
 	args->mount_server.program = NFS_MNT_PROGRAM;
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-	args->nfs_server.program = NFS_PROGRAM;
 
 	switch (data->version) {
 	case 1:
-- 
cgit v1.2.3


From e887cbcf911b2d16742832b38411559273ce5d77 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:29 -0400
Subject: NFS: Remove support for the 'mountprog' option

Remove the mount option that allows users to specify an alternate mountd
program number.  The client hasn't supported setting an alternate mountd
program number for a very long time.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h |  1 -
 fs/nfs/super.c    | 11 +----------
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a78a09b40d1b..058d503a0ee1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -52,7 +52,6 @@ struct nfs_parsed_mount_data {
 		struct sockaddr_in	address;
 		char			*hostname;
 		char			*export_path;
-		unsigned int		program;
 		int			protocol;
 	} nfs_server;
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1a18ca390ddf..330c3922739f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -83,7 +83,7 @@ enum {
 	Opt_actimeo,
 	Opt_namelen,
 	Opt_mountport,
-	Opt_mountprog, Opt_mountvers,
+	Opt_mountvers,
 	Opt_nfsvers,
 
 	/* Mount options that take string arguments */
@@ -137,7 +137,6 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_userspace, "retry=%u" },
 	{ Opt_namelen, "namlen=%u" },
 	{ Opt_mountport, "mountport=%u" },
-	{ Opt_mountprog, "mountprog=%u" },
 	{ Opt_mountvers, "mountvers=%u" },
 	{ Opt_nfsvers, "nfsvers=%u" },
 	{ Opt_nfsvers, "vers=%u" },
@@ -786,13 +785,6 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			mnt->mount_server.port = option;
 			break;
-		case Opt_mountprog:
-			if (match_int(args, &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			mnt->mount_server.program = option;
-			break;
 		case Opt_mountvers:
 			if (match_int(args, &option))
 				return 0;
@@ -1082,7 +1074,6 @@ static int nfs_validate_mount_data(void *options,
 	args->acdirmin		= 30;
 	args->acdirmax		= 60;
 	args->mount_server.protocol = XPRT_TRANSPORT_UDP;
-	args->mount_server.program = NFS_MNT_PROGRAM;
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 
 	switch (data->version) {
-- 
cgit v1.2.3


From 6a0ed1de8ecee0cde21ea667891a03f6c84ecd66 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:40 -0400
Subject: NFS: Clean up: copy hostname with kstrndup during mount processing

Clean up: mount option parsing uses kstrndup in several places, rather than
using kzalloc.  Replace the few remaining uses of kzalloc with kstrndup,
for consistency.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 330c3922739f..a3492d6f8f9b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1648,21 +1648,16 @@ static int nfs4_validate_mount_data(void *options,
 		len = c - dev_name;
 		if (len > NFS4_MAXNAMLEN)
 			return -ENAMETOOLONG;
-		args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
-		if (args->nfs_server.hostname == NULL)
-			return -ENOMEM;
-		strncpy(args->nfs_server.hostname, dev_name, len - 1);
+		/* N.B. caller will free nfs_server.hostname in all cases */
+		args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
 
 		c++;			/* step over the ':' */
 		len = strlen(c);
 		if (len > NFS4_MAXPATHLEN)
 			return -ENAMETOOLONG;
-		args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
-		if (args->nfs_server.export_path == NULL)
-			return -ENOMEM;
-		strncpy(args->nfs_server.export_path, c, len);
+		args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
 
-		dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
+		dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
 
 		if (args->client_address == NULL)
 			goto out_no_client_address;
-- 
cgit v1.2.3


From d45273ed6f4613e81701c3e896d9db200c288fff Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:32:45 -0400
Subject: NFS: Clean up address comparison in __nfs_find_client()

The address comparison in the __nfs_find_client() function is deceptive.
It uses a memcmp() to check a pair of u32 fields for equality.  Not only is
this inefficient, but usually memcmp() is used for comparing two *whole*
sockaddr_in's (which includes comparisons of the address family and port
number), so it's easy to mistake the comparison here for a whole sockaddr
comparison, which it isn't.

So for clarity and efficiency, we replace the memcmp() with a simple test
for equality between the two s_addr fields.  This should have no
behavioral effect.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c3740f5ab978..8b5f9b9685dd 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -220,8 +220,7 @@ static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int
 		if (clp->cl_nfsversion != nfsversion)
 			continue;
 
-		if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr,
-			   sizeof(clp->cl_addr.sin_addr)) != 0)
+		if (clp->cl_addr.sin_addr.s_addr != addr->sin_addr.s_addr)
 			continue;
 
 		if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
-- 
cgit v1.2.3


From 5cce428d953cc3843b100e078dbc3c01c6411b85 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Oct 2007 13:33:01 -0400
Subject: NFS: Remove an unneeded check in decode_compound_hdr_arg()

Clean up:  The header tag length is unsigned, so checking that it is less
than zero is unnecessary.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 058ade7efe79..97abd829e432 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -139,7 +139,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
 	if (unlikely(status != 0))
 		return status;
 	/* We do not like overly long tags! */
-	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) {
+	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
 		printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
 				__FUNCTION__, hdr->taglen);
 		return htonl(NFS4ERR_RESOURCE);
-- 
cgit v1.2.3


From bfc69a456642a51c89dfd8e5184468857cb44f32 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 15 Oct 2007 18:18:29 -0400
Subject: NFS: define a function to update nfsi->cache_change_attribute

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 15 +++++++++++++++
 fs/nfs/inode.c    |  6 ++++--
 fs/nfs/nfs4proc.c |  2 +-
 3 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32c666c612a1..72d141a0dbd8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -638,6 +638,21 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
 	return 0;
 }
 
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir - pointer to directory inode
+ *
+ * This forces the revalidation code in nfs_lookup_revalidate() to do a
+ * full lookup on all child dentries of 'dir' whenever a change occurs
+ * on the server that might have invalidated our dcache.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+	NFS_I(dir)->cache_change_attribute = jiffies;
+}
+
 /*
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index cc3a09db41a9..5747d49bdd76 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1029,7 +1029,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			dprintk("NFS: mtime change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-			nfsi->cache_change_attribute = now;
+			if (S_ISDIR(inode->i_mode))
+				nfs_force_lookup_revalidate(inode);
 		}
 		/* If ctime has changed we should definitely clear access+acl caches */
 		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
@@ -1038,7 +1039,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		dprintk("NFS: change_attr change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-		nfsi->cache_change_attribute = now;
+		if (S_ISDIR(inode->i_mode))
+			nfs_force_lookup_revalidate(inode);
 	}
 
 	/* Check if our cached file size is stale */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 826b445b8c70..26192a703129 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -210,7 +210,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 	spin_lock(&dir->i_lock);
 	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
-		nfsi->cache_change_attribute = jiffies;
+		nfs_force_lookup_revalidate(dir);
 	nfsi->change_attr = cinfo->after;
 	spin_unlock(&dir->i_lock);
 }
-- 
cgit v1.2.3


From 3a498026eef9603c14037e73a4a94cfdb2fa44eb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 14 Dec 2007 14:56:04 -0500
Subject: NFS: Clean up the nfs_client initialisation

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8b5f9b9685dd..d7f6d50442b7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -93,22 +93,26 @@ struct rpc_program		nfsacl_program = {
 };
 #endif  /* CONFIG_NFS_V3_ACL */
 
+struct nfs_client_initdata {
+	const char *hostname;
+	const struct sockaddr_in *addr;
+	int version;
+};
+
 /*
  * Allocate a shared client record
  *
  * Since these are allocated/deallocated very rarely, we don't
  * bother putting them in a slab cache...
  */
-static struct nfs_client *nfs_alloc_client(const char *hostname,
-					   const struct sockaddr_in *addr,
-					   int nfsversion)
+static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 {
 	struct nfs_client *clp;
 
 	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
 		goto error_0;
 
-	if (nfsversion == 4) {
+	if (cl_init->version == 4) {
 		if (nfs_callback_up() < 0)
 			goto error_2;
 		__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
@@ -117,11 +121,11 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
 	atomic_set(&clp->cl_count, 1);
 	clp->cl_cons_state = NFS_CS_INITING;
 
-	clp->cl_nfsversion = nfsversion;
-	memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
+	clp->cl_nfsversion = cl_init->version;
+	memcpy(&clp->cl_addr, cl_init->addr, sizeof(clp->cl_addr));
 
-	if (hostname) {
-		clp->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+	if (cl_init->hostname) {
+		clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
 		if (!clp->cl_hostname)
 			goto error_3;
 	}
@@ -256,22 +260,20 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
  */
-static struct nfs_client *nfs_get_client(const char *hostname,
-					 const struct sockaddr_in *addr,
-					 int nfsversion)
+static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
 {
 	struct nfs_client *clp, *new = NULL;
 	int error;
 
 	dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n",
-		hostname ?: "", NIPQUAD(addr->sin_addr),
-		addr->sin_port, nfsversion);
+		cl_init->hostname ?: "", NIPQUAD(cl_init->addr->sin_addr),
+		cl_init->addr->sin_port, cl_init->version);
 
 	/* see if the client already exists */
 	do {
 		spin_lock(&nfs_client_lock);
 
-		clp = __nfs_find_client(addr, nfsversion, 1);
+		clp = __nfs_find_client(cl_init->addr, cl_init->version, 1);
 		if (clp)
 			goto found_client;
 		if (new)
@@ -279,7 +281,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
 
 		spin_unlock(&nfs_client_lock);
 
-		new = nfs_alloc_client(hostname, addr, nfsversion);
+		new = nfs_alloc_client(cl_init);
 	} while (new);
 
 	return ERR_PTR(-ENOMEM);
@@ -540,19 +542,23 @@ error:
 static int nfs_init_server(struct nfs_server *server,
 			   const struct nfs_parsed_mount_data *data)
 {
+	struct nfs_client_initdata cl_init = {
+		.hostname = data->nfs_server.hostname,
+		.addr = &data->nfs_server.address,
+		.version = 2,
+	};
 	struct nfs_client *clp;
-	int error, nfsvers = 2;
+	int error;
 
 	dprintk("--> nfs_init_server()\n");
 
 #ifdef CONFIG_NFS_V3
 	if (data->flags & NFS_MOUNT_VER3)
-		nfsvers = 3;
+		cl_init.version = 3;
 #endif
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(data->nfs_server.hostname,
-				&data->nfs_server.address, nfsvers);
+	clp = nfs_get_client(&cl_init);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
@@ -889,13 +895,18 @@ static int nfs4_set_client(struct nfs_server *server,
 		rpc_authflavor_t authflavour,
 		int proto, int timeo, int retrans)
 {
+	struct nfs_client_initdata cl_init = {
+		.hostname = hostname,
+		.addr = addr,
+		.version = 4,
+	};
 	struct nfs_client *clp;
 	int error;
 
 	dprintk("--> nfs4_set_client()\n");
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(hostname, addr, 4);
+	clp = nfs_get_client(&cl_init);
 	if (IS_ERR(clp)) {
 		error = PTR_ERR(clp);
 		goto error;
-- 
cgit v1.2.3


From c81468a1a766921f11ae44e8a99816ac8dc7b015 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 14 Dec 2007 14:56:05 -0500
Subject: NFS: Clean up the nfs_find_client function.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 52 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d7f6d50442b7..ff778ecee0bd 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -208,52 +208,60 @@ void nfs_put_client(struct nfs_client *clp)
 }
 
 /*
- * Find a client by address
- * - caller must hold nfs_client_lock
+ * Find a client by IP address and protocol version
+ * - returns NULL if no such client
  */
-static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
+struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
 {
 	struct nfs_client *clp;
 
+	spin_lock(&nfs_client_lock);
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
 		/* Don't match clients that failed to initialise properly */
-		if (clp->cl_cons_state < 0)
+		if (clp->cl_cons_state != NFS_CS_READY)
 			continue;
 
 		/* Different NFS versions cannot share the same nfs_client */
 		if (clp->cl_nfsversion != nfsversion)
 			continue;
 
+		/* Match only the IP address, not the port number */
 		if (clp->cl_addr.sin_addr.s_addr != addr->sin_addr.s_addr)
 			continue;
 
-		if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
-			goto found;
+		atomic_inc(&clp->cl_count);
+		spin_unlock(&nfs_client_lock);
+		return clp;
 	}
-
+	spin_unlock(&nfs_client_lock);
 	return NULL;
-
-found:
-	atomic_inc(&clp->cl_count);
-	return clp;
 }
 
 /*
- * Find a client by IP address and protocol version
- * - returns NULL if no such client
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.
  */
-struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
 {
 	struct nfs_client *clp;
 
-	spin_lock(&nfs_client_lock);
-	clp = __nfs_find_client(addr, nfsversion, 0);
-	spin_unlock(&nfs_client_lock);
-	if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
-		nfs_put_client(clp);
-		clp = NULL;
+	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+		/* Don't match clients that failed to initialise properly */
+		if (clp->cl_cons_state < 0)
+			continue;
+
+		/* Different NFS versions cannot share the same nfs_client */
+		if (clp->cl_nfsversion != data->version)
+			continue;
+
+		/* Match the full socket address */
+		if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
+			continue;
+
+		atomic_inc(&clp->cl_count);
+		return clp;
 	}
-	return clp;
+	return NULL;
 }
 
 /*
@@ -273,7 +281,7 @@ static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_in
 	do {
 		spin_lock(&nfs_client_lock);
 
-		clp = __nfs_find_client(cl_init->addr, cl_init->version, 1);
+		clp = nfs_match_client(cl_init);
 		if (clp)
 			goto found_client;
 		if (new)
-- 
cgit v1.2.3


From 40c553193df41920de659f0446e5d214c862e827 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 14 Dec 2007 14:56:07 -0500
Subject: NFS: Remove the redundant nfs_client->cl_nfsversion

We can get the same information from the rpc_ops structure instead.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c    | 41 ++++++++++++++++++-----------------------
 fs/nfs/namespace.c |  2 +-
 fs/nfs/super.c     |  2 +-
 3 files changed, 20 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ff778ecee0bd..3b21731ae571 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -96,7 +96,7 @@ struct rpc_program		nfsacl_program = {
 struct nfs_client_initdata {
 	const char *hostname;
 	const struct sockaddr_in *addr;
-	int version;
+	const struct nfs_rpc_ops *rpc_ops;
 };
 
 /*
@@ -112,7 +112,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
 		goto error_0;
 
-	if (cl_init->version == 4) {
+	clp->rpc_ops = cl_init->rpc_ops;
+
+	if (cl_init->rpc_ops->version == 4) {
 		if (nfs_callback_up() < 0)
 			goto error_2;
 		__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
@@ -121,7 +123,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	atomic_set(&clp->cl_count, 1);
 	clp->cl_cons_state = NFS_CS_INITING;
 
-	clp->cl_nfsversion = cl_init->version;
 	memcpy(&clp->cl_addr, cl_init->addr, sizeof(clp->cl_addr));
 
 	if (cl_init->hostname) {
@@ -170,7 +171,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
  */
 static void nfs_free_client(struct nfs_client *clp)
 {
-	dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion);
+	dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
 
 	nfs4_shutdown_client(clp);
 
@@ -222,7 +223,7 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
 			continue;
 
 		/* Different NFS versions cannot share the same nfs_client */
-		if (clp->cl_nfsversion != nfsversion)
+		if (clp->rpc_ops->version != nfsversion)
 			continue;
 
 		/* Match only the IP address, not the port number */
@@ -251,7 +252,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 			continue;
 
 		/* Different NFS versions cannot share the same nfs_client */
-		if (clp->cl_nfsversion != data->version)
+		if (clp->rpc_ops != data->rpc_ops)
 			continue;
 
 		/* Match the full socket address */
@@ -273,9 +274,9 @@ static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_in
 	struct nfs_client *clp, *new = NULL;
 	int error;
 
-	dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n",
+	dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%u)\n",
 		cl_init->hostname ?: "", NIPQUAD(cl_init->addr->sin_addr),
-		cl_init->addr->sin_port, cl_init->version);
+		cl_init->addr->sin_port, cl_init->rpc_ops->version);
 
 	/* see if the client already exists */
 	do {
@@ -430,7 +431,7 @@ static int nfs_start_lockd(struct nfs_server *server)
 {
 	int error = 0;
 
-	if (server->nfs_client->cl_nfsversion > 3)
+	if (server->nfs_client->rpc_ops->version > 3)
 		goto out;
 	if (server->flags & NFS_MOUNT_NONLM)
 		goto out;
@@ -450,7 +451,7 @@ out:
 #ifdef CONFIG_NFS_V3_ACL
 static void nfs_init_server_aclclient(struct nfs_server *server)
 {
-	if (server->nfs_client->cl_nfsversion != 3)
+	if (server->nfs_client->rpc_ops->version != 3)
 		goto out_noacl;
 	if (server->flags & NFS_MOUNT_NOACL)
 		goto out_noacl;
@@ -521,12 +522,6 @@ static int nfs_init_client(struct nfs_client *clp,
 		return 0;
 	}
 
-	/* Check NFS protocol revision and initialize RPC op vector */
-	clp->rpc_ops = &nfs_v2_clientops;
-#ifdef CONFIG_NFS_V3
-	if (clp->cl_nfsversion == 3)
-		clp->rpc_ops = &nfs_v3_clientops;
-#endif
 	/*
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
@@ -553,7 +548,7 @@ static int nfs_init_server(struct nfs_server *server,
 	struct nfs_client_initdata cl_init = {
 		.hostname = data->nfs_server.hostname,
 		.addr = &data->nfs_server.address,
-		.version = 2,
+		.rpc_ops = &nfs_v2_clientops,
 	};
 	struct nfs_client *clp;
 	int error;
@@ -562,7 +557,7 @@ static int nfs_init_server(struct nfs_server *server,
 
 #ifdef CONFIG_NFS_V3
 	if (data->flags & NFS_MOUNT_VER3)
-		cl_init.version = 3;
+		cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
 
 	/* Allocate or find a client reference we can use */
@@ -906,7 +901,7 @@ static int nfs4_set_client(struct nfs_server *server,
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
 		.addr = addr,
-		.version = 4,
+		.rpc_ops = &nfs_v4_clientops,
 	};
 	struct nfs_client *clp;
 	int error;
@@ -1284,8 +1279,8 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 	/* display one transport per line on subsequent lines */
 	clp = list_entry(v, struct nfs_client, cl_share_link);
 
-	seq_printf(m, "v%d %02x%02x%02x%02x %4hx %3d %s\n",
-		   clp->cl_nfsversion,
+	seq_printf(m, "v%u %02x%02x%02x%02x %4hx %3d %s\n",
+		   clp->rpc_ops->version,
 		   NIPQUAD(clp->cl_addr.sin_addr),
 		   ntohs(clp->cl_addr.sin_port),
 		   atomic_read(&clp->cl_count),
@@ -1363,8 +1358,8 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
-	seq_printf(m, "v%d %02x%02x%02x%02x %4hx %-7s %-17s\n",
-		   clp->cl_nfsversion,
+	seq_printf(m, "v%u %02x%02x%02x%02x %4hx %-7s %-17s\n",
+		   clp->rpc_ops->version,
 		   NIPQUAD(clp->cl_addr.sin_addr),
 		   ntohs(clp->cl_addr.sin_port),
 		   dev,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index acfc56f9edc0..be4ce1c3a3d8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -188,7 +188,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 {
 #ifdef CONFIG_NFS_V4
 	struct vfsmount *mnt = NULL;
-	switch (server->nfs_client->cl_nfsversion) {
+	switch (server->nfs_client->rpc_ops->version) {
 		case 2:
 		case 3:
 			mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a3492d6f8f9b..5608e6a4c1e1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -529,7 +529,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",namelen=%d", nfss->namelen);
 
 #ifdef CONFIG_NFS_V4
-	if (nfss->nfs_client->cl_nfsversion == 4) {
+	if (nfss->nfs_client->rpc_ops->version == 4) {
 		seq_printf(m, "\n\tnfsv4:\t");
 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
-- 
cgit v1.2.3


From cc38bac3a0093b3b7928efc6ff8e9faf9e75f41d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:56:54 -0500
Subject: NFS: Ensure NFSv4 SETCLIENTID send buffer is large enough

Ensure that the RPC buffer size specified for NFSv4 SETCLIENTID procedures
matches what we are encoding into the buffer.  See the definition of
struct nfs4_setclientid {} and the encode_setclientid() function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index eae46f008da7..db1ed9c46ede 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -116,10 +116,12 @@ static int nfs4_stat_to_errno(int);
 #define decode_renew_maxsz	(op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
 				(op_encode_hdr_maxsz + \
-				4 /*server->ip_addr*/ + \
-				1 /*Netid*/ + \
-				6 /*uaddr*/ + \
-				6 + (NFS4_VERIFIER_SIZE >> 2))
+				XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+				XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+				1 /* sc_prog */ + \
+				XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+				XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+				1) /* sc_cb_ident */
 #define decode_setclientid_maxsz \
 				(op_decode_hdr_maxsz + \
 				2 + \
-- 
cgit v1.2.3


From d4d3c507493afd3c9d19fbe9762f44e790909dbe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:09 -0500
Subject: NFS: Enable NFS client to generate CLIENTID strings with IPv6
 addresses

We recently added methods to RPC transports that provide string versions of
the remote peer address information.  Convert the NFSv4 SETCLIENTID
procedure to use those methods instead of building the client ID out of
whole cloth.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 26192a703129..5e8c4cf7959e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2891,14 +2891,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 
 	for(;;) {
 		setclientid.sc_name_len = scnprintf(setclientid.sc_name,
-				sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u",
-				clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr),
+				sizeof(setclientid.sc_name), "%s/%s %s %u",
+				clp->cl_ipaddr,
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_ADDR),
 				cred->cr_ops->cr_name,
 				clp->cl_id_uniquifier);
 		setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-				sizeof(setclientid.sc_netid), "tcp");
+				sizeof(setclientid.sc_netid),
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_NETID));
 		setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
-				sizeof(setclientid.sc_uaddr), "%s.%d.%d",
+				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 				clp->cl_ipaddr, port >> 8, port & 255);
 
 		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
-- 
cgit v1.2.3


From 5d8515caeb99940f5ed56d22a03aba20bbe7fdcb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:16 -0500
Subject: NFS: eliminate NIPQUAD(clp->cl_addr.sin_addr)

To ensure the NFS client displays IPv6 addresses properly, replace
address family-specific NIPQUAD() invocations with a call to the RPC
client to get a formatted string representing the remote peer's
address.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c     | 12 ++++++------
 fs/nfs/delegation.c | 10 ++++++----
 fs/nfs/nfs4state.c  |  9 +++++----
 fs/nfs/super.c      |  5 +++--
 4 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3b21731ae571..701cd193a014 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1279,10 +1279,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
 	/* display one transport per line on subsequent lines */
 	clp = list_entry(v, struct nfs_client, cl_share_link);
 
-	seq_printf(m, "v%u %02x%02x%02x%02x %4hx %3d %s\n",
+	seq_printf(m, "v%u %s %s %3d %s\n",
 		   clp->rpc_ops->version,
-		   NIPQUAD(clp->cl_addr.sin_addr),
-		   ntohs(clp->cl_addr.sin_port),
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
 		   atomic_read(&clp->cl_count),
 		   clp->cl_hostname);
 
@@ -1358,10 +1358,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 		 (unsigned long long) server->fsid.major,
 		 (unsigned long long) server->fsid.minor);
 
-	seq_printf(m, "v%u %02x%02x%02x%02x %4hx %-7s %-17s\n",
+	seq_printf(m, "v%u %s %s %-7s %-17s\n",
 		   clp->rpc_ops->version,
-		   NIPQUAD(clp->cl_addr.sin_addr),
-		   ntohs(clp->cl_addr.sin_port),
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
 		   dev,
 		   fsid);
 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 11833f4caeaa..b03dcd8403f1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -156,8 +156,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 		if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
 					sizeof(delegation->stateid)) != 0 ||
 				delegation->type != nfsi->delegation->type) {
-			printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n",
-					__FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr));
+			printk(KERN_WARNING "%s: server %s handed out "
+					"a duplicate delegation!\n",
+					__FUNCTION__, clp->cl_hostname);
 			status = -EIO;
 		}
 	}
@@ -314,8 +315,9 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
 	__module_get(THIS_MODULE);
 	atomic_inc(&clp->cl_count);
 	task = kthread_run(nfs_do_expire_all_delegations, clp,
-			"%u.%u.%u.%u-delegreturn",
-			NIPQUAD(clp->cl_addr.sin_addr));
+				"%s-delegreturn",
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_ADDR));
 	if (!IS_ERR(task))
 		return;
 	nfs_put_client(clp);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index bf94c6e0a503..f9c7432471dc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -758,8 +758,9 @@ static void nfs4_recover_state(struct nfs_client *clp)
 
 	__module_get(THIS_MODULE);
 	atomic_inc(&clp->cl_count);
-	task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim",
-			NIPQUAD(clp->cl_addr.sin_addr));
+	task = kthread_run(reclaimer, clp, "%s-reclaim",
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_ADDR));
 	if (!IS_ERR(task))
 		return;
 	nfs4_clear_recover_bit(clp);
@@ -970,8 +971,8 @@ out:
 	module_put_and_exit(0);
 	return 0;
 out_error:
-	printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
-				NIPQUAD(clp->cl_addr.sin_addr), -status);
+	printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
+			" with error %d\n", clp->cl_hostname, -status);
 	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 	goto out;
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5608e6a4c1e1..75f3cbf922a3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -491,8 +491,9 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	nfs_show_mount_options(m, nfss, 0);
 
-	seq_printf(m, ",addr="NIPQUAD_FMT,
-		NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
+	seq_printf(m, ",addr=%s",
+			rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
+							RPC_DISPLAY_ADDR));
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1d98fe6717c5786394268da430a4354f6205da54 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:23 -0500
Subject: NFS: Move dprintks from callback.c to callback_proc.c

Clean up: The client side peer address is available in callback_proc.c,
so move a dprintk out of fs/nfs/callback.c and into
fs/nfs/callback_proc.c.

This is more consistent with other debugging messages, and the proc
routines have more information about each request to display.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c      |  4 ----
 fs/nfs/callback_proc.c | 12 +++++++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a796be5051bf..bbf67f148ff9 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -73,8 +73,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 	complete(&nfs_callback_info.started);
 
 	for(;;) {
-		char buf[RPC_MAX_ADDRBUFLEN];
-
 		if (signalled()) {
 			if (nfs_callback_info.users == 0)
 				break;
@@ -92,8 +90,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 					__FUNCTION__, -err);
 			break;
 		}
-		dprintk("%s: request from %s\n", __FUNCTION__,
-				svc_print_addr(rqstp, buf, sizeof(buf)));
 		svc_process(rqstp);
 	}
 
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 72e55d83756d..e89a9007c91c 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -12,7 +12,9 @@
 #include "delegation.h"
 #include "internal.h"
 
+#ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
+#endif
  
 __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
 {
@@ -20,12 +22,16 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 	struct nfs_delegation *delegation;
 	struct nfs_inode *nfsi;
 	struct inode *inode;
-	
+
 	res->bitmap[0] = res->bitmap[1] = 0;
 	res->status = htonl(NFS4ERR_BADHANDLE);
 	clp = nfs_find_client(args->addr, 4);
 	if (clp == NULL)
 		goto out;
+
+	dprintk("NFS: GETATTR callback request from %s\n",
+		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
 	inode = nfs_delegation_find_inode(clp, &args->fh);
 	if (inode == NULL)
 		goto out_putclient;
@@ -65,6 +71,10 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	clp = nfs_find_client(args->addr, 4);
 	if (clp == NULL)
 		goto out;
+
+	dprintk("NFS: RECALL callback request from %s\n",
+		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
 	inode = nfs_delegation_find_inode(clp, &args->fh);
 	if (inode == NULL)
 		goto out_putclient;
-- 
cgit v1.2.3


From 3f43c6667acb4e02962b2829a4d4ebb6b6e6f70e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:31 -0500
Subject: NFS: Address a couple of nits in nfs_follow_referral()

Clean up: fix an outdated block comment, and address a comparison
between a signed and unsigned integer.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4namespace.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index dd5fef20c702..bd1b1617905d 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -114,10 +114,7 @@ static inline int valid_ipaddr4(const char *buf)
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
  * @mnt_parent - mountpoint of parent directory
  * @dentry - parent directory
- * @fspath - fs path returned in fs_locations
- * @mntpath - mount path to new server
- * @hostname - hostname of new server
- * @addr - host addr of new server
+ * @locations - array of NFSv4 server location information
  *
  */
 static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
@@ -131,7 +128,8 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
 	};
 	char *page = NULL, *page2 = NULL;
-	int loc, s, error;
+	unsigned int s;
+	int loc, error;
 
 	if (locations == NULL || locations->nlocations <= 0)
 		goto out;
-- 
cgit v1.2.3


From fd00a8ff8e37815c9df49f5cf09786e441e1396b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:38 -0500
Subject: NFS: Add support for AF_INET6 addresses in nfs_compare_super()

Refactor nfs_compare_super() and add AF_INET6 support.

Replace the generic memcmp() to document explicitly what parts of the
addresses must match in this check, and make the comparison independent
of the lengths of both addresses.

A side benefit is both tests are more computationally efficient than a
memcmp().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 75f3cbf922a3..c3d8fcf38523 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -45,6 +45,8 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
@@ -1326,15 +1328,50 @@ static int nfs_set_super(struct super_block *s, void *data)
 	return ret;
 }
 
+static int nfs_compare_super_address(struct nfs_server *server1,
+				     struct nfs_server *server2)
+{
+	struct sockaddr *sap1, *sap2;
+
+	sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
+	sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
+
+	if (sap1->sa_family != sap2->sa_family)
+		return 0;
+
+	switch (sap1->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1;
+		struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2;
+		if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr)
+			return 0;
+		if (sin1->sin_port != sin2->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1;
+		struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2;
+		if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
+			return 0;
+		if (sin1->sin6_port != sin2->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
 static int nfs_compare_super(struct super_block *sb, void *data)
 {
 	struct nfs_sb_mountdata *sb_mntdata = data;
 	struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
 	int mntflags = sb_mntdata->mntflags;
 
-	if (memcmp(&old->nfs_client->cl_addr,
-				&server->nfs_client->cl_addr,
-				sizeof(old->nfs_client->cl_addr)) != 0)
+	if (!nfs_compare_super_address(old, server))
 		return 0;
 	/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
 	if (old->flags & NFS_MOUNT_UNSHARED)
-- 
cgit v1.2.3


From cdcd7f9abc8c95524376835fbe8e11c5f7bf588e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:45 -0500
Subject: NFS: Verify IPv6 addresses properly

Add support to nfs_verify_server_address for recognizing AF_INET6
addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c3d8fcf38523..038b20b38b22 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -600,16 +600,21 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 }
 
 /*
- * Sanity-check a server address provided by the mount command
+ * Sanity-check a server address provided by the mount command.
+ *
+ * Address family must be initialized, and address must not be
+ * the ANY address for that family.
  */
 static int nfs_verify_server_address(struct sockaddr *addr)
 {
 	switch (addr->sa_family) {
 	case AF_INET: {
-		struct sockaddr_in *sa = (struct sockaddr_in *) addr;
-		if (sa->sin_addr.s_addr != INADDR_ANY)
-			return 1;
-		break;
+		struct sockaddr_in *sa = (struct sockaddr_in *)addr;
+		return sa->sin_addr.s_addr != INADDR_ANY;
+	}
+	case AF_INET6: {
+		struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
+		return !ipv6_addr_any(sa);
 	}
 	}
 
-- 
cgit v1.2.3


From 04dcd6e3aceedff9fcc96ce3014688d5b642d627 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:57:53 -0500
Subject: NFS: Make setting a port number agnostic

We'll need to set the port number of an AF_INET or AF_INET6 address in
several places in fs/nfs/super.c, so introduce a helper that can manage
this for us.  We put this helper to immediate use.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 038b20b38b22..ef1aad774e6c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -599,6 +599,25 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 		rpc_killall_tasks(rpc);
 }
 
+/*
+ * Set the port number in an address.  Be agnostic about the address family.
+ */
+static void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+	switch (sap->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+		ap->sin_port = htons(port);
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
+		ap->sin6_port = htons(port);
+		break;
+	}
+	}
+}
+
 /*
  * Sanity-check a server address provided by the mount command.
  *
@@ -629,6 +648,7 @@ static int nfs_parse_mount_options(char *raw,
 				   struct nfs_parsed_mount_data *mnt)
 {
 	char *p, *string;
+	unsigned short port = 0;
 
 	if (!raw) {
 		dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -731,7 +751,7 @@ static int nfs_parse_mount_options(char *raw,
 				return 0;
 			if (option < 0 || option > 65535)
 				return 0;
-			mnt->nfs_server.address.sin_port = htons(option);
+			port = option;
 			break;
 		case Opt_rsize:
 			if (match_int(args, &mnt->rsize))
@@ -973,6 +993,8 @@ static int nfs_parse_mount_options(char *raw,
 		}
 	}
 
+	nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port);
+
 	return 1;
 
 out_nomem:
@@ -1023,7 +1045,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	/*
 	 * autobind will be used if mount_server.port == 0
 	 */
-	sin.sin_port = htons(args->mount_server.port);
+	nfs_set_port((struct sockaddr *)&sin, args->mount_server.port);
 
 	/*
 	 * Now ask the mount server to map our export path
-- 
cgit v1.2.3


From 0d0f0c192df0282600c6d11c8cc252e7e7a80afc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:00 -0500
Subject: NFS: Set default port for NFSv4, with support for AF_INET6

Create a helper function to set the default NFS port for NFSv4 mount
points.  The helper supports both AF_INET and AF_INET6 family addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ef1aad774e6c..a88697ff19ef 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1595,6 +1595,28 @@ static void nfs4_fill_super(struct super_block *sb)
 	nfs_initialise_sb(sb);
 }
 
+/*
+ * If the user didn't specify a port, set the port number to
+ * the NFS version 4 default port.
+ */
+static void nfs4_default_port(struct sockaddr *sap)
+{
+	switch (sap->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+		if (ap->sin_port == 0)
+			ap->sin_port = htons(NFS_PORT);
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
+		if (ap->sin6_port == 0)
+			ap->sin6_port = htons(NFS_PORT);
+		break;
+	}
+	}
+}
+
 /*
  * Validate NFSv4 mount options
  */
@@ -1628,12 +1650,13 @@ static int nfs4_validate_mount_data(void *options,
 				   data->host_addr,
 				   sizeof(args->nfs_server.address)))
 			return -EFAULT;
-		if (args->nfs_server.address.sin_port == 0)
-			args->nfs_server.address.sin_port = htons(NFS_PORT);
 		if (!nfs_verify_server_address((struct sockaddr *)
 						&args->nfs_server.address))
 			goto out_no_address;
 
+		nfs4_default_port((struct sockaddr *)
+				  &args->nfs_server.address);
+
 		switch (data->auth_flavourlen) {
 		case 0:
 			args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1687,12 +1710,13 @@ static int nfs4_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		if (args->nfs_server.address.sin_port == 0)
-			args->nfs_server.address.sin_port = htons(NFS_PORT);
 		if (!nfs_verify_server_address((struct sockaddr *)
 						&args->nfs_server.address))
 			return -EINVAL;
 
+		nfs4_default_port((struct sockaddr *)
+				  &args->nfs_server.address);
+
 		switch (args->auth_flavor_len) {
 		case 0:
 			args->auth_flavors[0] = RPC_AUTH_UNIX;
-- 
cgit v1.2.3


From 3b0d3f93d01bb013c3dcf9555d2d111c91ac6a1e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 3 Jan 2008 13:28:58 -0500
Subject: NFS: Add support for AF_INET6 addresses in __nfs_find_client()

Introduce AF_INET6-specific address checking to __nfs_find_client().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 701cd193a014..876162cddf1e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -34,6 +34,8 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 
 #include <asm/system.h>
@@ -208,16 +210,44 @@ void nfs_put_client(struct nfs_client *clp)
 	}
 }
 
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
+				 const struct sockaddr_in *sa2)
+{
+	return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1,
+				 const struct sockaddr_in6 *sa2)
+{
+	return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr);
+}
+
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+				 const struct sockaddr *sa2)
+{
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
+				(const struct sockaddr_in *)sa2);
+	case AF_INET6:
+		return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
+				(const struct sockaddr_in6 *)sa2);
+	}
+	BUG();
+}
+
 /*
  * Find a client by IP address and protocol version
  * - returns NULL if no such client
  */
-struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+struct nfs_client *_nfs_find_client(const struct sockaddr *addr, int nfsversion)
 {
 	struct nfs_client *clp;
 
 	spin_lock(&nfs_client_lock);
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+		struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
 		/* Don't match clients that failed to initialise properly */
 		if (clp->cl_cons_state != NFS_CS_READY)
 			continue;
@@ -226,8 +256,10 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
 		if (clp->rpc_ops->version != nfsversion)
 			continue;
 
+		if (addr->sa_family != clap->sa_family)
+			continue;
 		/* Match only the IP address, not the port number */
-		if (clp->cl_addr.sin_addr.s_addr != addr->sin_addr.s_addr)
+		if (!nfs_sockaddr_match_ipaddr(addr, clap))
 			continue;
 
 		atomic_inc(&clp->cl_count);
@@ -238,6 +270,11 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
 	return NULL;
 }
 
+struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+{
+	return _nfs_find_client((const struct sockaddr *)addr, nfsversion);
+}
+
 /*
  * Find an nfs_client on the list that matches the initialisation data
  * that is supplied.
-- 
cgit v1.2.3


From 6e4cffd7b2cf86022dcf9cceeb63f16ff852caa1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:15 -0500
Subject: NFS: Expand server address storage in nfs_client struct

Prepare for managing larger addresses in the NFS client by widening the
nfs_client struct's cl_addr field.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>

(Modified to work with the new parameters for nfs_alloc_client)
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 876162cddf1e..44fe7fd7bfbf 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -98,6 +98,7 @@ struct rpc_program		nfsacl_program = {
 struct nfs_client_initdata {
 	const char *hostname;
 	const struct sockaddr_in *addr;
+	size_t addrlen;
 	const struct nfs_rpc_ops *rpc_ops;
 };
 
@@ -125,7 +126,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	atomic_set(&clp->cl_count, 1);
 	clp->cl_cons_state = NFS_CS_INITING;
 
-	memcpy(&clp->cl_addr, cl_init->addr, sizeof(clp->cl_addr));
+	memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
+	clp->cl_addrlen = cl_init->addrlen;
 
 	if (cl_init->hostname) {
 		clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
@@ -425,7 +427,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
 	struct rpc_create_args args = {
 		.protocol	= proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
-		.addrsize	= sizeof(clp->cl_addr),
+		.addrsize	= clp->cl_addrlen,
 		.timeout	= &timeparms,
 		.servername	= clp->cl_hostname,
 		.program	= &nfs_program,
@@ -585,6 +587,7 @@ static int nfs_init_server(struct nfs_server *server,
 	struct nfs_client_initdata cl_init = {
 		.hostname = data->nfs_server.hostname,
 		.addr = &data->nfs_server.address,
+		.addrlen = sizeof(data->nfs_server.address),
 		.rpc_ops = &nfs_v2_clientops,
 	};
 	struct nfs_client *clp;
@@ -938,6 +941,7 @@ static int nfs4_set_client(struct nfs_server *server,
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
 		.addr = addr,
+		.addrlen = sizeof(*addr),
 		.rpc_ops = &nfs_v4_clientops,
 	};
 	struct nfs_client *clp;
-- 
cgit v1.2.3


From 671beed7e28d9d27eef256862f6c1783a1da147e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:22 -0500
Subject: NFS: Change cb_getattrargs to pass "struct sockaddr *" instead of
 sockaddr_in

Change the addr field in the cb_getattrargs struct to a "struct sockaddr *"
to support non-IPv4 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.h      | 2 +-
 fs/nfs/callback_proc.c | 2 +-
 fs/nfs/callback_xdr.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index c2bb14e053e1..ec0ffd9641c6 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,7 @@ struct cb_compound_hdr_res {
 };
 
 struct cb_getattrargs {
-	struct sockaddr_in *addr;
+	struct sockaddr *addr;
 	struct nfs_fh fh;
 	uint32_t bitmap[2];
 };
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e89a9007c91c..32f0df0a9572 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -25,7 +25,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 
 	res->bitmap[0] = res->bitmap[1] = 0;
 	res->status = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client(args->addr, 4);
+	clp = nfs_find_client((struct sockaddr_in *)args->addr, 4);
 	if (clp == NULL)
 		goto out;
 
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 97abd829e432..3eda1bc00ecc 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -176,7 +176,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
 	status = decode_fh(xdr, &args->fh);
 	if (unlikely(status != 0))
 		goto out;
-	args->addr = svc_addr_in(rqstp);
+	args->addr = svc_addr(rqstp);
 	status = decode_bitmap(xdr, args->bitmap);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
-- 
cgit v1.2.3


From c1d35866566bc2b270a82445271fcce1e391c4b9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:29 -0500
Subject: NFS: Change cb_recallargs to pass "struct sockaddr *" instead of
 sockaddr_in

Change the addr field in the cb_recallargs struct to a "struct sockaddr *"
to support non-IPv4 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.h      | 2 +-
 fs/nfs/callback_proc.c | 2 +-
 fs/nfs/callback_xdr.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index ec0ffd9641c6..bb25d2135ff1 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -53,7 +53,7 @@ struct cb_getattrres {
 };
 
 struct cb_recallargs {
-	struct sockaddr_in *addr;
+	struct sockaddr *addr;
 	struct nfs_fh fh;
 	nfs4_stateid stateid;
 	uint32_t truncate;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 32f0df0a9572..fa9586dcc3dd 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -68,7 +68,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	__be32 res;
 	
 	res = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client(args->addr, 4);
+	clp = nfs_find_client((struct sockaddr_in *)args->addr, 4);
 	if (clp == NULL)
 		goto out;
 
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 3eda1bc00ecc..c63eb720b68b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -188,7 +188,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	__be32 *p;
 	__be32 status;
 
-	args->addr = svc_addr_in(rqstp);
+	args->addr = svc_addr(rqstp);
 	status = decode_stateid(xdr, &args->stateid);
 	if (unlikely(status != 0))
 		goto out;
-- 
cgit v1.2.3


From ff052645c939b2fd8d467105adf98fa621cc244b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:44 -0500
Subject: NFS: Change nfs_find_client() to take "struct sockaddr *"

Adjust arguments and callers of nfs_find_client() to pass a
"struct sockaddr *" instead of "struct sockaddr_in *" to support non-IPv4
addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>

Trond: Also fix up protocol version number argument in nfs_find_client() to
use the correct u32 type.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c      | 3 +--
 fs/nfs/callback_proc.c | 4 ++--
 fs/nfs/client.c        | 7 +------
 fs/nfs/internal.h      | 2 +-
 4 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index bbf67f148ff9..9b6bbf1b9787 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -164,12 +164,11 @@ void nfs_callback_down(void)
 
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-	struct sockaddr_in *addr = svc_addr_in(rqstp);
 	struct nfs_client *clp;
 	char buf[RPC_MAX_ADDRBUFLEN];
 
 	/* Don't talk to strangers */
-	clp = nfs_find_client(addr, 4);
+	clp = nfs_find_client(svc_addr(rqstp), 4);
 	if (clp == NULL)
 		return SVC_DROP;
 
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index fa9586dcc3dd..e89a9007c91c 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -25,7 +25,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 
 	res->bitmap[0] = res->bitmap[1] = 0;
 	res->status = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client((struct sockaddr_in *)args->addr, 4);
+	clp = nfs_find_client(args->addr, 4);
 	if (clp == NULL)
 		goto out;
 
@@ -68,7 +68,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	__be32 res;
 	
 	res = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs_find_client((struct sockaddr_in *)args->addr, 4);
+	clp = nfs_find_client(args->addr, 4);
 	if (clp == NULL)
 		goto out;
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 44fe7fd7bfbf..73bf4ecad030 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -242,7 +242,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
  * Find a client by IP address and protocol version
  * - returns NULL if no such client
  */
-struct nfs_client *_nfs_find_client(const struct sockaddr *addr, int nfsversion)
+struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
 {
 	struct nfs_client *clp;
 
@@ -272,11 +272,6 @@ struct nfs_client *_nfs_find_client(const struct sockaddr *addr, int nfsversion)
 	return NULL;
 }
 
-struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
-{
-	return _nfs_find_client((const struct sockaddr *)addr, nfsversion);
-}
-
 /*
  * Find an nfs_client on the list that matches the initialisation data
  * that is supplied.
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 058d503a0ee1..c8458b168018 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -60,7 +60,7 @@ struct nfs_parsed_mount_data {
 extern struct rpc_program nfs_program;
 
 extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
+extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
 					struct nfs_fh *);
-- 
cgit v1.2.3


From d7422c472bbaa419876b91e8823c6219c4a144cb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:51 -0500
Subject: NFS: Change nfs_get_client() to take sockaddr *

Adjust arguments and callers of nfs_get_client() to pass a
"struct sockaddr *" instead of "struct sockaddr_in *" to support
non-IPv4 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 73bf4ecad030..e43072bdbb0c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -97,7 +97,7 @@ struct rpc_program		nfsacl_program = {
 
 struct nfs_client_initdata {
 	const char *hostname;
-	const struct sockaddr_in *addr;
+	const struct sockaddr *addr;
 	size_t addrlen;
 	const struct nfs_rpc_ops *rpc_ops;
 };
@@ -308,9 +308,8 @@ static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_in
 	struct nfs_client *clp, *new = NULL;
 	int error;
 
-	dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%u)\n",
-		cl_init->hostname ?: "", NIPQUAD(cl_init->addr->sin_addr),
-		cl_init->addr->sin_port, cl_init->rpc_ops->version);
+	dprintk("--> nfs_get_client(%s,v%u)\n",
+		cl_init->hostname ?: "", cl_init->rpc_ops->version);
 
 	/* see if the client already exists */
 	do {
@@ -581,7 +580,7 @@ static int nfs_init_server(struct nfs_server *server,
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = data->nfs_server.hostname,
-		.addr = &data->nfs_server.address,
+		.addr = (const struct sockaddr *)&data->nfs_server.address,
 		.addrlen = sizeof(data->nfs_server.address),
 		.rpc_ops = &nfs_v2_clientops,
 	};
@@ -935,7 +934,7 @@ static int nfs4_set_client(struct nfs_server *server,
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
-		.addr = addr,
+		.addr = (const struct sockaddr *)addr,
 		.addrlen = sizeof(*addr),
 		.rpc_ops = &nfs_v4_clientops,
 	};
-- 
cgit v1.2.3


From dcecae0ff44dceea7adb6bef5c8eb660fe87a93c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:58:59 -0500
Subject: NFS: Change nfs4_set_client() to accept struct sockaddr *

Adjust the arguments and callers of nfs4_set_client() to pass a "struct
sockaddr *" instead of a "struct sockaddr_in *" to support non-IPv4
addresses in the NFS client.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e43072bdbb0c..11380601fc78 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -927,15 +927,17 @@ error:
  * Set up an NFS4 client
  */
 static int nfs4_set_client(struct nfs_server *server,
-		const char *hostname, const struct sockaddr_in *addr,
+		const char *hostname,
+		const struct sockaddr *addr,
+		const size_t addrlen,
 		const char *ip_addr,
 		rpc_authflavor_t authflavour,
 		int proto, int timeo, int retrans)
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
-		.addr = (const struct sockaddr *)addr,
-		.addrlen = sizeof(*addr),
+		.addr = addr,
+		.addrlen = addrlen,
 		.rpc_ops = &nfs_v4_clientops,
 	};
 	struct nfs_client *clp;
@@ -1015,7 +1017,8 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	/* Get a client record */
 	error = nfs4_set_client(server,
 			data->nfs_server.hostname,
-			&data->nfs_server.address,
+			(struct sockaddr *)&data->nfs_server.address,
+			sizeof(data->nfs_server.address),
 			data->client_address,
 			data->auth_flavors[0],
 			data->nfs_server.protocol,
@@ -1090,12 +1093,14 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 
 	/* Get a client representation.
 	 * Note: NFSv4 always uses TCP, */
-	error = nfs4_set_client(server, data->hostname, data->addr,
-			parent_client->cl_ipaddr,
-			data->authflavor,
-			parent_server->client->cl_xprt->prot,
-			parent_client->retrans_timeo,
-			parent_client->retrans_count);
+	error = nfs4_set_client(server, data->hostname,
+				(struct sockaddr *)data->addr,
+				sizeof(*data->addr),
+				parent_client->cl_ipaddr,
+				data->authflavor,
+				parent_server->client->cl_xprt->prot,
+				parent_client->retrans_timeo,
+				parent_client->retrans_count);
 	if (error < 0)
 		goto error;
 
-- 
cgit v1.2.3


From 6677d09513e35ac2f38d3a8c8a26fbd7bbcef192 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:59:06 -0500
Subject: NFS: Adjust nfs_clone_mount structure to store "struct sockaddr *"

Change the addr field in the nfs_clone_mount structure to store a "struct
sockaddr *" to support non-IPv4 addresses in the NFS client.

Note this is mostly a cosmetic change, and does not actually allow
referrals using IPv6 addresses.  The existing referral code assumes that
the server returns a string that represents an IPv4 address.  This code
needs to support hostnames and IPv6 addresses as well as IPv4 addresses,
thus it will need to be reorganized completely (to handle DNS resolution
in user space).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c        |  4 ++--
 fs/nfs/internal.h      |  3 ++-
 fs/nfs/nfs4namespace.c | 12 +++++++-----
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 11380601fc78..ba114faf195f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1094,8 +1094,8 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	/* Get a client representation.
 	 * Note: NFSv4 always uses TCP, */
 	error = nfs4_set_client(server, data->hostname,
-				(struct sockaddr *)data->addr,
-				sizeof(*data->addr),
+				data->addr,
+				data->addrlen,
 				parent_client->cl_ipaddr,
 				data->authflavor,
 				parent_server->client->cl_xprt->prot,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c8458b168018..75dd4e252cae 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -21,7 +21,8 @@ struct nfs_clone_mount {
 	struct nfs_fattr *fattr;
 	char *hostname;
 	char *mnt_path;
-	struct sockaddr_in *addr;
+	struct sockaddr *addr;
+	size_t addrlen;
 	rpc_authflavor_t authflavor;
 };
 
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index bd1b1617905d..5f9ba41ed5bf 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -172,7 +172,10 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 
 		s = 0;
 		while (s < location->nservers) {
-			struct sockaddr_in addr = {};
+			struct sockaddr_in addr = {
+				.sin_family	= AF_INET,
+				.sin_port	= htons(NFS_PORT),
+			};
 
 			if (location->servers[s].len <= 0 ||
 			    valid_ipaddr4(location->servers[s].data) < 0) {
@@ -181,10 +184,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 			}
 
 			mountdata.hostname = location->servers[s].data;
-			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
-			addr.sin_family = AF_INET;
-			addr.sin_port = htons(NFS_PORT);
-			mountdata.addr = &addr;
+			addr.sin_addr.s_addr = in_aton(mountdata.hostname),
+			mountdata.addr = (struct sockaddr *)&addr;
+			mountdata.addrlen = sizeof(addr);
 
 			snprintf(page, PAGE_SIZE, "%s:%s",
 					mountdata.hostname,
-- 
cgit v1.2.3


From 338320345b40eb7c63592f40d25cbd58ccf99548 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:59:13 -0500
Subject: NFS: Remove the NIPQUAD from nfs_try_mount

In the name of address family compatibility, we can't have the NIPQUAD_FMT and
NIPQUAD macros in nfs_try_mount().  Instead, we can make use of an unused
mount option to display the mount server's hostname.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a88697ff19ef..f120be43d543 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -89,7 +89,7 @@ enum {
 	Opt_nfsvers,
 
 	/* Mount options that take string arguments */
-	Opt_sec, Opt_proto, Opt_mountproto,
+	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 
 	/* Mount options that are ignored */
@@ -148,7 +148,7 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountproto, "mountproto=%s" },
 	{ Opt_addr, "addr=%s" },
 	{ Opt_clientaddr, "clientaddr=%s" },
-	{ Opt_userspace, "mounthost=%s" },
+	{ Opt_mounthost, "mounthost=%s" },
 	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_err, NULL }
@@ -974,6 +974,12 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			mnt->client_address = string;
 			break;
+		case Opt_mounthost:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			mnt->mount_server.hostname = string;
+			break;
 		case Opt_mountaddr:
 			string = match_strdup(args);
 			if (string == NULL)
@@ -1027,6 +1033,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 {
 	struct sockaddr_in sin;
 	int status;
+	char *hostname;
 
 	if (args->mount_server.version == 0) {
 		if (args->flags & NFS_MOUNT_VER3)
@@ -1035,6 +1042,11 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 			args->mount_server.version = NFS_MNT_VERSION;
 	}
 
+	if (args->mount_server.hostname)
+		hostname = args->mount_server.hostname;
+	else
+		hostname = args->nfs_server.hostname;
+
 	/*
 	 * Construct the mount server's address.
 	 */
@@ -1053,7 +1065,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	 */
 	status = nfs_mount((struct sockaddr *) &sin,
 			   sizeof(sin),
-			   args->nfs_server.hostname,
+			   hostname,
 			   args->nfs_server.export_path,
 			   args->mount_server.version,
 			   args->mount_server.protocol,
@@ -1061,8 +1073,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	if (status == 0)
 		return 0;
 
-	dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
-			", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
+	dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
+			hostname, status);
 	return status;
 }
 
@@ -1468,6 +1480,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
 out:
 	kfree(data.nfs_server.hostname);
+	kfree(data.mount_server.hostname);
 	return error;
 
 out_err_nosb:
-- 
cgit v1.2.3


From 9412b92772c1d80ea8284583b6aad0260e13515f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:59:21 -0500
Subject: NFS: Refactor mount option address parsing into separate function

Refactor the logic to parse incoming text-based IP addresses.  Use the
in4_pton() function instead of the older in_aton(), following the lead
of the in-kernel CIFS client.

Later we'll add IPv6 address parsing using the matching in6_pton()
function.  For now we can't allow IPv6 address parsing: we must expand
the size of the address storage fields in the nfs_parsed_mount_data
struct before we can parse and store IPv6 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f120be43d543..041fe9e9b74d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -640,6 +640,26 @@ static int nfs_verify_server_address(struct sockaddr *addr)
 	return 0;
 }
 
+/*
+ * Parse string addresses passed in via a mount option,
+ * and construct a sockaddr based on the result.
+ *
+ * If address parsing fails, set the sockaddr's address
+ * family to AF_UNSPEC to force nfs_verify_server_address()
+ * to punt the mount.
+ */
+static void nfs_parse_server_address(char *value,
+				     struct sockaddr *sap)
+{
+	struct sockaddr_in *ap = (void *)sap;
+
+	ap->sin_family = AF_INET;
+	if (in4_pton(value, -1, (u8 *)&ap->sin_addr.s_addr, '\0', NULL))
+		return;
+
+	sap->sa_family = AF_UNSPEC;
+}
+
 /*
  * Error-check and convert a string of mount options from user space into
  * a data structure
@@ -963,9 +983,8 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-			mnt->nfs_server.address.sin_family = AF_INET;
-			mnt->nfs_server.address.sin_addr.s_addr =
-							in_aton(string);
+			nfs_parse_server_address(string, (struct sockaddr *)
+						 &mnt->nfs_server.address);
 			kfree(string);
 			break;
 		case Opt_clientaddr:
@@ -984,9 +1003,8 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-			mnt->mount_server.address.sin_family = AF_INET;
-			mnt->mount_server.address.sin_addr.s_addr =
-							in_aton(string);
+			nfs_parse_server_address(string, (struct sockaddr *)
+						 &mnt->mount_server.address);
 			kfree(string);
 			break;
 
-- 
cgit v1.2.3


From 4c5680177012a2b5c0f3fdf58f4375dd84a1da67 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:59:28 -0500
Subject: NFS: Support non-IPv4 addresses in nfs_parsed_mount_data

Replace the nfs_server and mount_server address fields in the
nfs_parsed_mount_data structure with a "struct sockaddr_storage"
instead of a "struct sockaddr_in".

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aurelien Charbon <aurelien.charbon@ext.bull.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   |  4 ++--
 fs/nfs/internal.h |  6 ++++--
 fs/nfs/super.c    | 54 ++++++++++++++++++++++++++++++++++--------------------
 3 files changed, 40 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ba114faf195f..906613362a56 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -581,7 +581,7 @@ static int nfs_init_server(struct nfs_server *server,
 	struct nfs_client_initdata cl_init = {
 		.hostname = data->nfs_server.hostname,
 		.addr = (const struct sockaddr *)&data->nfs_server.address,
-		.addrlen = sizeof(data->nfs_server.address),
+		.addrlen = data->nfs_server.addrlen,
 		.rpc_ops = &nfs_v2_clientops,
 	};
 	struct nfs_client *clp;
@@ -1018,7 +1018,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	error = nfs4_set_client(server,
 			data->nfs_server.hostname,
 			(struct sockaddr *)&data->nfs_server.address,
-			sizeof(data->nfs_server.address),
+			data->nfs_server.addrlen,
 			data->client_address,
 			data->auth_flavors[0],
 			data->nfs_server.protocol,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 75dd4e252cae..a80621199086 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -42,7 +42,8 @@ struct nfs_parsed_mount_data {
 	char			*client_address;
 
 	struct {
-		struct sockaddr_in	address;
+		struct sockaddr_storage	address;
+		size_t			addrlen;
 		char			*hostname;
 		unsigned int		version;
 		unsigned short		port;
@@ -50,7 +51,8 @@ struct nfs_parsed_mount_data {
 	} mount_server;
 
 	struct {
-		struct sockaddr_in	address;
+		struct sockaddr_storage	address;
+		size_t			addrlen;
 		char			*hostname;
 		char			*export_path;
 		int			protocol;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 041fe9e9b74d..7efc6a34b56b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -649,15 +649,18 @@ static int nfs_verify_server_address(struct sockaddr *addr)
  * to punt the mount.
  */
 static void nfs_parse_server_address(char *value,
-				     struct sockaddr *sap)
+				     struct sockaddr *sap,
+				     size_t *len)
 {
 	struct sockaddr_in *ap = (void *)sap;
 
 	ap->sin_family = AF_INET;
+	*len = sizeof(*ap);
 	if (in4_pton(value, -1, (u8 *)&ap->sin_addr.s_addr, '\0', NULL))
 		return;
 
 	sap->sa_family = AF_UNSPEC;
+	*len = 0;
 }
 
 /*
@@ -984,7 +987,8 @@ static int nfs_parse_mount_options(char *raw,
 			if (string == NULL)
 				goto out_nomem;
 			nfs_parse_server_address(string, (struct sockaddr *)
-						 &mnt->nfs_server.address);
+						 &mnt->nfs_server.address,
+						 &mnt->nfs_server.addrlen);
 			kfree(string);
 			break;
 		case Opt_clientaddr:
@@ -1004,7 +1008,8 @@ static int nfs_parse_mount_options(char *raw,
 			if (string == NULL)
 				goto out_nomem;
 			nfs_parse_server_address(string, (struct sockaddr *)
-						 &mnt->mount_server.address);
+						 &mnt->mount_server.address,
+						 &mnt->mount_server.addrlen);
 			kfree(string);
 			break;
 
@@ -1049,9 +1054,9 @@ out_unknown:
 static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 			 struct nfs_fh *root_fh)
 {
-	struct sockaddr_in sin;
-	int status;
+	struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address;
 	char *hostname;
+	int status;
 
 	if (args->mount_server.version == 0) {
 		if (args->flags & NFS_MOUNT_VER3)
@@ -1068,21 +1073,23 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	/*
 	 * Construct the mount server's address.
 	 */
-	if (args->mount_server.address.sin_addr.s_addr != INADDR_ANY)
-		sin = args->mount_server.address;
-	else
-		sin = args->nfs_server.address;
+	if (args->mount_server.address.ss_family == AF_UNSPEC) {
+		memcpy(sap, &args->nfs_server.address,
+		       args->nfs_server.addrlen);
+		args->mount_server.addrlen = args->nfs_server.addrlen;
+	}
+
 	/*
 	 * autobind will be used if mount_server.port == 0
 	 */
-	nfs_set_port((struct sockaddr *)&sin, args->mount_server.port);
+	nfs_set_port(sap, args->mount_server.port);
 
 	/*
 	 * Now ask the mount server to map our export path
 	 * to a file handle.
 	 */
-	status = nfs_mount((struct sockaddr *) &sin,
-			   sizeof(sin),
+	status = nfs_mount(sap,
+			   args->mount_server.addrlen,
 			   hostname,
 			   args->nfs_server.export_path,
 			   args->mount_server.version,
@@ -1165,9 +1172,6 @@ static int nfs_validate_mount_data(void *options,
 			memset(mntfh->data + mntfh->size, 0,
 			       sizeof(mntfh->data) - mntfh->size);
 
-		if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
-			goto out_no_address;
-
 		/*
 		 * Translate to nfs_parsed_mount_data, which nfs_fill_super
 		 * can deal with.
@@ -1182,7 +1186,14 @@ static int nfs_validate_mount_data(void *options,
 		args->acregmax		= data->acregmax;
 		args->acdirmin		= data->acdirmin;
 		args->acdirmax		= data->acdirmax;
-		args->nfs_server.address = data->addr;
+
+		memcpy(&args->nfs_server.address, &data->addr,
+		       sizeof(data->addr));
+		args->nfs_server.addrlen = sizeof(data->addr);
+		if (!nfs_verify_server_address((struct sockaddr *)
+						&args->nfs_server.address))
+			goto out_no_address;
+
 		if (!(data->flags & NFS_MOUNT_TCP))
 			args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 		/* N.B. caller will free nfs_server.hostname in all cases */
@@ -1655,6 +1666,7 @@ static int nfs4_validate_mount_data(void *options,
 				    struct nfs_parsed_mount_data *args,
 				    const char *dev_name)
 {
+	struct sockaddr_in *ap;
 	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
 	char *c;
 
@@ -1675,11 +1687,13 @@ static int nfs4_validate_mount_data(void *options,
 
 	switch (data->version) {
 	case 1:
-		if (data->host_addrlen != sizeof(args->nfs_server.address))
+		ap = (struct sockaddr_in *)&args->nfs_server.address;
+		if (data->host_addrlen > sizeof(args->nfs_server.address))
+			goto out_no_address;
+		if (data->host_addrlen == 0)
 			goto out_no_address;
-		if (copy_from_user(&args->nfs_server.address,
-				   data->host_addr,
-				   sizeof(args->nfs_server.address)))
+		args->nfs_server.addrlen = data->host_addrlen;
+		if (copy_from_user(ap, data->host_addr, data->host_addrlen))
 			return -EFAULT;
 		if (!nfs_verify_server_address((struct sockaddr *)
 						&args->nfs_server.address))
-- 
cgit v1.2.3


From 3c7c7e4812e40e50a9ce9d687432ab5515cb3f2f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 10 Dec 2007 14:59:35 -0500
Subject: NFS: Pull covers off IPv6 address parsing

Now that the needed IPv6 infrastructure is in place, allow the NFS client's
IP address parser to generate AF_INET6 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7efc6a34b56b..3cbe32f3e88b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -652,12 +652,23 @@ static void nfs_parse_server_address(char *value,
 				     struct sockaddr *sap,
 				     size_t *len)
 {
-	struct sockaddr_in *ap = (void *)sap;
+	if (strchr(value, ':')) {
+		struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
+		u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
 
-	ap->sin_family = AF_INET;
-	*len = sizeof(*ap);
-	if (in4_pton(value, -1, (u8 *)&ap->sin_addr.s_addr, '\0', NULL))
-		return;
+		ap->sin6_family = AF_INET6;
+		*len = sizeof(*ap);
+		if (in6_pton(value, -1, addr, '\0', NULL))
+			return;
+	} else {
+		struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+		u8 *addr = (u8 *)&ap->sin_addr.s_addr;
+
+		ap->sin_family = AF_INET;
+		*len = sizeof(*ap);
+		if (in4_pton(value, -1, addr, '\0', NULL))
+			return;
+	}
 
 	sap->sa_family = AF_UNSPEC;
 	*len = 0;
-- 
cgit v1.2.3


From 69dd716c5ffd89f5ba14ffb871d633ecea74d13a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 14 Dec 2007 14:56:07 -0500
Subject: NFSv4: Add socket proto argument to setclientid

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5e8c4cf7959e..b3d4e8e5696a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2891,10 +2891,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 
 	for(;;) {
 		setclientid.sc_name_len = scnprintf(setclientid.sc_name,
-				sizeof(setclientid.sc_name), "%s/%s %s %u",
+				sizeof(setclientid.sc_name), "%s/%s %s %s %u",
 				clp->cl_ipaddr,
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_ADDR),
+				rpc_peeraddr2str(clp->cl_rpcclient,
+							RPC_DISPLAY_PROTO),
 				cred->cr_ops->cr_name,
 				clp->cl_id_uniquifier);
 		setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-- 
cgit v1.2.3


From 7a3e3e18e40848b6f01d44407ce86b91b8535fbd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 20 Dec 2007 16:03:57 -0500
Subject: NFS: Ensure that we respect NFS_MAX_TCP_TIMEOUT

It isn't sufficient just to limit timeout->to_initval; we also need to
limit to_maxval.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 906613362a56..59a6dccab548 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -387,12 +387,16 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 	switch (proto) {
 	case XPRT_TRANSPORT_TCP:
 	case XPRT_TRANSPORT_RDMA:
-		if (!to->to_initval)
+		if (to->to_initval == 0)
 			to->to_initval = 60 * HZ;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
 			to->to_initval = NFS_MAX_TCP_TIMEOUT;
 		to->to_increment = to->to_initval;
 		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+		if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+			to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+		if (to->to_maxval < to->to_initval)
+			to->to_maxval = to->to_initval;
 		to->to_exponential = 0;
 		break;
 	case XPRT_TRANSPORT_UDP:
-- 
cgit v1.2.3


From 331702337f2b2e7cef40366ee207a25604df4671 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 20 Dec 2007 16:03:59 -0500
Subject: NFS: Support per-mountpoint timeout parameters.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 82 +++++++++++++++++++++++++++++++++------------------------
 fs/nfs/super.c  |  4 +--
 2 files changed, 49 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 59a6dccab548..03d9bed7849a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -415,18 +415,16 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
  * Create an RPC client handle
  */
 static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
-						unsigned int timeo,
-						unsigned int retrans,
-						rpc_authflavor_t flavor,
-						int flags)
+				 const struct rpc_timeout *timeparms,
+				 rpc_authflavor_t flavor,
+				 int flags)
 {
-	struct rpc_timeout	timeparms;
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
 		.protocol	= proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
-		.timeout	= &timeparms,
+		.timeout	= timeparms,
 		.servername	= clp->cl_hostname,
 		.program	= &nfs_program,
 		.version	= clp->rpc_ops->version,
@@ -437,10 +435,6 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
 	if (!IS_ERR(clp->cl_rpcclient))
 		return 0;
 
-	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
-	clp->retrans_timeo = timeparms.to_initval;
-	clp->retrans_count = timeparms.to_retries;
-
 	clnt = rpc_create(&args);
 	if (IS_ERR(clnt)) {
 		dprintk("%s: cannot create RPC client. Error = %ld\n",
@@ -515,7 +509,9 @@ static inline void nfs_init_server_aclclient(struct nfs_server *server)
 /*
  * Create a general RPC client
  */
-static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour)
+static int nfs_init_server_rpcclient(struct nfs_server *server,
+		const struct rpc_timeout *timeo,
+		rpc_authflavor_t pseudoflavour)
 {
 	struct nfs_client *clp = server->nfs_client;
 
@@ -525,6 +521,11 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
 		return PTR_ERR(server->client);
 	}
 
+	memcpy(&server->client->cl_timeout_default,
+			timeo,
+			sizeof(server->client->cl_timeout_default));
+	server->client->cl_timeout = &server->client->cl_timeout_default;
+
 	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
 		struct rpc_auth *auth;
 
@@ -549,6 +550,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
  * Initialise an NFS2 or NFS3 client
  */
 static int nfs_init_client(struct nfs_client *clp,
+			   const struct rpc_timeout *timeparms,
 			   const struct nfs_parsed_mount_data *data)
 {
 	int error;
@@ -564,7 +566,7 @@ static int nfs_init_client(struct nfs_client *clp,
 	 * - RFC 2623, sec 2.3.2
 	 */
 	error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
-				data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
+				timeparms, RPC_AUTH_UNIX, 0);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -588,6 +590,7 @@ static int nfs_init_server(struct nfs_server *server,
 		.addrlen = data->nfs_server.addrlen,
 		.rpc_ops = &nfs_v2_clientops,
 	};
+	struct rpc_timeout timeparms;
 	struct nfs_client *clp;
 	int error;
 
@@ -605,7 +608,9 @@ static int nfs_init_server(struct nfs_server *server,
 		return PTR_ERR(clp);
 	}
 
-	error = nfs_init_client(clp, data);
+	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+			data->timeo, data->retrans);
+	error = nfs_init_client(clp, &timeparms, data);
 	if (error < 0)
 		goto error;
 
@@ -629,7 +634,7 @@ static int nfs_init_server(struct nfs_server *server,
 	if (error < 0)
 		goto error;
 
-	error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
+	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
 	if (error < 0)
 		goto error;
 
@@ -889,7 +894,8 @@ error:
  * Initialise an NFS4 client record
  */
 static int nfs4_init_client(struct nfs_client *clp,
-		int proto, int timeo, int retrans,
+		int proto,
+		const struct rpc_timeout *timeparms,
 		const char *ip_addr,
 		rpc_authflavor_t authflavour)
 {
@@ -904,7 +910,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 	/* Check NFS protocol revision and initialize RPC op vector */
 	clp->rpc_ops = &nfs_v4_clientops;
 
-	error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour,
+	error = nfs_create_rpc_client(clp, proto, timeparms, authflavour,
 					RPC_CLNT_CREATE_DISCRTRY);
 	if (error < 0)
 		goto error;
@@ -936,7 +942,7 @@ static int nfs4_set_client(struct nfs_server *server,
 		const size_t addrlen,
 		const char *ip_addr,
 		rpc_authflavor_t authflavour,
-		int proto, int timeo, int retrans)
+		int proto, const struct rpc_timeout *timeparms)
 {
 	struct nfs_client_initdata cl_init = {
 		.hostname = hostname,
@@ -955,7 +961,7 @@ static int nfs4_set_client(struct nfs_server *server,
 		error = PTR_ERR(clp);
 		goto error;
 	}
-	error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
+	error = nfs4_init_client(clp, proto, timeparms, ip_addr, authflavour);
 	if (error < 0)
 		goto error_put;
 
@@ -976,10 +982,26 @@ error:
 static int nfs4_init_server(struct nfs_server *server,
 		const struct nfs_parsed_mount_data *data)
 {
+	struct rpc_timeout timeparms;
 	int error;
 
 	dprintk("--> nfs4_init_server()\n");
 
+	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+			data->timeo, data->retrans);
+
+	/* Get a client record */
+	error = nfs4_set_client(server,
+			data->nfs_server.hostname,
+			(const struct sockaddr *)&data->nfs_server.address,
+			data->nfs_server.addrlen,
+			data->client_address,
+			data->auth_flavors[0],
+			data->nfs_server.protocol,
+			&timeparms);
+	if (error < 0)
+		goto error;
+
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
 	server->caps |= NFS_CAP_ATOMIC_OPEN;
@@ -994,8 +1016,9 @@ static int nfs4_init_server(struct nfs_server *server,
 	server->acdirmin = data->acdirmin * HZ;
 	server->acdirmax = data->acdirmax * HZ;
 
-	error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
+	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
 
+error:
 	/* Done */
 	dprintk("<-- nfs4_init_server() = %d\n", error);
 	return error;
@@ -1018,18 +1041,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
-	/* Get a client record */
-	error = nfs4_set_client(server,
-			data->nfs_server.hostname,
-			(struct sockaddr *)&data->nfs_server.address,
-			data->nfs_server.addrlen,
-			data->client_address,
-			data->auth_flavors[0],
-			data->nfs_server.protocol,
-			data->timeo, data->retrans);
-	if (error < 0)
-		goto error;
-
 	/* set up the general RPC client */
 	error = nfs4_init_server(server, data);
 	if (error < 0)
@@ -1103,8 +1114,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				parent_client->cl_ipaddr,
 				data->authflavor,
 				parent_server->client->cl_xprt->prot,
-				parent_client->retrans_timeo,
-				parent_client->retrans_count);
+				parent_server->client->cl_timeout);
 	if (error < 0)
 		goto error;
 
@@ -1112,7 +1122,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	nfs_server_copy_userdata(server, parent_server);
 	server->caps |= NFS_CAP_ATOMIC_OPEN;
 
-	error = nfs_init_server_rpcclient(server, data->authflavor);
+	error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
 	if (error < 0)
 		goto error;
 
@@ -1181,7 +1191,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 
 	server->fsid = fattr->fsid;
 
-	error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor);
+	error = nfs_init_server_rpcclient(server,
+			source->client->cl_timeout,
+			source->client->cl_auth->au_flavor);
 	if (error < 0)
 		goto out_free_server;
 	if (!IS_ERR(source->client_acl))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3cbe32f3e88b..0d1bc61d0b68 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -479,8 +479,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	}
 	seq_printf(m, ",proto=%s",
 		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
-	seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
-	seq_printf(m, ",retrans=%u", clp->retrans_count);
+	seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
+	seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
 	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
 }
 
-- 
cgit v1.2.3


From 59dca3b28cb915745019d4f4c27d97b6b6ab12c6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 3 Jan 2008 16:29:06 -0500
Subject: NFS: Fix the 'proto=' mount option

Currently, if you have a server mounted using one networking protocol, you
cannot specify a different protocol using the 'proto=' option on another
mountpoint.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 03d9bed7849a..18fcb05a0707 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -100,6 +100,7 @@ struct nfs_client_initdata {
 	const struct sockaddr *addr;
 	size_t addrlen;
 	const struct nfs_rpc_ops *rpc_ops;
+	int proto;
 };
 
 /*
@@ -138,6 +139,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	INIT_LIST_HEAD(&clp->cl_superblocks);
 	clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
+	clp->cl_proto = cl_init->proto;
+
 #ifdef CONFIG_NFS_V4
 	init_rwsem(&clp->cl_sem);
 	INIT_LIST_HEAD(&clp->cl_delegations);
@@ -289,6 +292,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 		if (clp->rpc_ops != data->rpc_ops)
 			continue;
 
+		if (clp->cl_proto != data->proto)
+			continue;
+
 		/* Match the full socket address */
 		if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
 			continue;
@@ -414,14 +420,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 /*
  * Create an RPC client handle
  */
-static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
+static int nfs_create_rpc_client(struct nfs_client *clp,
 				 const struct rpc_timeout *timeparms,
 				 rpc_authflavor_t flavor,
 				 int flags)
 {
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
-		.protocol	= proto,
+		.protocol	= clp->cl_proto,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrsize	= clp->cl_addrlen,
 		.timeout	= timeparms,
@@ -565,8 +571,7 @@ static int nfs_init_client(struct nfs_client *clp,
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
-				timeparms, RPC_AUTH_UNIX, 0);
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -589,6 +594,7 @@ static int nfs_init_server(struct nfs_server *server,
 		.addr = (const struct sockaddr *)&data->nfs_server.address,
 		.addrlen = data->nfs_server.addrlen,
 		.rpc_ops = &nfs_v2_clientops,
+		.proto = data->nfs_server.protocol,
 	};
 	struct rpc_timeout timeparms;
 	struct nfs_client *clp;
@@ -894,7 +900,6 @@ error:
  * Initialise an NFS4 client record
  */
 static int nfs4_init_client(struct nfs_client *clp,
-		int proto,
 		const struct rpc_timeout *timeparms,
 		const char *ip_addr,
 		rpc_authflavor_t authflavour)
@@ -910,7 +915,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 	/* Check NFS protocol revision and initialize RPC op vector */
 	clp->rpc_ops = &nfs_v4_clientops;
 
-	error = nfs_create_rpc_client(clp, proto, timeparms, authflavour,
+	error = nfs_create_rpc_client(clp, timeparms, authflavour,
 					RPC_CLNT_CREATE_DISCRTRY);
 	if (error < 0)
 		goto error;
@@ -949,6 +954,7 @@ static int nfs4_set_client(struct nfs_server *server,
 		.addr = addr,
 		.addrlen = addrlen,
 		.rpc_ops = &nfs_v4_clientops,
+		.proto = proto,
 	};
 	struct nfs_client *clp;
 	int error;
@@ -961,7 +967,7 @@ static int nfs4_set_client(struct nfs_server *server,
 		error = PTR_ERR(clp);
 		goto error;
 	}
-	error = nfs4_init_client(clp, proto, timeparms, ip_addr, authflavour);
+	error = nfs4_init_client(clp, timeparms, ip_addr, authflavour);
 	if (error < 0)
 		goto error_put;
 
-- 
cgit v1.2.3


From 369af0f1166f7a637751110395496cee156b4297 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Dec 2007 14:54:35 -0500
Subject: NFS: Clean up fs/nfs/idmap.c

Clean up white space damage and use standard kernel coding conventions for
return statements.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 87 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 44 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index d11eb055265c..c56fc7d5a46e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -72,39 +72,39 @@ module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
 		 &nfs_idmap_cache_timeout, 0644);
 
 struct idmap_hashent {
-	unsigned long ih_expires;
-	__u32 ih_id;
-	int ih_namelen;
-	char ih_name[IDMAP_NAMESZ];
+	unsigned long		ih_expires;
+	__u32			ih_id;
+	int			ih_namelen;
+	char			ih_name[IDMAP_NAMESZ];
 };
 
 struct idmap_hashtable {
-	__u8 h_type;
-	struct idmap_hashent h_entries[IDMAP_HASH_SZ];
+	__u8			h_type;
+	struct idmap_hashent	h_entries[IDMAP_HASH_SZ];
 };
 
 struct idmap {
-	struct dentry        *idmap_dentry;
-	wait_queue_head_t     idmap_wq;
-	struct idmap_msg      idmap_im;
-	struct mutex          idmap_lock;    /* Serializes upcalls */
-	struct mutex          idmap_im_lock; /* Protects the hashtable */
-	struct idmap_hashtable idmap_user_hash;
-	struct idmap_hashtable idmap_group_hash;
+	struct dentry		*idmap_dentry;
+	wait_queue_head_t	idmap_wq;
+	struct idmap_msg	idmap_im;
+	struct mutex		idmap_lock;	/* Serializes upcalls */
+	struct mutex		idmap_im_lock;	/* Protects the hashtable */
+	struct idmap_hashtable	idmap_user_hash;
+	struct idmap_hashtable	idmap_group_hash;
 };
 
-static ssize_t   idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-		     char __user *, size_t);
-static ssize_t   idmap_pipe_downcall(struct file *, const char __user *,
-		     size_t);
-static void      idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+				 char __user *, size_t);
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+				   size_t);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 
 static unsigned int fnvhash32(const void *, size_t);
 
 static struct rpc_pipe_ops idmap_upcall_ops = {
-        .upcall         = idmap_pipe_upcall,
-        .downcall       = idmap_pipe_downcall,
-        .destroy_msg    = idmap_pipe_destroy_msg,
+	.upcall		= idmap_pipe_upcall,
+	.downcall	= idmap_pipe_downcall,
+	.destroy_msg	= idmap_pipe_destroy_msg,
 };
 
 int
@@ -115,19 +115,20 @@ nfs_idmap_new(struct nfs_client *clp)
 
 	BUG_ON(clp->cl_idmap != NULL);
 
-        if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
-                return -ENOMEM;
+	idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
+	if (idmap == NULL)
+		return -ENOMEM;
 
-        idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
-	    idmap, &idmap_upcall_ops, 0);
-        if (IS_ERR(idmap->idmap_dentry)) {
+	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
+					 idmap, &idmap_upcall_ops, 0);
+	if (IS_ERR(idmap->idmap_dentry)) {
 		error = PTR_ERR(idmap->idmap_dentry);
 		kfree(idmap);
 		return error;
 	}
 
-        mutex_init(&idmap->idmap_lock);
-        mutex_init(&idmap->idmap_im_lock);
+	mutex_init(&idmap->idmap_lock);
+	mutex_init(&idmap->idmap_im_lock);
 	init_waitqueue_head(&idmap->idmap_wq);
 	idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
 	idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -285,7 +286,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
 	memset(im, 0, sizeof(*im));
 	mutex_unlock(&idmap->idmap_im_lock);
 	mutex_unlock(&idmap->idmap_lock);
-	return (ret);
+	return ret;
 }
 
 /*
@@ -354,16 +355,16 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 /* RPC pipefs upcall/downcall routines */
 static ssize_t
 idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-    char __user *dst, size_t buflen)
+		  char __user *dst, size_t buflen)
 {
-        char *data = (char *)msg->data + msg->copied;
-        ssize_t mlen = msg->len - msg->copied;
-        ssize_t left;
+	char *data = (char *)msg->data + msg->copied;
+	ssize_t mlen = msg->len - msg->copied;
+	ssize_t left;
 
-        if (mlen > buflen)
-                mlen = buflen;
+	if (mlen > buflen)
+		mlen = buflen;
 
-        left = copy_to_user(dst, data, mlen);
+	left = copy_to_user(dst, data, mlen);
 	if (left < 0) {
 		msg->errno = left;
 		return left;
@@ -371,13 +372,13 @@ idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
 	mlen -= left;
 	msg->copied += mlen;
 	msg->errno = 0;
-        return mlen;
+	return mlen;
 }
 
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
-        struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
 	struct idmap *idmap = (struct idmap *)rpci->private;
 	struct idmap_msg im_in, *im = &idmap->idmap_im;
 	struct idmap_hashtable *h;
@@ -385,11 +386,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	int namelen_in;
 	int ret;
 
-        if (mlen != sizeof(im_in))
-                return (-ENOSPC);
+	if (mlen != sizeof(im_in))
+		return -ENOSPC;
 
-        if (copy_from_user(&im_in, src, mlen) != 0)
-		return (-EFAULT);
+	if (copy_from_user(&im_in, src, mlen) != 0)
+		return -EFAULT;
 
 	mutex_lock(&idmap->idmap_im_lock);
 
@@ -487,7 +488,7 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
 		hash ^= (unsigned int)*p;
 	}
 
-	return (hash);
+	return hash;
 }
 
 int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
-- 
cgit v1.2.3


From a661b77fc12a172edea4b709e37f8cd58a6bd500 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Dec 2007 14:54:42 -0500
Subject: NFS: Fix use of copy_to_user() in idmap_pipe_upcall

The idmap_pipe_upcall() function expects the copy_to_user() function to
return a negative error value if the call fails, but copy_to_user()
returns an unsigned long number of bytes that couldn't be copied.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c56fc7d5a46e..d93e071b900c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -358,17 +358,15 @@ idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
 		  char __user *dst, size_t buflen)
 {
 	char *data = (char *)msg->data + msg->copied;
-	ssize_t mlen = msg->len - msg->copied;
-	ssize_t left;
-
-	if (mlen > buflen)
-		mlen = buflen;
+	size_t mlen = min(msg->len, buflen);
+	unsigned long left;
 
 	left = copy_to_user(dst, data, mlen);
-	if (left < 0) {
-		msg->errno = left;
-		return left;
+	if (left == mlen) {
+		msg->errno = -EFAULT;
+		return -EFAULT;
 	}
+
 	mlen -= left;
 	msg->copied += mlen;
 	msg->errno = 0;
-- 
cgit v1.2.3


From d24aae41b4d4141d4f3cffdbf4c31d85637ba691 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Dec 2007 14:54:49 -0500
Subject: NFS: Use size_t for storing name lengths

Clean up: always use the same type when handling buffer lengths.  As a
bonus, this prevents a mixed sign comparison in idmap_lookup_name.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index d93e071b900c..8ae5dba2d4e5 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -74,7 +74,7 @@ module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
 struct idmap_hashent {
 	unsigned long		ih_expires;
 	__u32			ih_id;
-	int			ih_namelen;
+	size_t			ih_namelen;
 	char			ih_name[IDMAP_NAMESZ];
 };
 
@@ -193,7 +193,7 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
  * pretty trivial.
  */
 static inline struct idmap_hashent *
-idmap_alloc_name(struct idmap_hashtable *h, char *name, unsigned len)
+idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
 {
 	return idmap_name_hash(h, name, len);
 }
@@ -381,7 +381,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	struct idmap_msg im_in, *im = &idmap->idmap_im;
 	struct idmap_hashtable *h;
 	struct idmap_hashent *he = NULL;
-	int namelen_in;
+	size_t namelen_in;
 	int ret;
 
 	if (mlen != sizeof(im_in))
-- 
cgit v1.2.3


From bf4285e75c3272ad9bfdeb886d247962bb2985f8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Dec 2007 14:54:57 -0500
Subject: NFS: Fix minor mixed sign comparison in NFS client's write logic

Clean up: PAGE_CACHE_SIZE is unsigned, and nfs_pageio_init() takes a size_t.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fbd64f2fa7f9..5ac5b27b639a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 				  struct inode *inode, int ioflags)
 {
-	int wsize = NFS_SERVER(inode)->wsize;
+	size_t wsize = NFS_SERVER(inode)->wsize;
 
 	if (wsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
-- 
cgit v1.2.3


From 3d509e5454a0a5ac88bf3191ab65d85952c1de31 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Dec 2007 14:55:04 -0500
Subject: NFS: nfs_write_end clean up

Clean up: commit 4899f9c8 added nfs_write_end(), which introduces a
conditional expression that returns an unsigned integer in one arm and
a signed integer in the other.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 4560fc2ddb4a..ef57a5ae5904 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -349,7 +349,9 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	unlock_page(page);
 	page_cache_release(page);
 
-	return status < 0 ? status : copied;
+	if (status < 0)
+		return status;
+	return copied;
 }
 
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
-- 
cgit v1.2.3


From cab6fc1b77c3ec4471d7d54ff6db9ad2dd59c2f5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Dec 2007 14:55:11 -0500
Subject: lockd: Eliminate harmless mixed sign comparison in nlmdbg_cookie2a()

The cookie->len field is unsigned, so the loop index variable in
nlmdbg_cookie2a() should also be unsigned.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/xdr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 633653bff944..3e459e18cc31 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -612,8 +612,7 @@ const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
 	 * called with BKL held.
 	 */
 	static char buf[2*NLM_MAXCOOKIELEN+1];
-	int i;
-	int len = sizeof(buf);
+	unsigned int i, len = sizeof(buf);
 	char *p = buf;
 
 	len--;	/* allow for trailing \0 */
-- 
cgit v1.2.3


From 52c4044d00fe703eb3fb18e0d8dfd1c196eb28be Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 11 Jan 2008 17:09:44 -0500
Subject: NLM: Introduce external nlm_host set-up and tear-down functions

We would like to remove the per-lock-operation nlm_lookup_host() call from
nlmclnt_proc().

The new architecture pins an nlm_host structure to each NFS client
superblock that has the "lock" mount option set.  The NFS client passes
in the pinned nlm_host structure during each call to nlmclnt_proc().  NFS
client unmount processing "puts" the nlm_host so it can be garbage-
collected later.

This patch introduces externally callable NLM functions that handle
mount-time nlm_host set up and tear-down.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d070b18e539d..9a8f4f45c19e 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -41,6 +41,54 @@ struct nlm_wait {
 
 static LIST_HEAD(nlm_blocked);
 
+/**
+ * nlmclnt_init - Set up per-NFS mount point lockd data structures
+ * @server_name: server's hostname
+ * @server_address: server's network address
+ * @server_addrlen: length of server's address
+ * @protocol: transport protocol lockd should use
+ * @nfs_version: NFS protocol version for this mount point
+ *
+ * Returns pointer to an appropriate nlm_host struct,
+ * or an ERR_PTR value.
+ */
+struct nlm_host *nlmclnt_init(const char *server_name,
+			      const struct sockaddr *server_address,
+			      size_t server_addrlen,
+			      unsigned short protocol, u32 nfs_version)
+{
+	struct nlm_host *host;
+	u32 nlm_version = (nfs_version == 2) ? 1 : 4;
+	int status;
+
+	status = lockd_up(protocol);
+	if (status < 0)
+		return ERR_PTR(status);
+
+	host = nlmclnt_lookup_host((struct sockaddr_in *)server_address,
+				   protocol, nlm_version,
+				   server_name, strlen(server_name));
+	if (host == NULL) {
+		lockd_down();
+		return ERR_PTR(-ENOLCK);
+	}
+
+	return host;
+}
+EXPORT_SYMBOL_GPL(nlmclnt_init);
+
+/**
+ * nlmclnt_done - Release resources allocated by nlmclnt_init()
+ * @host: nlm_host structure reserved by nlmclnt_init()
+ *
+ */
+void nlmclnt_done(struct nlm_host *host)
+{
+	nlm_release_host(host);
+	lockd_down();
+}
+EXPORT_SYMBOL_GPL(nlmclnt_done);
+
 /*
  * Queue up a lock for blocking so that the GRANTED request can see it
  */
-- 
cgit v1.2.3


From 9289e7f91add1c09c3ec8571a2080f7507730b8d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 11 Jan 2008 17:09:52 -0500
Subject: NFS: Invoke nlmclnt_init during NFS mount processing

Cache an appropriate nlm_host structure in the NFS client's mount point
metadata for later use.

Note that there is no need to set NFS_MOUNT_NONLM in the error case -- if
nfs_start_lockd() returns a non-zero value, its callers ensure that the
mount request fails outright.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 18fcb05a0707..0b3ce86f6fc9 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -458,7 +458,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
 static void nfs_destroy_server(struct nfs_server *server)
 {
 	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_down();	/* release rpc.lockd */
+		nlmclnt_done(server->nlm_host);
 }
 
 /*
@@ -466,20 +466,26 @@ static void nfs_destroy_server(struct nfs_server *server)
  */
 static int nfs_start_lockd(struct nfs_server *server)
 {
-	int error = 0;
+	struct nlm_host *host;
+	struct nfs_client *clp = server->nfs_client;
+	u32 nfs_version = clp->rpc_ops->version;
+	unsigned short protocol = server->flags & NFS_MOUNT_TCP ?
+						IPPROTO_TCP : IPPROTO_UDP;
 
-	if (server->nfs_client->rpc_ops->version > 3)
-		goto out;
+	if (nfs_version > 3)
+		return 0;
 	if (server->flags & NFS_MOUNT_NONLM)
-		goto out;
-	error = lockd_up((server->flags & NFS_MOUNT_TCP) ?
-			IPPROTO_TCP : IPPROTO_UDP);
-	if (error < 0)
-		server->flags |= NFS_MOUNT_NONLM;
-	else
-		server->destroy = nfs_destroy_server;
-out:
-	return error;
+		return 0;
+
+	host = nlmclnt_init(clp->cl_hostname,
+			    (struct sockaddr *)&clp->cl_addr,
+			    clp->cl_addrlen, protocol, nfs_version);
+	if (IS_ERR(host))
+		return PTR_ERR(host);
+
+	server->nlm_host = host;
+	server->destroy = nfs_destroy_server;
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From 1093a60ef34bb12010fe7ea4b780bee1c57cfbbe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 11 Jan 2008 17:09:59 -0500
Subject: NLM/NFS: Use cached nlm_host when calling nlmclnt_proc()

Now that each NFS mount point caches its own nlm_host structure, it can be
passed to nlmclnt_proc() for each lock request.  By pinning an nlm_host for
each mount point, we trade the overhead of looking up or creating a fresh
nlm_host struct during every NLM procedure call for a little extra memory.

We also restrict the nlmclnt_proc symbol to limit the use of this call to
in-tree modules.

Note that nlm_lookup_host() (just removed from the client's per-request
NLM processing) could also trigger an nlm_host garbage collection.  Now
client-side nlm_host garbage collection occurs only during NFS mount
processing.  Since the NFS client now holds a reference on these nlm_host
structures, they wouldn't have been affected by garbage collection
anyway.

Given that nlm_lookup_host() reorders the global nlm_host chain after
every successful lookup, and that a garbage collection could be triggered
during the call, we've removed a significant amount of per-NLM-request
CPU processing overhead.

Sidebar: there are only a few remaining references to the internals of
NFS inodes in the client-side NLM code.  The only references I found are
related to extracting or comparing the inode's file handle via NFS_FH().
One is in nlmclnt_grant(); the other is in nlmclnt_setlockargs().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 33 ++++++++++-----------------------
 fs/nfs/nfs3proc.c   |  4 +++-
 fs/nfs/proc.c       |  4 +++-
 3 files changed, 16 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index a10343bed160..b1a4dba443bc 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -145,34 +145,21 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
 	BUG_ON(req->a_args.lock.fl.fl_ops != NULL);
 }
 
-/*
- * This is the main entry point for the NLM client.
+/**
+ * nlmclnt_proc - Perform a single client-side lock request
+ * @host: address of a valid nlm_host context representing the NLM server
+ * @cmd: fcntl-style file lock operation to perform
+ * @fl: address of arguments for the lock operation
+ *
  */
-int
-nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
+int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 {
-	struct rpc_clnt		*client = NFS_CLIENT(inode);
-	struct sockaddr_in	addr;
-	struct nfs_server	*nfssrv = NFS_SERVER(inode);
-	struct nlm_host		*host;
 	struct nlm_rqst		*call;
 	sigset_t		oldset;
 	unsigned long		flags;
-	int			status, vers;
-
-	vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1;
-	if (NFS_PROTO(inode)->version > 3) {
-		printk(KERN_NOTICE "NFSv4 file locking not implemented!\n");
-		return -ENOLCK;
-	}
-
-	rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
-	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
-				   nfssrv->nfs_client->cl_hostname,
-				   strlen(nfssrv->nfs_client->cl_hostname));
-	if (host == NULL)
-		return -ENOLCK;
+	int			status;
 
+	nlm_get_host(host);
 	call = nlm_alloc_call(host);
 	if (call == NULL)
 		return -ENOMEM;
@@ -219,7 +206,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	dprintk("lockd: clnt proc returns %d\n", status);
 	return status;
 }
-EXPORT_SYMBOL(nlmclnt_proc);
+EXPORT_SYMBOL_GPL(nlmclnt_proc);
 
 /*
  * Allocate an NLM RPC call struct
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index e68580ebaa47..b353c1a05bfd 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -767,7 +767,9 @@ static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 static int
 nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl);
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
 
 const struct nfs_rpc_ops nfs_v3_clientops = {
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c9f46a24e75c..5ccf7faee19c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -593,7 +593,9 @@ nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 static int
 nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl);
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
 
 
-- 
cgit v1.2.3


From 883bb163f84e0a54b29846c61621f52db3f27393 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 15 Jan 2008 16:04:20 -0500
Subject: NLM: Introduce an arguments structure for nlmclnt_init()

Clean up: pass 5 arguments to nlmclnt_init() in a structure similar to the
new nfs_client_initdata structure.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/clntlock.c | 22 ++++++++--------------
 fs/nfs/client.c     | 17 ++++++++++-------
 2 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 9a8f4f45c19e..0b45fd3a4bfd 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -43,31 +43,25 @@ static LIST_HEAD(nlm_blocked);
 
 /**
  * nlmclnt_init - Set up per-NFS mount point lockd data structures
- * @server_name: server's hostname
- * @server_address: server's network address
- * @server_addrlen: length of server's address
- * @protocol: transport protocol lockd should use
- * @nfs_version: NFS protocol version for this mount point
+ * @nlm_init: pointer to arguments structure
  *
  * Returns pointer to an appropriate nlm_host struct,
  * or an ERR_PTR value.
  */
-struct nlm_host *nlmclnt_init(const char *server_name,
-			      const struct sockaddr *server_address,
-			      size_t server_addrlen,
-			      unsigned short protocol, u32 nfs_version)
+struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
 {
 	struct nlm_host *host;
-	u32 nlm_version = (nfs_version == 2) ? 1 : 4;
+	u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
 	int status;
 
-	status = lockd_up(protocol);
+	status = lockd_up(nlm_init->protocol);
 	if (status < 0)
 		return ERR_PTR(status);
 
-	host = nlmclnt_lookup_host((struct sockaddr_in *)server_address,
-				   protocol, nlm_version,
-				   server_name, strlen(server_name));
+	host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+				   nlm_init->protocol, nlm_version,
+				   nlm_init->hostname,
+				   strlen(nlm_init->hostname));
 	if (host == NULL) {
 		lockd_down();
 		return ERR_PTR(-ENOLCK);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0b3ce86f6fc9..7a15832369e9 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -468,18 +468,21 @@ static int nfs_start_lockd(struct nfs_server *server)
 {
 	struct nlm_host *host;
 	struct nfs_client *clp = server->nfs_client;
-	u32 nfs_version = clp->rpc_ops->version;
-	unsigned short protocol = server->flags & NFS_MOUNT_TCP ?
-						IPPROTO_TCP : IPPROTO_UDP;
+	struct nlmclnt_initdata nlm_init = {
+		.hostname	= clp->cl_hostname,
+		.address	= (struct sockaddr *)&clp->cl_addr,
+		.addrlen	= clp->cl_addrlen,
+		.protocol	= server->flags & NFS_MOUNT_TCP ?
+						IPPROTO_TCP : IPPROTO_UDP,
+		.nfs_version	= clp->rpc_ops->version,
+	};
 
-	if (nfs_version > 3)
+	if (nlm_init.nfs_version > 3)
 		return 0;
 	if (server->flags & NFS_MOUNT_NONLM)
 		return 0;
 
-	host = nlmclnt_init(clp->cl_hostname,
-			    (struct sockaddr *)&clp->cl_addr,
-			    clp->cl_addrlen, protocol, nfs_version);
+	host = nlmclnt_init(&nlm_init);
 	if (IS_ERR(host))
 		return PTR_ERR(host);
 
-- 
cgit v1.2.3


From 65fdf7d264213a9a8de44f9a20e002a26c267a76 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 11 Jan 2008 17:41:29 -0500
Subject: NLM: Fix a bogus 'return' in nlmclnt_rpc_release

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index b1a4dba443bc..b6b74a60e1eb 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -244,7 +244,7 @@ void nlm_release_call(struct nlm_rqst *call)
 
 static void nlmclnt_rpc_release(void *data)
 {
-	return nlm_release_call(data);
+	nlm_release_call(data);
 }
 
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
-- 
cgit v1.2.3


From f3c391e89c92651105364c6645244118ec9b3952 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Jan 2008 14:17:12 -0500
Subject: NFS: Optimise away the sigmask code in aio/dio reads and writes

There are no interruptible waits for asynchronous RPC tasks, so we don't
need to wrap calls to rpc_run_task() with an
rpc_clnt_sigmask/rpc_clnt_sigunmask pair.

Instead we can wrap the wait_for_completion_interruptible() in
nfs_direct_wait(). This means that we completely optimise away sigmask
setting for the case of non-blocking aio/dio.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index eadd87f7159f..f8e165c7d5a6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -188,12 +188,17 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
 static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
 {
 	ssize_t result = -EIOCBQUEUED;
+	struct rpc_clnt *clnt;
+	sigset_t oldset;
 
 	/* Async requests don't wait here */
 	if (dreq->iocb)
 		goto out;
 
+	clnt = NFS_CLIENT(dreq->inode);
+	rpc_clnt_sigmask(clnt, &oldset);
 	result = wait_for_completion_interruptible(&dreq->completion);
+	rpc_clnt_sigunmask(clnt, &oldset);
 
 	if (!result)
 		result = dreq->error;
@@ -403,9 +408,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
 	ssize_t result = 0;
-	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 
 	dreq = nfs_direct_req_alloc();
@@ -417,11 +420,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	rpc_clnt_sigmask(clnt, &oldset);
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
-	rpc_clnt_sigunmask(clnt, &oldset);
 	nfs_direct_req_release(dreq);
 
 	return result;
@@ -816,9 +817,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				size_t count)
 {
 	ssize_t result = 0;
-	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	int sync = NFS_UNSTABLE;
@@ -836,11 +835,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	rpc_clnt_sigmask(clnt, &oldset);
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
 	if (!result)
 		result = nfs_direct_wait(dreq);
-	rpc_clnt_sigunmask(clnt, &oldset);
 	nfs_direct_req_release(dreq);
 
 	return result;
-- 
cgit v1.2.3


From 3d1c550874bcaf0d9b7fb66f601caed109074f4b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 15 Jan 2008 16:43:19 -0500
Subject: nfs4: allow nfsv4 acls on non-regular-files

The RFC doesn't give any reason it shouldn't be possible to set an
attribute on a non-regular file.  And if the server supports it, then it
shouldn't be up to us to prevent it.

Thanks to Erez for the report and Trond for further analysis.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Tested-by: Erez Zadok <ezk@cs.sunysb.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b3d4e8e5696a..89efbcd6fd53 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3617,10 +3617,6 @@ int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
 	if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
 		return -EOPNOTSUPP;
 
-	if (!S_ISREG(inode->i_mode) &&
-	    (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
-		return -EPERM;
-
 	return nfs4_proc_set_acl(inode, buf, buflen);
 }
 
-- 
cgit v1.2.3


From fc6014771bde8a215a9a4ea24b45f76afeb3c922 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 16 Jan 2008 16:38:10 -0500
Subject: NFS: Address memory leaks in the NFS client mount option parser

David Howells noticed that repeating the same mount option twice during an
NFS mount request can result in orphaned memory in certain cases.

Only the client_address and mount_server.hostname strings are initialized
in the mount parsing loop, so those appear to be the only two pointers that
might be written over by repeating a mount option.  The strings in the
nfs_server section of the nfs_parsed_mount_data structure are set only once
after the options are parsed, thus these are not susceptible to being
overwritten.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d1bc61d0b68..22c49c02897d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1006,12 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
+			kfree(mnt->client_address);
 			mnt->client_address = string;
 			break;
 		case Opt_mounthost:
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
+			kfree(mnt->mount_server.hostname);
 			mnt->mount_server.hostname = string;
 			break;
 		case Opt_mountaddr:
-- 
cgit v1.2.3


From 3a10c30acc4821ca000b52ed0edafd0d3bf26a52 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 23 Jan 2008 08:58:59 +0200
Subject: nfs: obliterate NFS_FLAGS macro

use NFS_I(inode)->flags instead

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c   | 8 ++++----
 fs/nfs/inode.c | 6 +++---
 fs/nfs/read.c  | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 72d141a0dbd8..c578d942f000 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -192,7 +192,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		/* We requested READDIRPLUS, but the server doesn't grok it */
 		if (error == -ENOTSUPP && desc->plus) {
 			NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
-			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			desc->plus = 0;
 			goto again;
 		}
@@ -577,7 +577,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			break;
 		}
 		if (res == -ETOOSMALL && desc->plus) {
-			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			nfs_zap_caches(inode);
 			desc->plus = 0;
 			desc->entry->eof = 0;
@@ -1760,7 +1760,7 @@ static void __nfs_access_zap_cache(struct inode *inode)
 void nfs_access_zap_cache(struct inode *inode)
 {
 	/* Remove from global LRU init */
-	if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+	if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
 		spin_lock(&nfs_access_lru_lock);
 		list_del_init(&NFS_I(inode)->access_cache_inode_lru);
 		spin_unlock(&nfs_access_lru_lock);
@@ -1874,7 +1874,7 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
 	smp_mb__after_atomic_inc();
 
 	/* Add inode to global LRU list */
-	if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+	if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
 		spin_lock(&nfs_access_lru_lock);
 		list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
 		spin_unlock(&nfs_access_lru_lock);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5747d49bdd76..9d7b08c43865 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -192,7 +192,7 @@ void nfs_invalidate_atime(struct inode *inode)
  */
 static void nfs_invalidate_inode(struct inode *inode)
 {
-	set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+	set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 	nfs_zap_caches_locked(inode);
 }
 
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode->i_fop = &nfs_dir_operations;
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
-				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			/* Deal with crossing mountpoints */
 			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
 				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
@@ -668,7 +668,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		if (status == -ESTALE) {
 			nfs_zap_caches(inode);
 			if (!S_ISDIR(inode->i_mode))
-				set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 		}
 		goto out;
 	}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index efc121c494fe..8fd6dfbe1bc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -336,7 +336,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
 
 	if (task->tk_status == -ESTALE) {
-		set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
+		set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags);
 		nfs_mark_for_revalidate(data->inode);
 	}
 	return 0;
-- 
cgit v1.2.3


From 99fadcd76465842c014c88b8c9c19b457e9debc0 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Wed, 23 Jan 2008 08:59:08 +0200
Subject: nfs: convert NFS_*(inode) helpers to static inline

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9d7b08c43865..5d381cfbfe7e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -229,7 +229,7 @@ nfs_init_locked(struct inode *inode, void *opaque)
 	struct nfs_find_desc	*desc = (struct nfs_find_desc *)opaque;
 	struct nfs_fattr	*fattr = desc->fattr;
 
-	NFS_FILEID(inode) = fattr->fileid;
+	set_nfs_fileid(inode, fattr->fileid);
 	nfs_copy_fh(NFS_FH(inode), desc->fh);
 	return 0;
 }
-- 
cgit v1.2.3


From e6f810759505bc86c009854b82cc495ffd8eb020 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 24 Jan 2008 18:14:34 -0500
Subject: NFS: Add an asynchronous delegreturn operation for use in
 nfs_clear_inode

Otherwise, there is a potential deadlock if the last dput() from an NFSv4
close() or other asynchronous operation leads to nfs_clear_inode calling
the synchronous delegreturn.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c | 29 +++++++++++++++++++++++++----
 fs/nfs/delegation.h |  3 ++-
 fs/nfs/dir.c        |  1 -
 fs/nfs/inode.c      |  2 +-
 fs/nfs/nfs4proc.c   | 22 +++++++++++++---------
 5 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index b03dcd8403f1..2dead8d1dd55 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -174,11 +174,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	return status;
 }
 
-static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
 {
 	int res = 0;
 
-	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
+	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
 	nfs_free_delegation(delegation);
 	return res;
 }
@@ -208,7 +208,7 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
 	up_read(&clp->cl_sem);
 	nfs_msync_inode(inode);
 
-	return nfs_do_return_delegation(inode, delegation);
+	return nfs_do_return_delegation(inode, delegation, 1);
 }
 
 static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
@@ -228,6 +228,27 @@ nomatch:
 	return NULL;
 }
 
+/*
+ * This function returns the delegation without reclaiming opens
+ * or protecting against delegation reclaims.
+ * It is therefore really only safe to be called from
+ * nfs4_clear_inode()
+ */
+void nfs_inode_return_delegation_noreclaim(struct inode *inode)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_delegation *delegation;
+
+	if (rcu_dereference(nfsi->delegation) != NULL) {
+		spin_lock(&clp->cl_lock);
+		delegation = nfs_detach_delegation_locked(nfsi, NULL);
+		spin_unlock(&clp->cl_lock);
+		if (delegation != NULL)
+			nfs_do_return_delegation(inode, delegation, 0);
+	}
+}
+
 int nfs_inode_return_delegation(struct inode *inode)
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
@@ -388,7 +409,7 @@ static int recall_thread(void *data)
 	nfs_msync_inode(inode);
 
 	if (delegation != NULL)
-		nfs_do_return_delegation(inode, delegation);
+		nfs_do_return_delegation(inode, delegation, 1);
 	iput(inode);
 	module_put_and_exit(0);
 }
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5874ce7fdbae..f1c5e2a5d88e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -29,6 +29,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
 void nfs_return_all_delegations(struct super_block *sb);
@@ -39,7 +40,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
 /* NFSv4 delegation-related procedures */
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
 int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c578d942f000..5ca762de88bf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -864,7 +864,6 @@ static int nfs_dentry_delete(struct dentry *dentry)
  */
 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
-	nfs_inode_return_delegation(inode);
 	if (S_ISDIR(inode->i_mode))
 		/* drop any readdir cache as it could easily be old */
 		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5d381cfbfe7e..3f332e54e760 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1145,7 +1145,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 void nfs4_clear_inode(struct inode *inode)
 {
 	/* If we are holding a delegation, return it! */
-	nfs_inode_return_delegation(inode);
+	nfs_inode_return_delegation_noreclaim(inode);
 	/* First call standard NFS clear_inode() code */
 	nfs_clear_inode(inode);
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 89efbcd6fd53..5c189bd57eb2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2991,7 +2991,7 @@ static const struct rpc_call_ops nfs4_delegreturn_ops = {
 	.rpc_release = nfs4_delegreturn_release,
 };
 
-static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
+static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
 {
 	struct nfs4_delegreturndata *data;
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -3006,7 +3006,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 		.callback_ops = &nfs4_delegreturn_ops,
 		.flags = RPC_TASK_ASYNC,
 	};
-	int status;
+	int status = 0;
 
 	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
@@ -3028,23 +3028,27 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
+	if (!issync)
+		goto out;
 	status = nfs4_wait_for_completion_rpc_task(task);
-	if (status == 0) {
-		status = data->rpc_status;
-		if (status == 0)
-			nfs_refresh_inode(inode, &data->fattr);
-	}
+	if (status != 0)
+		goto out;
+	status = data->rpc_status;
+	if (status != 0)
+		goto out;
+	nfs_refresh_inode(inode, &data->fattr);
+out:
 	rpc_put_task(task);
 	return status;
 }
 
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_exception exception = { };
 	int err;
 	do {
-		err = _nfs4_proc_delegreturn(inode, cred, stateid);
+		err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
 		switch (err) {
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
-- 
cgit v1.2.3


From 6f23e3872cff238589f9bf39c71db2ea880c9a26 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 Jan 2008 16:38:17 -0500
Subject: NFS: Fix a potential race between umount and
 nfs_access_cache_shrinker()

Thanks to Yawei Niu for spotting the race.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5ca762de88bf..476cb0f837fd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1694,13 +1694,19 @@ int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
 restart:
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+		struct rw_semaphore *s_umount;
 		struct inode *inode;
 
 		if (nr_to_scan-- == 0)
 			break;
+		s_umount = &nfsi->vfs_inode.i_sb->s_umount;
+		if (!down_read_trylock(s_umount))
+			continue;
 		inode = igrab(&nfsi->vfs_inode);
-		if (inode == NULL)
+		if (inode == NULL) {
+			up_read(s_umount);
 			continue;
+		}
 		spin_lock(&inode->i_lock);
 		if (list_empty(&nfsi->access_cache_entry_lru))
 			goto remove_lru_entry;
@@ -1719,6 +1725,7 @@ remove_lru_entry:
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&nfs_access_lru_lock);
 		iput(inode);
+		up_read(s_umount);
 		goto restart;
 	}
 	spin_unlock(&nfs_access_lru_lock);
-- 
cgit v1.2.3


From 57bfa89171e50cddf51a4f62c90e47c6259857b4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 25 Jan 2008 16:38:18 -0500
Subject: NFSv4: Deal more correctly with duplicate delegations

If a (broken?) server hands out two different delegations for the same
file, then we should return one of them.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c | 89 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 51 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2dead8d1dd55..b9eadd18ba70 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,32 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
 	put_rpccred(oldcred);
 }
 
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+	int res = 0;
+
+	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+	nfs_free_delegation(delegation);
+	return res;
+}
+
+static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
+{
+	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+	if (delegation == NULL)
+		goto nomatch;
+	if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
+				sizeof(delegation->stateid.data)) != 0)
+		goto nomatch;
+	list_del_rcu(&delegation->super_list);
+	nfsi->delegation_state = 0;
+	rcu_assign_pointer(nfsi->delegation, NULL);
+	return delegation;
+nomatch:
+	return NULL;
+}
+
 /*
  * Set up a delegation on an inode
  */
@@ -133,6 +159,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
+	struct nfs_delegation *freeme = NULL;
 	int status = 0;
 
 	delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
@@ -147,42 +174,45 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	delegation->inode = inode;
 
 	spin_lock(&clp->cl_lock);
-	if (rcu_dereference(nfsi->delegation) == NULL) {
-		list_add_rcu(&delegation->super_list, &clp->cl_delegations);
-		nfsi->delegation_state = delegation->type;
-		rcu_assign_pointer(nfsi->delegation, delegation);
-		delegation = NULL;
-	} else {
+	if (rcu_dereference(nfsi->delegation) != NULL) {
 		if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
-					sizeof(delegation->stateid)) != 0 ||
-				delegation->type != nfsi->delegation->type) {
-			printk(KERN_WARNING "%s: server %s handed out "
-					"a duplicate delegation!\n",
-					__FUNCTION__, clp->cl_hostname);
-			status = -EIO;
+					sizeof(delegation->stateid)) == 0 &&
+				delegation->type == nfsi->delegation->type) {
+			goto out;
+		}
+		/*
+		 * Deal with broken servers that hand out two
+		 * delegations for the same file.
+		 */
+		dfprintk(FILE, "%s: server %s handed out "
+				"a duplicate delegation!\n",
+				__FUNCTION__, clp->cl_hostname);
+		if (delegation->type <= nfsi->delegation->type) {
+			freeme = delegation;
+			delegation = NULL;
+			goto out;
 		}
+		freeme = nfs_detach_delegation_locked(nfsi, NULL);
 	}
+	list_add_rcu(&delegation->super_list, &clp->cl_delegations);
+	nfsi->delegation_state = delegation->type;
+	rcu_assign_pointer(nfsi->delegation, delegation);
+	delegation = NULL;
 
 	/* Ensure we revalidate the attributes and page cache! */
 	spin_lock(&inode->i_lock);
 	nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
 	spin_unlock(&inode->i_lock);
 
+out:
 	spin_unlock(&clp->cl_lock);
 	if (delegation != NULL)
 		nfs_free_delegation(delegation);
+	if (freeme != NULL)
+		nfs_do_return_delegation(inode, freeme, 0);
 	return status;
 }
 
-static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
-{
-	int res = 0;
-
-	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
-	nfs_free_delegation(delegation);
-	return res;
-}
-
 /* Sync all data to disk upon delegation return */
 static void nfs_msync_inode(struct inode *inode)
 {
@@ -211,23 +241,6 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
 	return nfs_do_return_delegation(inode, delegation, 1);
 }
 
-static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
-{
-	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
-
-	if (delegation == NULL)
-		goto nomatch;
-	if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
-				sizeof(delegation->stateid.data)) != 0)
-		goto nomatch;
-	list_del_rcu(&delegation->super_list);
-	nfsi->delegation_state = 0;
-	rcu_assign_pointer(nfsi->delegation, NULL);
-	return delegation;
-nomatch:
-	return NULL;
-}
-
 /*
  * This function returns the delegation without reclaiming opens
  * or protecting against delegation reclaims.
-- 
cgit v1.2.3


From 3fbd67ad61f6d5a09ea717b56c50bc5c3d8042a8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 26 Jan 2008 01:06:40 -0500
Subject: NFSv4: Iterate through all nfs_clients when the server recalls a
 delegation

The same delegation may have been handed out to more than one nfs_client.
Ensure that if a recall occurs, we return all instances.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_proc.c | 39 ++++++++++++++++++++++-----------------
 fs/nfs/client.c        | 35 +++++++++++++++++++++++++++++++++++
 fs/nfs/internal.h      |  1 +
 3 files changed, 58 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e89a9007c91c..15f7785048d3 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -75,23 +75,28 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	dprintk("NFS: RECALL callback request from %s\n",
 		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 
-	inode = nfs_delegation_find_inode(clp, &args->fh);
-	if (inode == NULL)
-		goto out_putclient;
-	/* Set up a helper thread to actually return the delegation */
-	switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
-		case 0:
-			res = 0;
-			break;
-		case -ENOENT:
-			res = htonl(NFS4ERR_BAD_STATEID);
-			break;
-		default:
-			res = htonl(NFS4ERR_RESOURCE);
-	}
-	iput(inode);
-out_putclient:
-	nfs_put_client(clp);
+	do {
+		struct nfs_client *prev = clp;
+
+		inode = nfs_delegation_find_inode(clp, &args->fh);
+		if (inode != NULL) {
+			/* Set up a helper thread to actually return the delegation */
+			switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
+				case 0:
+					res = 0;
+					break;
+				case -ENOENT:
+					if (res != 0)
+						res = htonl(NFS4ERR_BAD_STATEID);
+					break;
+				default:
+					res = htonl(NFS4ERR_RESOURCE);
+			}
+			iput(inode);
+		}
+		clp = nfs_find_client_next(prev);
+		nfs_put_client(prev);
+	} while (clp != NULL);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
 	return res;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7a15832369e9..685c43f810c1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -275,6 +275,41 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
 	return NULL;
 }
 
+/*
+ * Find the next client after clp with the same IP address and protocol version
+ * - returns NULL if no such client
+ */
+struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
+{
+	struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
+	u32 nfsvers = clp->rpc_ops->version;
+
+	spin_lock(&nfs_client_lock);
+	list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
+		struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
+		/* Don't match clients that failed to initialise properly */
+		if (clp->cl_cons_state != NFS_CS_READY)
+			continue;
+
+		/* Different NFS versions cannot share the same nfs_client */
+		if (clp->rpc_ops->version != nfsvers)
+			continue;
+
+		if (sap->sa_family != clap->sa_family)
+			continue;
+		/* Match only the IP address, not the port number */
+		if (!nfs_sockaddr_match_ipaddr(sap, clap))
+			continue;
+
+		atomic_inc(&clp->cl_count);
+		spin_unlock(&nfs_client_lock);
+		return clp;
+	}
+	spin_unlock(&nfs_client_lock);
+	return NULL;
+}
+
 /*
  * Find an nfs_client on the list that matches the initialisation data
  * that is supplied.
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a80621199086..0f5619611b8d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -64,6 +64,7 @@ extern struct rpc_program nfs_program;
 
 extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
+extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
 					struct nfs_fh *);
-- 
cgit v1.2.3


From c1d171a002942ea2d93b4fbd0c9583c56fce0772 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Wed, 30 Jan 2008 13:30:40 +0100
Subject: x86: randomize brk

Randomize the location of the heap (brk) for i386 and x86_64.  The heap start
is chosen randomly in the range from the current brk location up to a 0x02000000
offset for both architectures.  This, together with
pie-executable-randomization.patch and
pie-executable-randomization-fix.patch, should make the address space
randomization on i386 and x86_64 complete.

Arjan says:

This is known to break older versions of some emacs variants, whose dumper
code assumed that the last variable declared in the program is equal to the
start of the dynamically allocated memory region.

(The dumper is the code where emacs effectively dumps core at the end of its
compilation stage; this coredump is then loaded as the main program during
normal use)

iirc this was 5 years or so; we found this way back when I was at RH and we
first did the security stuff there (including this brk randomization).  It
wasn't all variants of emacs, and it got fixed as a result (I vaguely remember
that emacs already had code to deal with it for other archs/oses, just
ifdeffed wrongly).

It's a rare and wrong assumption as a general thing, just on x86 it mostly
happened to be true (but to be honest, it'll break too if gcc does
something fancy or if the linker does a non-standard order).  Still it's
something we should at least document.

Note 2: afaik it only broke the emacs *build*.  I'm not 100% sure about that
(it IS 5 years ago) though.

[ akpm@linux-foundation.org: deuglification ]

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/binfmt_elf.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f0b3171842f2..043a800c8f71 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1021,6 +1021,12 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	current->mm->end_data = end_data;
 	current->mm->start_stack = bprm->p;
 
+#ifdef arch_randomize_brk
+	if (current->flags & PF_RANDOMIZE)
+		current->mm->brk = current->mm->start_brk =
+			arch_randomize_brk(current->mm);
+#endif
+
 	if (current->personality & MMAP_PAGE_ZERO) {
 		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
 		   and some applications "depend" upon this behavior.
-- 
cgit v1.2.3


From cc503c1b43e002e3f1fed70f46d947e2bf349bb6 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Wed, 30 Jan 2008 13:31:07 +0100
Subject: x86: PIE executable randomization

Map the main executable of (specially compiled/linked -pie/-fpie) ET_DYN binaries
onto a random address (in cases in which mmap() is allowed to perform a
randomization).

The code has been extracted from Ingo's exec-shield patch
http://people.redhat.com/mingo/exec-shield/

[akpm@linux-foundation.org: fix used-uninitialised warning]
[kamezawa.hiroyu@jp.fujitsu.com: fixed ia32 ELF on x86_64 handling]

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/binfmt_elf.c | 107 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 85 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 043a800c8f71..8193d24be159 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,7 @@
 
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
-static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
+static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);
 
 /*
  * If we don't support core dumping, then supply a NULL so we
@@ -298,33 +298,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-		struct elf_phdr *eppnt, int prot, int type)
+		struct elf_phdr *eppnt, int prot, int type,
+		unsigned long total_size)
 {
 	unsigned long map_addr;
-	unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+	addr = ELF_PAGESTART(addr);
+	size = ELF_PAGEALIGN(size);
 
-	down_write(&current->mm->mmap_sem);
 	/* mmap() will return -EINVAL if given a zero size, but a
 	 * segment with zero filesize is perfectly valid */
-	if (eppnt->p_filesz + pageoffset)
-		map_addr = do_mmap(filep, ELF_PAGESTART(addr),
-				   eppnt->p_filesz + pageoffset, prot, type,
-				   eppnt->p_offset - pageoffset);
-	else
-		map_addr = ELF_PAGESTART(addr);
+	if (!size)
+		return addr;
+
+	down_write(&current->mm->mmap_sem);
+	/*
+	* total_size is the size of the ELF (interpreter) image.
+	* The _first_ mmap needs to know the full size, otherwise
+	* randomization might put this image into an overlapping
+	* position with the ELF binary image. (since size < total_size)
+	* So we first map the 'big' image - and unmap the remainder at
+	* the end. (which unmap is needed for ELF images with holes.)
+	*/
+	if (total_size) {
+		total_size = ELF_PAGEALIGN(total_size);
+		map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+		if (!BAD_ADDR(map_addr))
+			do_munmap(current->mm, map_addr+size, total_size-size);
+	} else
+		map_addr = do_mmap(filep, addr, size, prot, type, off);
+
 	up_write(&current->mm->mmap_sem);
 	return(map_addr);
 }
 
 #endif /* !elf_map */
 
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+	int i, first_idx = -1, last_idx = -1;
+
+	for (i = 0; i < nr; i++) {
+		if (cmds[i].p_type == PT_LOAD) {
+			last_idx = i;
+			if (first_idx == -1)
+				first_idx = i;
+		}
+	}
+	if (first_idx == -1)
+		return 0;
+
+	return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+				ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
    an ELF header */
 
 static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-		struct file *interpreter, unsigned long *interp_load_addr)
+		struct file *interpreter, unsigned long *interp_map_addr,
+		unsigned long no_base)
 {
 	struct elf_phdr *elf_phdata;
 	struct elf_phdr *eppnt;
@@ -332,6 +369,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	int load_addr_set = 0;
 	unsigned long last_bss = 0, elf_bss = 0;
 	unsigned long error = ~0UL;
+	unsigned long total_size;
 	int retval, i, size;
 
 	/* First of all, some simple consistency checks */
@@ -370,6 +408,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		goto out_close;
 	}
 
+	total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+	if (!total_size) {
+		error = -EINVAL;
+		goto out_close;
+	}
+
 	eppnt = elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
 		if (eppnt->p_type == PT_LOAD) {
@@ -387,9 +431,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			vaddr = eppnt->p_vaddr;
 			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
 				elf_type |= MAP_FIXED;
+			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					   eppnt, elf_prot, elf_type);
+					   eppnt, elf_prot, elf_type, total_size);
+			total_size = 0;
+			if (!*interp_map_addr)
+				*interp_map_addr = map_addr;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
 				goto out_close;
@@ -455,8 +504,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			goto out_close;
 	}
 
-	*interp_load_addr = load_addr;
-	error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
+	error = load_addr;
 
 out_close:
 	kfree(elf_phdata);
@@ -553,7 +601,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	int elf_exec_fileno;
 	int retval, i;
 	unsigned int size;
-	unsigned long elf_entry, interp_load_addr = 0;
+	unsigned long elf_entry;
+	unsigned long interp_load_addr = 0;
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long reloc_func_desc = 0;
 	char passed_fileno[6];
@@ -825,9 +874,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	current->mm->start_stack = bprm->p;
 
 	/* Now we do a little grungy work by mmaping the ELF image into
-	   the correct location in memory.  At this point, we assume that
-	   the image should be loaded at fixed address, not at a variable
-	   address. */
+	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
 	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
 		int elf_prot = 0, elf_flags;
@@ -881,11 +928,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
+#ifdef CONFIG_X86
+			load_bias = 0;
+#else
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
+#endif
 		}
 
 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-				elf_prot, elf_flags);
+				elf_prot, elf_flags,0);
 		if (BAD_ADDR(error)) {
 			send_sig(SIGKILL, current, 0);
 			retval = IS_ERR((void *)error) ?
@@ -961,13 +1012,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	}
 
 	if (elf_interpreter) {
-		if (interpreter_type == INTERPRETER_AOUT)
+		if (interpreter_type == INTERPRETER_AOUT) {
 			elf_entry = load_aout_interp(&loc->interp_ex,
 						     interpreter);
-		else
+		} else {
+			unsigned long uninitialized_var(interp_map_addr);
+
 			elf_entry = load_elf_interp(&loc->interp_elf_ex,
 						    interpreter,
-						    &interp_load_addr);
+						    &interp_map_addr,
+						    load_bias);
+			if (!IS_ERR((void *)elf_entry)) {
+				/*
+				 * load_elf_interp() returns relocation
+				 * adjustment
+				 */
+				interp_load_addr = elf_entry;
+				elf_entry += loc->interp_elf_ex.e_entry;
+			}
+		}
 		if (BAD_ADDR(elf_entry)) {
 			force_sig(SIGSEGV, current);
 			retval = IS_ERR((void *)elf_entry) ?
-- 
cgit v1.2.3


From bb1ad8205be4cb95e3286d7442596da6fd70409f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 30 Jan 2008 13:31:07 +0100
Subject: x86: PIE executable randomization, checkpatch fixes

#39: FILE: arch/ia64/ia32/binfmt_elf32.c:229:
+elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused)

WARNING: no space between function name and open parenthesis '('
#39: FILE: arch/ia64/ia32/binfmt_elf32.c:229:
+elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long unused)

WARNING: line over 80 characters
#67: FILE: arch/x86/kernel/sys_x86_64.c:80:
+			new_begin = randomize_range(*begin, *begin + 0x02000000, 0);

ERROR: use tabs not spaces
#110: FILE: arch/x86/kernel/sys_x86_64.c:185:
+ ^I        mm->cached_hole_size = 0;$

ERROR: use tabs not spaces
#111: FILE: arch/x86/kernel/sys_x86_64.c:186:
+ ^I^Imm->free_area_cache = mm->mmap_base;$

ERROR: use tabs not spaces
#112: FILE: arch/x86/kernel/sys_x86_64.c:187:
+ ^I}$

ERROR: use tabs not spaces
#141: FILE: arch/x86/kernel/sys_x86_64.c:216:
+ ^I^I/* remember the largest hole we saw so far */$

ERROR: use tabs not spaces
#142: FILE: arch/x86/kernel/sys_x86_64.c:217:
+ ^I^Iif (addr + mm->cached_hole_size < vma->vm_start)$

ERROR: use tabs not spaces
#143: FILE: arch/x86/kernel/sys_x86_64.c:218:
+ ^I^I        mm->cached_hole_size = vma->vm_start - addr;$

ERROR: use tabs not spaces
#157: FILE: arch/x86/kernel/sys_x86_64.c:232:
+  ^Imm->free_area_cache = TASK_UNMAPPED_BASE;$

ERROR: need a space before the open parenthesis '('
#291: FILE: arch/x86/mm/mmap_64.c:101:
+	} else if(mmap_is_legacy()) {

WARNING: braces {} are not necessary for single statement blocks
#302: FILE: arch/x86/mm/mmap_64.c:112:
+	if (current->flags & PF_RANDOMIZE) {
+		mm->mmap_base += ((long)rnd) << PAGE_SHIFT;
+	}

WARNING: line over 80 characters
#314: FILE: fs/binfmt_elf.c:48:
+static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);

WARNING: no space between function name and open parenthesis '('
#314: FILE: fs/binfmt_elf.c:48:
+static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);

WARNING: line over 80 characters
#429: FILE: fs/binfmt_elf.c:438:
+					   eppnt, elf_prot, elf_type, total_size);

ERROR: need space after that ',' (ctx:VxV)
#480: FILE: fs/binfmt_elf.c:939:
+				elf_prot, elf_flags,0);
 				                   ^

total: 9 errors, 7 warnings, 461 lines checked
Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

Please run checkpatch prior to sending patches

Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/binfmt_elf.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8193d24be159..b8bca1ebc1a0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -45,7 +45,8 @@
 
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
-static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long);
+static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
+				int, int, unsigned long);
 
 /*
  * If we don't support core dumping, then supply a NULL so we
@@ -435,7 +436,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					   eppnt, elf_prot, elf_type, total_size);
+					eppnt, elf_prot, elf_type, total_size);
 			total_size = 0;
 			if (!*interp_map_addr)
 				*interp_map_addr = map_addr;
@@ -936,7 +937,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		}
 
 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-				elf_prot, elf_flags,0);
+				elf_prot, elf_flags, 0);
 		if (BAD_ADDR(error)) {
 			send_sig(SIGKILL, current, 0);
 			retval = IS_ERR((void *)error) ?
-- 
cgit v1.2.3


From 56c4da454de1264e381256f658f61b9ef690dd21 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 30 Jan 2008 13:31:17 +0100
Subject: core: remove last users of empty FASTCALL macro

FASTCALL is always empty after the x86 removal.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 9dec7d2d546e..8a37dbbf3437 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -397,7 +397,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
  * This prevents races between the aio code path referencing the
  * req (after submitting it) and aio_complete() freeing the req.
  */
-static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx));
+static struct kiocb *__aio_get_req(struct kioctx *ctx);
 static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req = NULL;
-- 
cgit v1.2.3


From 95c354fe9f7d6decc08a92aa26eb233ecc2155bf Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 30 Jan 2008 13:31:20 +0100
Subject: spinlock: lockbreak cleanup

The break_lock data structure and code for spinlocks is quite nasty.
Not only does it double the size of a spinlock but it changes locking to
a potentially less optimal trylock.

Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a
__raw_spin_is_contended that uses the lock data itself to determine whether
there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is
not set.

Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to
decouple it from the spinlock implementation, and make it typesafe (rwlocks
do not have any need_lockbreak sites -- why do they even get bloated up
with that break_lock then?).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jbd/checkpoint.c  | 3 ++-
 fs/jbd/commit.c      | 2 +-
 fs/jbd2/checkpoint.c | 3 ++-
 fs/jbd2/commit.c     | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 0f69c416eebc..a5432bbbfb88 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -347,7 +347,8 @@ restart:
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
-			if (!retry && lock_need_resched(&journal->j_list_lock)){
+			if (!retry && (need_resched() ||
+				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 610264b99a8e..31853eb65b4c 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -265,7 +265,7 @@ write_out_data:
 			put_bh(bh);
 		}
 
-		if (lock_need_resched(&journal->j_list_lock)) {
+		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 			spin_unlock(&journal->j_list_lock);
 			goto write_out_data;
 		}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1b7f282c1ae9..6914598022ce 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -353,7 +353,8 @@ restart:
 			}
 			retry = __process_buffer(journal, jh, bhs, &batch_count,
 						 transaction);
-			if (!retry && lock_need_resched(&journal->j_list_lock)){
+			if (!retry && (need_resched() ||
+				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index da8d0eb3b7b9..4f302d279279 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -341,7 +341,7 @@ write_out_data:
 			put_bh(bh);
 		}
 
-		if (lock_need_resched(&journal->j_list_lock)) {
+		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 			spin_unlock(&journal->j_list_lock);
 			goto write_out_data;
 		}
-- 
cgit v1.2.3


From 3aba481fc94d83ff630d4b7cd2f7447010c4c6df Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:44 +0100
Subject: elf core dump: notes reorg

This pulls out the code for writing the notes segment of an ELF core dump
into separate functions.  This cleanly isolates into one cluster of
functions everything that deals with the note formats and the hooks into
arch code to fill them.  The top-level elf_core_dump function itself now
deals purely with the generic ELF format and the memory segments.

This only moves code around into functions that can be inlined away.
It should not change any behavior at all.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/binfmt_elf.c | 324 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 194 insertions(+), 130 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b8bca1ebc1a0..4510429b973e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1395,7 +1395,8 @@ static int writenote(struct memelfnote *men, struct file *file,
 	if (!dump_seek(file, (off))) \
 		goto end_coredump;
 
-static void fill_elf_header(struct elfhdr *elf, int segs)
+static void fill_elf_header(struct elfhdr *elf, int segs,
+			    u16 machine, u32 flags, u8 osabi)
 {
 	memcpy(elf->e_ident, ELFMAG, SELFMAG);
 	elf->e_ident[EI_CLASS] = ELF_CLASS;
@@ -1405,12 +1406,12 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
 	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
 
 	elf->e_type = ET_CORE;
-	elf->e_machine = ELF_ARCH;
+	elf->e_machine = machine;
 	elf->e_version = EV_CURRENT;
 	elf->e_entry = 0;
 	elf->e_phoff = sizeof(struct elfhdr);
 	elf->e_shoff = 0;
-	elf->e_flags = ELF_CORE_EFLAGS;
+	elf->e_flags = flags;
 	elf->e_ehsize = sizeof(struct elfhdr);
 	elf->e_phentsize = sizeof(struct elf_phdr);
 	elf->e_phnum = segs;
@@ -1517,6 +1518,16 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	return 0;
 }
 
+static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
+{
+	elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
+	int i = 0;
+	do
+		i += 2;
+	while (auxv[i - 2] != AT_NULL);
+	fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+}
+
 /* Here is the structure in which status of each thread is captured. */
 struct elf_thread_status
 {
@@ -1569,6 +1580,174 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 	return sz;
 }
 
+struct elf_note_info {
+	struct memelfnote *notes;
+	struct elf_prstatus *prstatus;	/* NT_PRSTATUS */
+	struct elf_prpsinfo *psinfo;	/* NT_PRPSINFO */
+	struct list_head thread_list;
+	elf_fpregset_t *fpu;
+#ifdef ELF_CORE_COPY_XFPREGS
+	elf_fpxregset_t *xfpu;
+#endif
+	int thread_status_size;
+	int numnote;
+};
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+			  struct elf_note_info *info,
+			  long signr, struct pt_regs *regs)
+{
+#define	NUM_NOTES	6
+	struct list_head *t;
+	struct task_struct *g, *p;
+
+	info->notes = NULL;
+	info->prstatus = NULL;
+	info->psinfo = NULL;
+	info->fpu = NULL;
+#ifdef ELF_CORE_COPY_XFPREGS
+	info->xfpu = NULL;
+#endif
+	INIT_LIST_HEAD(&info->thread_list);
+
+	info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
+			      GFP_KERNEL);
+	if (!info->notes)
+		return 0;
+	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
+	if (!info->psinfo)
+		return 0;
+	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
+	if (!info->prstatus)
+		return 0;
+	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
+	if (!info->fpu)
+		return 0;
+#ifdef ELF_CORE_COPY_XFPREGS
+	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
+	if (!info->xfpu)
+		return 0;
+#endif
+
+	info->thread_status_size = 0;
+	if (signr) {
+		struct elf_thread_status *tmp;
+		rcu_read_lock();
+		do_each_thread(g, p)
+			if (current->mm == p->mm && current != p) {
+				tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
+				if (!tmp) {
+					rcu_read_unlock();
+					return 0;
+				}
+				tmp->thread = p;
+				list_add(&tmp->list, &info->thread_list);
+			}
+		while_each_thread(g, p);
+		rcu_read_unlock();
+		list_for_each(t, &info->thread_list) {
+			struct elf_thread_status *tmp;
+			int sz;
+
+			tmp = list_entry(t, struct elf_thread_status, list);
+			sz = elf_dump_thread_status(signr, tmp);
+			info->thread_status_size += sz;
+		}
+	}
+	/* now collect the dump for the current */
+	memset(info->prstatus, 0, sizeof(*info->prstatus));
+	fill_prstatus(info->prstatus, current, signr);
+	elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+
+	/* Set up header */
+	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+
+	/*
+	 * Set up the notes in similar form to SVR4 core dumps made
+	 * with info from their /proc.
+	 */
+
+	fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
+		  sizeof(*info->prstatus), info->prstatus);
+	fill_psinfo(info->psinfo, current->group_leader, current->mm);
+	fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
+		  sizeof(*info->psinfo), info->psinfo);
+
+	info->numnote = 2;
+
+	fill_auxv_note(&info->notes[info->numnote++], current->mm);
+
+	/* Try to dump the FPU. */
+	info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
+							       info->fpu);
+	if (info->prstatus->pr_fpvalid)
+		fill_note(info->notes + info->numnote++,
+			  "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+	if (elf_core_copy_task_xfpregs(current, info->xfpu))
+		fill_note(info->notes + info->numnote++,
+			  "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(*info->xfpu), info->xfpu);
+#endif
+
+	return 1;
+
+#undef NUM_NOTES
+}
+
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+	int sz = 0;
+	int i;
+
+	for (i = 0; i < info->numnote; i++)
+		sz += notesize(info->notes + i);
+
+	sz += info->thread_status_size;
+
+	return sz;
+}
+
+static int write_note_info(struct elf_note_info *info,
+			   struct file *file, loff_t *foffset)
+{
+	int i;
+	struct list_head *t;
+
+	for (i = 0; i < info->numnote; i++)
+		if (!writenote(info->notes + i, file, foffset))
+			return 0;
+
+	/* write out the thread status notes section */
+	list_for_each(t, &info->thread_list) {
+		struct elf_thread_status *tmp =
+				list_entry(t, struct elf_thread_status, list);
+
+		for (i = 0; i < tmp->num_notes; i++)
+			if (!writenote(&tmp->notes[i], file, foffset))
+				return 0;
+	}
+
+	return 1;
+}
+
+static void free_note_info(struct elf_note_info *info)
+{
+	while (!list_empty(&info->thread_list)) {
+		struct list_head *tmp = info->thread_list.next;
+		list_del(tmp);
+		kfree(list_entry(tmp, struct elf_thread_status, list));
+	}
+
+	kfree(info->prstatus);
+	kfree(info->psinfo);
+	kfree(info->notes);
+	kfree(info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+	kfree(info->xfpu);
+#endif
+}
+
 static struct vm_area_struct *first_vma(struct task_struct *tsk,
 					struct vm_area_struct *gate_vma)
 {
@@ -1604,29 +1783,15 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
  */
 static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
 {
-#define	NUM_NOTES	6
 	int has_dumped = 0;
 	mm_segment_t fs;
 	int segs;
 	size_t size = 0;
-	int i;
 	struct vm_area_struct *vma, *gate_vma;
 	struct elfhdr *elf = NULL;
 	loff_t offset = 0, dataoff, foffset;
-	int numnote;
-	struct memelfnote *notes = NULL;
-	struct elf_prstatus *prstatus = NULL;	/* NT_PRSTATUS */
-	struct elf_prpsinfo *psinfo = NULL;	/* NT_PRPSINFO */
- 	struct task_struct *g, *p;
- 	LIST_HEAD(thread_list);
- 	struct list_head *t;
-	elf_fpregset_t *fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
-	elf_fpxregset_t *xfpu = NULL;
-#endif
-	int thread_status_size = 0;
-	elf_addr_t *auxv;
 	unsigned long mm_flags;
+	struct elf_note_info info;
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1644,52 +1809,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
 	if (!elf)
 		goto cleanup;
-	prstatus = kmalloc(sizeof(*prstatus), GFP_KERNEL);
-	if (!prstatus)
-		goto cleanup;
-	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
-	if (!psinfo)
-		goto cleanup;
-	notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
-	if (!notes)
-		goto cleanup;
-	fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
-	if (!fpu)
-		goto cleanup;
-#ifdef ELF_CORE_COPY_XFPREGS
-	xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
-	if (!xfpu)
-		goto cleanup;
-#endif
-
-	if (signr) {
-		struct elf_thread_status *tmp;
-		rcu_read_lock();
-		do_each_thread(g,p)
-			if (current->mm == p->mm && current != p) {
-				tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
-				if (!tmp) {
-					rcu_read_unlock();
-					goto cleanup;
-				}
-				tmp->thread = p;
-				list_add(&tmp->list, &thread_list);
-			}
-		while_each_thread(g,p);
-		rcu_read_unlock();
-		list_for_each(t, &thread_list) {
-			struct elf_thread_status *tmp;
-			int sz;
-
-			tmp = list_entry(t, struct elf_thread_status, list);
-			sz = elf_dump_thread_status(signr, tmp);
-			thread_status_size += sz;
-		}
-	}
-	/* now collect the dump for the current */
-	memset(prstatus, 0, sizeof(*prstatus));
-	fill_prstatus(prstatus, current, signr);
-	elf_core_copy_regs(&prstatus->pr_reg, regs);
 	
 	segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
@@ -1700,42 +1819,16 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 	if (gate_vma != NULL)
 		segs++;
 
-	/* Set up header */
-	fill_elf_header(elf, segs + 1);	/* including notes section */
-
-	has_dumped = 1;
-	current->flags |= PF_DUMPCORE;
-
 	/*
-	 * Set up the notes in similar form to SVR4 core dumps made
-	 * with info from their /proc.
+	 * Collect all the non-memory information about the process for the
+	 * notes.  This also sets up the file header.
 	 */
+	if (!fill_note_info(elf, segs + 1, /* including notes section */
+			    &info, signr, regs))
+		goto cleanup;
 
-	fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
-	fill_psinfo(psinfo, current->group_leader, current->mm);
-	fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
-	
-	numnote = 2;
-
-	auxv = (elf_addr_t *)current->mm->saved_auxv;
-
-	i = 0;
-	do
-		i += 2;
-	while (auxv[i - 2] != AT_NULL);
-	fill_note(&notes[numnote++], "CORE", NT_AUXV,
-		  i * sizeof(elf_addr_t), auxv);
-
-  	/* Try to dump the FPU. */
-	if ((prstatus->pr_fpvalid =
-	     elf_core_copy_task_fpregs(current, regs, fpu)))
-		fill_note(notes + numnote++,
-			  "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
-#ifdef ELF_CORE_COPY_XFPREGS
-	if (elf_core_copy_task_xfpregs(current, xfpu))
-		fill_note(notes + numnote++,
-			  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
-#endif	
+	has_dumped = 1;
+	current->flags |= PF_DUMPCORE;
   
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -1748,12 +1841,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 	/* Write notes phdr entry */
 	{
 		struct elf_phdr phdr;
-		int sz = 0;
-
-		for (i = 0; i < numnote; i++)
-			sz += notesize(notes + i);
-		
-		sz += thread_status_size;
+		size_t sz = get_note_info_size(&info);
 
 		sz += elf_coredump_extra_notes_size();
 
@@ -1798,23 +1886,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 #endif
 
  	/* write out the notes section */
-	for (i = 0; i < numnote; i++)
-		if (!writenote(notes + i, file, &foffset))
-			goto end_coredump;
+	if (!write_note_info(&info, file, &foffset))
+		goto end_coredump;
 
 	if (elf_coredump_extra_notes_write(file, &foffset))
 		goto end_coredump;
 
-	/* write out the thread status notes section */
-	list_for_each(t, &thread_list) {
-		struct elf_thread_status *tmp =
-				list_entry(t, struct elf_thread_status, list);
-
-		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], file, &foffset))
-				goto end_coredump;
-	}
-
 	/* Align to page */
 	DUMP_SEEK(dataoff - foffset);
 
@@ -1865,22 +1942,9 @@ end_coredump:
 	set_fs(fs);
 
 cleanup:
-	while (!list_empty(&thread_list)) {
-		struct list_head *tmp = thread_list.next;
-		list_del(tmp);
-		kfree(list_entry(tmp, struct elf_thread_status, list));
-	}
-
 	kfree(elf);
-	kfree(prstatus);
-	kfree(psinfo);
-	kfree(notes);
-	kfree(fpu);
-#ifdef ELF_CORE_COPY_XFPREGS
-	kfree(xfpu);
-#endif
+	free_note_info(&info);
 	return has_dumped;
-#undef NUM_NOTES
 }
 
 #endif		/* USE_ELF_CORE_DUMP */
-- 
cgit v1.2.3


From 4206d3aa1978e44f58bfa4e1c9d8d35cbf19c187 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:45 +0100
Subject: elf core dump: notes user_regset

This modifies the ELF core dump code under #ifdef CORE_DUMP_USE_REGSET.
It changes nothing when this macro is not defined.  When it's #define'd
by some arch header (e.g. asm/elf.h), the arch must support the
user_regset (linux/regset.h) interface for reading thread state.

This provides an alternate version of note segment writing that is based
purely on the user_regset interfaces.  When CORE_DUMP_USE_REGSET is set,
the arch need not define macros such as ELF_CORE_COPY_REGS and ELF_ARCH.
All that information is taken from the user_regset data structures.
The core dumps come out exactly the same if arch's definitions for its
user_regset details are correct.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/binfmt_elf.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4510429b973e..786ee275ec0a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1528,6 +1528,228 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
 	fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
 }
 
+#ifdef CORE_DUMP_USE_REGSET
+#include <linux/regset.h>
+
+struct elf_thread_core_info {
+	struct elf_thread_core_info *next;
+	struct task_struct *task;
+	struct elf_prstatus prstatus;
+	struct memelfnote notes[0];
+};
+
+struct elf_note_info {
+	struct elf_thread_core_info *thread;
+	struct memelfnote psinfo;
+	struct memelfnote auxv;
+	size_t size;
+	int thread_notes;
+};
+
+/*
+ * Fill t->notes[] with one thread's register notes, adding each note's size
+ * to *total.  Returns 1 on success, or 0 if a note buffer cannot be allocated.
+ */
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+				 const struct user_regset_view *view,
+				 long signr, size_t *total)
+{
+	unsigned int i, n = 1;	/* n: next free slot in t->notes[] */
+
+	/*
+	 * NT_PRSTATUS is the one special case, because the regset data
+	 * goes into the pr_reg field inside the note contents, rather
+	 * than being the whole note contents.  We fill the rest in here.
+	 * We assume that regset 0 is NT_PRSTATUS.
+	 */
+	fill_prstatus(&t->prstatus, t->task, signr);
+	(void) view->regsets[0].get(t->task, &view->regsets[0],
+				    0, sizeof(t->prstatus.pr_reg),
+				    &t->prstatus.pr_reg, NULL);
+
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+		  sizeof(t->prstatus), &t->prstatus);
+	*total += notesize(&t->notes[0]);
+
+	/*
+	 * Each other regset might generate a note too.  Pack them densely
+	 * from slot 1: t->notes[] is only sized for regsets that have a
+	 * core_note_type, so indexing by regset number could overrun it.
+	 */
+	for (i = 1; i < view->n; ++i) {
+		const struct user_regset *regset = &view->regsets[i];
+		size_t size = regset->n * regset->size;
+		void *data;
+
+		if (!regset->core_note_type ||
+		    (regset->active && !regset->active(t->task, regset)))
+			continue;
+		data = kmalloc(size, GFP_KERNEL);
+		if (unlikely(!data))
+			return 0;
+		if (unlikely(regset->get(t->task, regset,
+					 0, size, data, NULL))) {
+			kfree(data);	/* nonzero from get(): skip this regset */
+			continue;
+		}
+		if (regset->core_note_type == NT_PRFPREG)
+			t->prstatus.pr_fpvalid = 1;
+		fill_note(&t->notes[n], regset->core_note_type == NT_PRFPREG ?
+			  "CORE" : "LINUX", regset->core_note_type, size, data);
+		*total += notesize(&t->notes[n]);
+		++n;
+	}
+
+	return 1;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+			  struct elf_note_info *info,
+			  long signr, struct pt_regs *regs)
+{
+	struct task_struct *dump_task = current;
+	const struct user_regset_view *view = task_user_regset_view(dump_task);
+	struct elf_thread_core_info *t;
+	struct elf_prpsinfo *psinfo;
+	struct task_struct *g, *p;
+	unsigned int i;
+
+	info->size = 0;
+	info->thread = NULL;
+
+	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
+	fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+
+	if (psinfo == NULL)
+		return 0;
+
+	/*
+	 * Figure out how many notes we're going to need for each thread.
+	 */
+	info->thread_notes = 0;
+	for (i = 0; i < view->n; ++i)
+		if (view->regsets[i].core_note_type != 0)
+			++info->thread_notes;
+
+	/*
+	 * Sanity check.  We rely on regset 0 being in NT_PRSTATUS,
+	 * since it is our one special case.
+	 */
+	if (unlikely(info->thread_notes == 0) ||
+	    unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	/*
+	 * Initialize the ELF file header.
+	 */
+	fill_elf_header(elf, phdrs,
+			view->e_machine, view->e_flags, view->ei_osabi);
+
+	/*
+	 * Allocate a structure for each thread.
+	 */
+	rcu_read_lock();
+	do_each_thread(g, p)
+		if (p->mm == dump_task->mm) {
+			t = kzalloc(offsetof(struct elf_thread_core_info,
+					     notes[info->thread_notes]),
+				    GFP_ATOMIC);
+			if (unlikely(!t)) {
+				rcu_read_unlock();
+				return 0;
+			}
+			t->task = p;
+			if (p == dump_task || !info->thread) {
+				t->next = info->thread;
+				info->thread = t;
+			} else {
+				/*
+				 * Make sure to keep the original task at
+				 * the head of the list.
+				 */
+				t->next = info->thread->next;
+				info->thread->next = t;
+			}
+		}
+	while_each_thread(g, p);
+	rcu_read_unlock();
+
+	/*
+	 * Now fill in each thread's information.
+	 */
+	for (t = info->thread; t != NULL; t = t->next)
+		if (!fill_thread_core_info(t, view, signr, &info->size))
+			return 0;
+
+	/*
+	 * Fill in the two process-wide notes.
+	 */
+	fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
+	info->size += notesize(&info->psinfo);
+
+	fill_auxv_note(&info->auxv, current->mm);
+	info->size += notesize(&info->auxv);
+
+	return 1;
+}
+
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+	return info->size;
+}
+
+/*
+ * Write all the notes for each thread.  When writing the first thread, the
+ * process-wide notes are interleaved after the first thread-specific note.
+ */
+static int write_note_info(struct elf_note_info *info,
+			   struct file *file, loff_t *foffset)
+{
+	bool first = 1;
+	struct elf_thread_core_info *t = info->thread;
+
+	do {
+		int i;
+
+		if (!writenote(&t->notes[0], file, foffset))
+			return 0;
+
+		if (first && !writenote(&info->psinfo, file, foffset))
+			return 0;
+		if (first && !writenote(&info->auxv, file, foffset))
+			return 0;
+
+		for (i = 1; i < info->thread_notes; ++i)
+			if (t->notes[i].data &&
+			    !writenote(&t->notes[i], file, foffset))
+				return 0;
+
+		first = 0;
+		t = t->next;
+	} while (t);
+
+	return 1;
+}
+
+static void free_note_info(struct elf_note_info *info)
+{
+	struct elf_thread_core_info *threads = info->thread;
+	while (threads) {
+		unsigned int i;
+		struct elf_thread_core_info *t = threads;
+		threads = t->next;
+		WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
+		for (i = 1; i < info->thread_notes; ++i)
+			kfree(t->notes[i].data);
+		kfree(t);
+	}
+	kfree(info->psinfo.data);
+}
+
+#else
+
 /* Here is the structure in which status of each thread is captured. */
 struct elf_thread_status
 {
@@ -1748,6 +1970,8 @@ static void free_note_info(struct elf_note_info *info)
 #endif
 }
 
+#endif
+
 static struct vm_area_struct *first_vma(struct task_struct *tsk,
 					struct vm_area_struct *gate_vma)
 {
-- 
cgit v1.2.3


From 2f79e48ae2651fff08d08dab3acf1294467c1155 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:46 +0100
Subject: x86: compat_binfmt_elf

This adds fs/compat_binfmt_elf.c, a wrapper around fs/binfmt_elf.c for
32-bit ELF support on 64-bit kernels.  It can replace all the hand-rolled
versions of this that each 32/64 arch has, which are all about the same.

To use this, an arch's asm/elf.h has to define at least a few compat_*
macros that parallel the various macros that fs/binfmt_elf.c uses for
native support.

There is no attempt to deal with compat macros for the core dump format
support.  To use this file, the arch has to define compat_gregset_t for
linux/elfcore-compat.h and #define CORE_DUMP_USE_REGSET.  The 32-bit
compatible formats should come automatically from task_user_regset_view
called on a 32-bit task.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/compat_binfmt_elf.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 fs/compat_binfmt_elf.c

(limited to 'fs')

diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
new file mode 100644
index 000000000000..0adced2f296f
--- /dev/null
+++ b/fs/compat_binfmt_elf.c
@@ -0,0 +1,131 @@
+/*
+ * 32-bit compatibility support for ELF format executables and core dumps.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ *
+ * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
+ * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros
+ * used below, with definitions appropriate for 32-bit ABI compatibility.
+ *
+ * We use macros to rename the ABI types and machine-dependent
+ * functions used in binfmt_elf.c to compat versions.
+ */
+
+#include <linux/elfcore-compat.h>
+#include <linux/time.h>
+
+/*
+ * Rename the basic ELF layout types to refer to the 32-bit class of files.
+ */
+#undef	ELF_CLASS
+#define ELF_CLASS	ELFCLASS32
+
+#undef	elfhdr
+#undef	elf_phdr
+#undef	elf_note
+#undef	elf_addr_t
+#define elfhdr		elf32_hdr
+#define elf_phdr	elf32_phdr
+#define elf_note	elf32_note
+#define elf_addr_t	Elf32_Addr
+
+/*
+ * The machine-dependent core note format types are defined in elfcore-compat.h,
+ * which requires asm/elf.h to define compat_elf_gregset_t et al.
+ */
+#define elf_prstatus	compat_elf_prstatus
+#define elf_prpsinfo	compat_elf_prpsinfo
+
+/*
+ * Compat version of cputime_to_timeval, perhaps this
+ * should be an inline in <linux/compat.h>.
+ */
+static void cputime_to_compat_timeval(const cputime_t cputime,
+				      struct compat_timeval *value)
+{
+	struct timeval tv;
+	cputime_to_timeval(cputime, &tv);
+	value->tv_sec = tv.tv_sec;
+	value->tv_usec = tv.tv_usec;
+}
+
+#undef cputime_to_timeval
+#define cputime_to_timeval cputime_to_compat_timeval
+
+
+/*
+ * To use this file, asm/elf.h must define compat_elf_check_arch.
+ * The other following macros can be defined if the compat versions
+ * differ from the native ones, or omitted when they match.
+ */
+
+#undef	ELF_ARCH
+#undef	elf_check_arch
+#define	elf_check_arch	compat_elf_check_arch
+
+#ifdef	COMPAT_ELF_PLATFORM
+#undef	ELF_PLATFORM
+#define	ELF_PLATFORM		COMPAT_ELF_PLATFORM
+#endif
+
+#ifdef	COMPAT_ELF_HWCAP
+#undef	ELF_HWCAP
+#define	ELF_HWCAP		COMPAT_ELF_HWCAP
+#endif
+
+#ifdef	COMPAT_ARCH_DLINFO
+#undef	ARCH_DLINFO
+#define	ARCH_DLINFO		COMPAT_ARCH_DLINFO
+#endif
+
+#ifdef	COMPAT_ELF_ET_DYN_BASE
+#undef	ELF_ET_DYN_BASE
+#define	ELF_ET_DYN_BASE		COMPAT_ELF_ET_DYN_BASE
+#endif
+
+#ifdef COMPAT_ELF_EXEC_PAGESIZE
+#undef	ELF_EXEC_PAGESIZE
+#define	ELF_EXEC_PAGESIZE	COMPAT_ELF_EXEC_PAGESIZE
+#endif
+
+#ifdef	COMPAT_ELF_PLAT_INIT
+#undef	ELF_PLAT_INIT
+#define	ELF_PLAT_INIT		COMPAT_ELF_PLAT_INIT
+#endif
+
+#ifdef	COMPAT_SET_PERSONALITY
+#undef	SET_PERSONALITY
+#define	SET_PERSONALITY		COMPAT_SET_PERSONALITY
+#endif
+
+#ifdef	compat_start_thread
+#undef	start_thread
+#define	start_thread		compat_start_thread
+#endif
+
+#ifdef	compat_arch_setup_additional_pages
+#undef	ARCH_HAS_SETUP_ADDITIONAL_PAGES
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+#undef	arch_setup_additional_pages
+#define	arch_setup_additional_pages compat_arch_setup_additional_pages
+#endif
+
+/*
+ * Rename a few of the symbols that binfmt_elf.c will define.
+ * These are all local so the names don't really matter, but it
+ * might make some debugging less confusing not to duplicate them.
+ */
+#define elf_format		compat_elf_format
+#define init_elf_binfmt		init_compat_elf_binfmt
+#define exit_elf_binfmt		exit_compat_elf_binfmt
+
+/*
+ * We share all the actual code with the native (64-bit) version.
+ */
+#include "binfmt_elf.c"
-- 
cgit v1.2.3


From b9d36d5d000294a128f7f174fe67623a10e29d61 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:46 +0100
Subject: x86: compat_binfmt_elf Kconfig

This adds Kconfig and Makefile bits to build fs/compat_binfmt_elf.c,
just added.  Each arch that wants to use this file needs to add a
"select COMPAT_BINFMT_ELF" line in its Kconfig bits that enable COMPAT.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/Kconfig.binfmt | 4 ++++
 fs/Makefile       | 1 +
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index d4fc6095466d..7c3d5f923da1 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -23,6 +23,10 @@ config BINFMT_ELF
 	  ld.so (check the file <file:Documentation/Changes> for location and
 	  latest version).
 
+config COMPAT_BINFMT_ELF
+	bool
+	depends on COMPAT && MMU
+
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
diff --git a/fs/Makefile b/fs/Makefile
index 500cf15cdb4b..1e7a11bd4da1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
 obj-y				+= binfmt_script.o
 
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
-- 
cgit v1.2.3


From 612a95b4e053b8a06319049191fd2dce9c970189 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 30 Jan 2008 13:33:32 +0100
Subject: x86: remove iBCS support

iBCS2 has never been supported on 2.6 kernels as far as I know, and if
it was, it must have been through an external patch.  Anyways, if anybody
applies an external patch they could as well readd the ibcs checking
code to the ELF loader in the same patch.  But there is no reason to
keep this code running in all Linux kernels.  This will save at least
two strcmps each ELF execution.

No deprecation period because it could not have been used anyway.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/binfmt_elf.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 786ee275ec0a..18ed6dd906c1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -595,7 +595,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	int load_addr_set = 0;
 	char * elf_interpreter = NULL;
 	unsigned int interpreter_type = INTERPRETER_NONE;
-	unsigned char ibcs2_interpreter = 0;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
@@ -713,14 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
 				goto out_free_interp;
 
-			/* If the program interpreter is one of these two,
-			 * then assume an iBCS2 image. Otherwise assume
-			 * a native linux image.
-			 */
-			if (strcmp(elf_interpreter,"/usr/lib/libc.so.1") == 0 ||
-			    strcmp(elf_interpreter,"/usr/lib/ld.so.1") == 0)
-				ibcs2_interpreter = 1;
-
 			/*
 			 * The early SET_PERSONALITY here is so that the lookup
 			 * for the interpreter happens in the namespace of the 
@@ -740,7 +731,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * switch really is going to happen - do this in
 			 * flush_thread().	- akpm
 			 */
-			SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+			SET_PERSONALITY(loc->elf_ex, 0);
 
 			interpreter = open_exec(elf_interpreter);
 			retval = PTR_ERR(interpreter);
@@ -819,7 +810,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			goto out_free_dentry;
 	} else {
 		/* Executables without an interpreter also need a personality  */
-		SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+		SET_PERSONALITY(loc->elf_ex, 0);
 	}
 
 	/* OK, we are done with that, now set up the arg stuff,
@@ -853,7 +844,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
 	   may depend on the personality.  */
-	SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+	SET_PERSONALITY(loc->elf_ex, 0);
 	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-- 
cgit v1.2.3


From e7847d35ac39fe92c94540e88ac3d0e177f52d9e Mon Sep 17 00:00:00 2001
From: "Fabio M. Di Nitto" <fabbione@ubuntu.com>
Date: Wed, 30 Jan 2008 10:56:42 -0600
Subject: dlm: align midcomms message buffer

gcc does not guarantee that an auto buffer is 64bit aligned.
This change allows sparc64 to work.

Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/midcomms.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f8c69dda16a0..e69926e984db 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset,
 int dlm_process_incoming_buffer(int nodeid, const void *base,
 				unsigned offset, unsigned len, unsigned limit)
 {
-	unsigned char __tmp[DLM_INBUF_LEN];
-	struct dlm_header *msg = (struct dlm_header *) __tmp;
+	union {
+		unsigned char __buf[DLM_INBUF_LEN];
+		/* this is to force proper alignment on some arches */
+		struct dlm_header dlm;
+	} __tmp;
+	struct dlm_header *msg = &__tmp.dlm;
 	int ret = 0;
 	int err = 0;
 	uint16_t msglen;
@@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
 		   in the buffer on the stack (which should work for most
 		   ordinary messages). */
 
-		if (msglen > sizeof(__tmp) &&
-		    msg == (struct dlm_header *) __tmp) {
+		if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
 			msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
 			if (msg == NULL)
 				return ret;
@@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
 		dlm_receive_buffer(msg, nodeid);
 	}
 
-	if (msg != (struct dlm_header *) __tmp)
+	if (msg != &__tmp.dlm)
 		kfree(msg);
 
 	return err ? err : ret;
-- 
cgit v1.2.3


From 550283e30ccec5ddab9749a77b0022ebcaf0f3af Mon Sep 17 00:00:00 2001
From: "Fabio M. Di Nitto" <fabbione@ubuntu.com>
Date: Tue, 15 Jan 2008 15:13:36 -0600
Subject: dlm: swap bytes for rcom lock reply

DLM_RCOM_LOCK_REPLY messages need byte swapping.

Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/util.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 963889cf6740..38dcfeb9c4b7 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -137,7 +137,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 	rc->rc_seq		= cpu_to_le64(rc->rc_seq);
 	rc->rc_seq_reply	= cpu_to_le64(rc->rc_seq_reply);
 
-	if (type == DLM_RCOM_LOCK)
+	if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
 		rcom_lock_out((struct rcom_lock *) rc->rc_buf);
 
 	else if (type == DLM_RCOM_STATUS_REPLY)
@@ -147,6 +147,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 void dlm_rcom_in(struct dlm_rcom *rc)
 {
 	struct dlm_header *hd = (struct dlm_header *) rc;
+	int type;
 
 	header_in(hd);
 
@@ -156,10 +157,12 @@ void dlm_rcom_in(struct dlm_rcom *rc)
 	rc->rc_seq		= le64_to_cpu(rc->rc_seq);
 	rc->rc_seq_reply	= le64_to_cpu(rc->rc_seq_reply);
 
-	if (rc->rc_type == DLM_RCOM_LOCK)
+	type = rc->rc_type;
+
+	if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
 		rcom_lock_in((struct rcom_lock *) rc->rc_buf);
 
-	else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+	else if (type == DLM_RCOM_STATUS_REPLY)
 		rcom_config_in((struct rcom_config *) rc->rc_buf);
 }
 
-- 
cgit v1.2.3


From 861e2369e9e7e003677f99f22c4d1f05d3ed66d3 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 15 Jan 2008 15:43:24 -0600
Subject: dlm: use fixed errno values in messages

Some errno values differ across platforms. So if we return things like
-EINPROGRESS from one node it can get misinterpreted or rejected on
another one.

This patch fixes up the errno values passed on the wire so that they
match the x86 ones (so as not to break the protocol), and re-instates
the platform-specific ones at the other end.

Many thanks to Fabio for testing this patch.
Initial patch from Patrick.

Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/util.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 38dcfeb9c4b7..11c6a456309f 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -14,6 +14,14 @@
 #include "rcom.h"
 #include "util.h"
 
+#define DLM_ERRNO_EDEADLK		35
+#define DLM_ERRNO_EBADR			53
+#define DLM_ERRNO_EBADSLT		57
+#define DLM_ERRNO_EPROTO		71
+#define DLM_ERRNO_EOPNOTSUPP		95
+#define DLM_ERRNO_ETIMEDOUT	       110
+#define DLM_ERRNO_EINPROGRESS	       115
+
 static void header_out(struct dlm_header *hd)
 {
 	hd->h_version		= cpu_to_le32(hd->h_version);
@@ -30,6 +38,51 @@ static void header_in(struct dlm_header *hd)
 	hd->h_length		= le16_to_cpu(hd->h_length);
 }
 
+/* higher errno values are inconsistent across architectures, so select
+   one set of values for on the wire */
+
+static int to_dlm_errno(int err)
+{
+	switch (err) {
+	case -EDEADLK:
+		return -DLM_ERRNO_EDEADLK;
+	case -EBADR:
+		return -DLM_ERRNO_EBADR;
+	case -EBADSLT:
+		return -DLM_ERRNO_EBADSLT;
+	case -EPROTO:
+		return -DLM_ERRNO_EPROTO;
+	case -EOPNOTSUPP:
+		return -DLM_ERRNO_EOPNOTSUPP;
+	case -ETIMEDOUT:
+		return -DLM_ERRNO_ETIMEDOUT;
+	case -EINPROGRESS:
+		return -DLM_ERRNO_EINPROGRESS;
+	}
+	return err;
+}
+
+static int from_dlm_errno(int err)
+{
+	switch (err) {
+	case -DLM_ERRNO_EDEADLK:
+		return -EDEADLK;
+	case -DLM_ERRNO_EBADR:
+		return -EBADR;
+	case -DLM_ERRNO_EBADSLT:
+		return -EBADSLT;
+	case -DLM_ERRNO_EPROTO:
+		return -EPROTO;
+	case -DLM_ERRNO_EOPNOTSUPP:
+		return -EOPNOTSUPP;
+	case -DLM_ERRNO_ETIMEDOUT:
+		return -ETIMEDOUT;
+	case -DLM_ERRNO_EINPROGRESS:
+		return -EINPROGRESS;
+	}
+	return err;
+}
+
 void dlm_message_out(struct dlm_message *ms)
 {
 	struct dlm_header *hd = (struct dlm_header *) ms;
@@ -53,7 +106,7 @@ void dlm_message_out(struct dlm_message *ms)
 	ms->m_rqmode		= cpu_to_le32(ms->m_rqmode);
 	ms->m_bastmode		= cpu_to_le32(ms->m_bastmode);
 	ms->m_asts		= cpu_to_le32(ms->m_asts);
-	ms->m_result		= cpu_to_le32(ms->m_result);
+	ms->m_result		= cpu_to_le32(to_dlm_errno(ms->m_result));
 }
 
 void dlm_message_in(struct dlm_message *ms)
@@ -79,7 +132,7 @@ void dlm_message_in(struct dlm_message *ms)
 	ms->m_rqmode		= le32_to_cpu(ms->m_rqmode);
 	ms->m_bastmode		= le32_to_cpu(ms->m_bastmode);
 	ms->m_asts		= le32_to_cpu(ms->m_asts);
-	ms->m_result		= le32_to_cpu(ms->m_result);
+	ms->m_result		= from_dlm_errno(le32_to_cpu(ms->m_result));
 }
 
 static void rcom_lock_out(struct rcom_lock *rl)
-- 
cgit v1.2.3


From 8a358ca8e738b6226b004efea462ac28c0a2bbb1 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 7 Jan 2008 15:55:18 -0600
Subject: dlm: clear ast_type when removing from astqueue

The lkb_ast_type field indicates whether the lkb is on the astqueue list.
When clearing locks for a process, lkb's were being removed from the astqueue
list without clearing the field.  If release_lockspace then happened
immediately afterward, it could try to remove the lkb from the list a second
time.

Appears when process calls libdlm dlm_release_lockspace() which first
closes the ls dev triggering clear_proc_locks, and then removes the ls
(a write to control dev) causing release_lockspace().

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index ddb46281f34d..43ca2a30c413 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -4678,6 +4678,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 	}
 
 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+		lkb->lkb_ast_type = 0;
 		list_del(&lkb->lkb_astqueue);
 		dlm_put_lkb(lkb);
 	}
-- 
cgit v1.2.3


From 601342ce022b964f756b67f2eb99b605c1afa3ed Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 7 Jan 2008 16:15:05 -0600
Subject: dlm: recover locks waiting for overlap replies

When recovery looks at locks waiting for replies, it fails to consider
locks that have already received a reply for their first remote operation,
but not received a reply for secondary, overlapping unlock/cancel.  The
appropriate stub reply needs to be called for these waiters.

Appears when we start doing recovery in the presence of many overlapping
unlock/cancel ops.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 43ca2a30c413..a758f1b80e3b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3846,6 +3846,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb, *safe;
+	int wait_type, stub_unlock_result, stub_cancel_result;
 
 	mutex_lock(&ls->ls_waiters_mutex);
 
@@ -3864,7 +3865,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 		if (!waiter_needs_recovery(ls, lkb))
 			continue;
 
-		switch (lkb->lkb_wait_type) {
+		wait_type = lkb->lkb_wait_type;
+		stub_unlock_result = -DLM_EUNLOCK;
+		stub_cancel_result = -DLM_ECANCEL;
+
+		/* Main reply may have been received leaving a zero wait_type,
+		   but a reply for the overlapping op may not have been
+		   received.  In that case we need to fake the appropriate
+		   reply for the overlap op. */
+
+		if (!wait_type) {
+			if (is_overlap_cancel(lkb)) {
+				wait_type = DLM_MSG_CANCEL;
+				if (lkb->lkb_grmode == DLM_LOCK_IV)
+					stub_cancel_result = 0;
+			}
+			if (is_overlap_unlock(lkb)) {
+				wait_type = DLM_MSG_UNLOCK;
+				if (lkb->lkb_grmode == DLM_LOCK_IV)
+					stub_unlock_result = -ENOENT;
+			}
+
+			log_debug(ls, "rwpre overlap %x %x %d %d %d",
+				  lkb->lkb_id, lkb->lkb_flags, wait_type,
+				  stub_cancel_result, stub_unlock_result);
+		}
+
+		switch (wait_type) {
 
 		case DLM_MSG_REQUEST:
 			lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3877,7 +3904,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
 			ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
-			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+			ls->ls_stub_ms.m_result = stub_unlock_result;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
@@ -3886,15 +3913,15 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
 			ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
-			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+			ls->ls_stub_ms.m_result = stub_cancel_result;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
 
 		default:
-			log_error(ls, "invalid lkb wait_type %d",
-				  lkb->lkb_wait_type);
+			log_error(ls, "invalid lkb wait_type %d %d",
+				  lkb->lkb_wait_type, wait_type);
 		}
 		schedule();
 	}
-- 
cgit v1.2.3


From aec64e1be2225c6fc64499594d23257c6adf6168 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 8 Jan 2008 15:37:47 -0600
Subject: dlm: another call to confirm_master in receive_request_reply

When a failed request (EBADR or ENOTBLK) is unlocked/canceled instead of
retried, there may be other lkb's waiting on the rsb_lookup list for it
to complete.  A call to confirm_master() is needed to move on to the next
waiting lkb since the current one won't be retried.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index a758f1b80e3b..d5e8ea1b4f75 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1940,8 +1940,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
 		break;
 
 	case -EAGAIN:
-		/* the remote master didn't queue our NOQUEUE request;
-		   make a waiting lkb the first_lkid */
+	case -EBADR:
+	case -ENOTBLK:
+		/* the remote request failed and won't be retried (it was
+		   a NOQUEUE, or has been canceled/unlocked); make a waiting
+		   lkb the first_lkid */
 
 		r->res_first_lkid = 0;
 
@@ -3382,6 +3385,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 		if (is_overlap(lkb)) {
 			/* we'll ignore error in cancel/unlock reply */
 			queue_cast_overlap(r, lkb);
+			confirm_master(r, result);
 			unhold_lkb(lkb); /* undoes create_lkb() */
 		} else
 			_request_lock(r, lkb);
-- 
cgit v1.2.3


From 46b43eed7018bab3a4e8c259eed27697b9170cb8 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 8 Jan 2008 16:24:00 -0600
Subject: dlm: reject messages from non-members

Messages from nodes that are no longer members of the lockspace should be
ignored.  When nodes are removed from the lockspace, recovery can
sometimes complete quickly enough that messages arrive from a removed node
after recovery has completed.  When processed, these messages would often
cause an error message, and could in some cases change some state, causing
problems.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c   | 9 ++++++++-
 fs/dlm/member.c | 4 ++--
 fs/dlm/member.h | 3 ++-
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index d5e8ea1b4f75..c3b9fca17044 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -3643,6 +3643,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
 static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
 {
+	if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+		log_debug(ls, "ignore non-member message %d from %d %x %x %d",
+			  ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
+			  ms->m_remid, ms->m_result);
+		return;
+	}
+
 	switch (ms->m_type) {
 
 	/* messages sent to a master node */
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index e9cdcab306e2..fa17f5a27883 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
 	ls->ls_num_nodes--;
 }
 
-static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
 {
 	struct dlm_member *memb;
 
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 927c08c19214..7a26fca1e0b5 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls);
 void dlm_clear_members_gone(struct dlm_ls *ls);
 int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
 int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+int dlm_is_member(struct dlm_ls *ls, int nodeid);
 
 #endif                          /* __MEMBER_DOT_H__ */
 
-- 
cgit v1.2.3


From c54e04b00fe027da30ada5af76b6749772dd644a Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 9 Jan 2008 09:59:41 -0600
Subject: dlm: validate messages before processing

There was some hit and miss validation of messages that has now been
cleaned up and unified.  Before processing a message, the new
validate_message() function checks that the lkb is the appropriate type,
process-copy or master-copy, and that the message is from the correct
nodeid for the given lkb.  Other checks and assertions on the
lkb type and nodeid have been removed.  The assertions were particularly
bad since they would panic the machine instead of just ignoring the bad
message.

Although other recent patches have made processing old messages unlikely,
it still may be possible for an old message to be processed and caught
by these checks.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 139 +++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 104 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c3b9fca17044..c2890efb0259 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3008,8 +3008,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
 	lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
 
-	DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
-
 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
 		/* lkb was just created so there won't be an lvb yet */
 		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
@@ -3023,16 +3021,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 				struct dlm_message *ms)
 {
-	if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
-		log_error(ls, "convert_args nodeid %d %d lkid %x %x",
-			  lkb->lkb_nodeid, ms->m_header.h_nodeid,
-			  lkb->lkb_id, lkb->lkb_remid);
-		return -EINVAL;
-	}
-
-	if (!is_master_copy(lkb))
-		return -EINVAL;
-
 	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
 		return -EBUSY;
 
@@ -3048,8 +3036,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 			       struct dlm_message *ms)
 {
-	if (!is_master_copy(lkb))
-		return -EINVAL;
 	if (receive_lvb(ls, lkb, ms))
 		return -ENOMEM;
 	return 0;
@@ -3065,6 +3051,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
 	lkb->lkb_remid = ms->m_lkid;
 }
 
+/* This is called after the rsb is locked so that we can safely inspect
+   fields in the lkb. */
+
+static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	int from = ms->m_header.h_nodeid;
+	int error = 0;
+
+	switch (ms->m_type) {
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_CANCEL:
+		if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
+			error = -EINVAL;
+		break;
+
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_UNLOCK_REPLY:
+	case DLM_MSG_CANCEL_REPLY:
+	case DLM_MSG_GRANT:
+	case DLM_MSG_BAST:
+		if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
+			error = -EINVAL;
+		break;
+
+	case DLM_MSG_REQUEST_REPLY:
+		if (!is_process_copy(lkb))
+			error = -EINVAL;
+		else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
+			error = -EINVAL;
+		break;
+
+	default:
+		error = -EINVAL;
+	}
+
+	if (error)
+		log_error(lkb->lkb_resource->res_ls,
+			  "ignore invalid message %d from %d %x %x %x %d",
+			  ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
+			  lkb->lkb_flags, lkb->lkb_nodeid);
+	return error;
+}
+
 static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 {
 	struct dlm_lkb *lkb;
@@ -3126,17 +3156,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	receive_flags(lkb, ms);
 	error = receive_convert_args(ls, lkb, ms);
 	if (error)
-		goto out;
+		goto out_reply;
 	reply = !down_conversion(lkb);
 
 	error = do_convert(r, lkb);
- out:
+ out_reply:
 	if (reply)
 		send_convert_reply(r, lkb, error);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
@@ -3162,15 +3196,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	receive_flags(lkb, ms);
 	error = receive_unlock_args(ls, lkb, ms);
 	if (error)
-		goto out;
+		goto out_reply;
 
 	error = do_unlock(r, lkb);
- out:
+ out_reply:
 	send_unlock_reply(r, lkb, error);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
@@ -3198,9 +3236,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	error = do_cancel(r, lkb);
 	send_cancel_reply(r, lkb, error);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
@@ -3219,22 +3261,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
-		log_error(ls, "receive_grant no lkb");
+		log_debug(ls, "receive_grant from %d no lkb %x",
+			  ms->m_header.h_nodeid, ms->m_remid);
 		return;
 	}
-	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
 	r = lkb->lkb_resource;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	receive_flags_reply(lkb, ms);
 	if (is_altmode(lkb))
 		munge_altmode(lkb, ms);
 	grant_lock_pc(r, lkb, ms);
 	queue_cast(r, lkb, 0);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
@@ -3248,18 +3294,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
-		log_error(ls, "receive_bast no lkb");
+		log_debug(ls, "receive_bast from %d no lkb %x",
+			  ms->m_header.h_nodeid, ms->m_remid);
 		return;
 	}
-	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
 	r = lkb->lkb_resource;
 
 	hold_rsb(r);
 	lock_rsb(r);
 
-	queue_bast(r, lkb, ms->m_bastmode);
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
 
+	queue_bast(r, lkb, ms->m_bastmode);
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
@@ -3325,15 +3375,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
-		log_error(ls, "receive_request_reply no lkb");
+		log_debug(ls, "receive_request_reply from %d no lkb %x",
+			  ms->m_header.h_nodeid, ms->m_remid);
 		return;
 	}
-	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
 	r = lkb->lkb_resource;
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	mstype = lkb->lkb_wait_type;
 	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
 	if (error)
@@ -3466,6 +3520,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	/* stub reply can happen with waiters_mutex held */
 	error = remove_from_waiters_ms(lkb, ms);
 	if (error)
@@ -3484,10 +3542,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
-		log_error(ls, "receive_convert_reply no lkb");
+		log_debug(ls, "receive_convert_reply from %d no lkb %x",
+			  ms->m_header.h_nodeid, ms->m_remid);
 		return;
 	}
-	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
 	_receive_convert_reply(lkb, ms);
 	dlm_put_lkb(lkb);
@@ -3501,6 +3559,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	/* stub reply can happen with waiters_mutex held */
 	error = remove_from_waiters_ms(lkb, ms);
 	if (error)
@@ -3532,10 +3594,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
-		log_error(ls, "receive_unlock_reply no lkb");
+		log_debug(ls, "receive_unlock_reply from %d no lkb %x",
+			  ms->m_header.h_nodeid, ms->m_remid);
 		return;
 	}
-	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
 	_receive_unlock_reply(lkb, ms);
 	dlm_put_lkb(lkb);
@@ -3549,6 +3611,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
 	hold_rsb(r);
 	lock_rsb(r);
 
+	error = validate_message(lkb, ms);
+	if (error)
+		goto out;
+
 	/* stub reply can happen with waiters_mutex held */
 	error = remove_from_waiters_ms(lkb, ms);
 	if (error)
@@ -3580,10 +3646,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = find_lkb(ls, ms->m_remid, &lkb);
 	if (error) {
-		log_error(ls, "receive_cancel_reply no lkb");
+		log_debug(ls, "receive_cancel_reply from %d no lkb %x",
+			  ms->m_header.h_nodeid, ms->m_remid);
 		return;
 	}
-	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
 	_receive_cancel_reply(lkb, ms);
 	dlm_put_lkb(lkb);
@@ -3816,6 +3882,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
 		ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
 		ls->ls_stub_ms.m_result = -EINPROGRESS;
 		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+		ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
 		_receive_convert_reply(lkb, &ls->ls_stub_ms);
 
 		/* Same special case as in receive_rcom_lock_args() */
@@ -3917,6 +3984,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 			ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
 			ls->ls_stub_ms.m_result = stub_unlock_result;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+			ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
@@ -3926,6 +3994,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
 			ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
 			ls->ls_stub_ms.m_result = stub_cancel_result;
 			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+			ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
 			break;
-- 
cgit v1.2.3


From 42dc1601a9a31e8da767a4a9c37bad844b3698ab Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 9 Jan 2008 10:30:45 -0600
Subject: dlm: reject normal unlock when lock is waiting for lookup

Non-forced unlocks should be rejected if the lock is waiting on the
rsb_lookup list for another lock to establish the master node.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c2890efb0259..fa68e9b93651 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2110,17 +2110,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 	/* an lkb may be waiting for an rsb lookup to complete where the
 	   lookup was initiated by another lock */
 
-	if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
-		if (!list_empty(&lkb->lkb_rsb_lookup)) {
+	if (!list_empty(&lkb->lkb_rsb_lookup)) {
+		if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
 			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
 			list_del_init(&lkb->lkb_rsb_lookup);
 			queue_cast(lkb->lkb_resource, lkb,
 				   args->flags & DLM_LKF_CANCEL ?
 				   -DLM_ECANCEL : -DLM_EUNLOCK);
 			unhold_lkb(lkb); /* undoes create_lkb() */
-			rv = -EBUSY;
-			goto out;
 		}
+		/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
+		rv = -EBUSY;
+		goto out;
 	}
 
 	/* cancel not allowed with another cancel/unlock in progress */
-- 
cgit v1.2.3


From 755b5eb8bac90b35dc901465a06081aaad94e9ae Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 9 Jan 2008 10:37:39 -0600
Subject: dlm: limit dir lookup loop

In a rare case we may need to repeat a local resource directory lookup
due to a race with removing the rsb and removing the resdir record.
We'll never need to do more than a single additional lookup, though,
so the infinite loop around the lookup can be removed.  In addition
to being unnecessary, the infinite loop is dangerous since some other
unknown condition may appear causing the loop to never break.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index fa68e9b93651..bc2e4ba4c1be 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1851,7 +1851,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
 static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	struct dlm_ls *ls = r->res_ls;
-	int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+	int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
 
 	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
 		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1885,7 +1885,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		return 1;
 	}
 
-	for (;;) {
+	for (i = 0; i < 2; i++) {
 		/* It's possible for dlm_scand to remove an old rsb for
 		   this same resource from the toss list, us to create
 		   a new one, look up the master locally, and find it
@@ -1899,6 +1899,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
 		schedule();
 	}
+	if (error && error != -EEXIST)
+		return error;
 
 	if (ret_nodeid == our_nodeid) {
 		r->res_first_lkid = 0;
-- 
cgit v1.2.3


From ce5246b972f7514af899a63c0faf831d05ed5ee1 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 14 Jan 2008 15:48:58 -0600
Subject: dlm: fix possible use-after-free

The dlm_put_lkb() can free the lkb and its associated ua structure,
so we can't depend on using the ua struct after the put.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 4f741546f4bb..eb6164816948 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -236,12 +236,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 	spin_unlock(&proc->asts_spin);
 
 	if (eol) {
-		spin_lock(&ua->proc->locks_spin);
+		spin_lock(&proc->locks_spin);
 		if (!list_empty(&lkb->lkb_ownqueue)) {
 			list_del_init(&lkb->lkb_ownqueue);
 			dlm_put_lkb(lkb);
 		}
-		spin_unlock(&ua->proc->locks_spin);
+		spin_unlock(&proc->locks_spin);
 	}
  out:
 	mutex_unlock(&ls->ls_clear_proc_locks);
-- 
cgit v1.2.3


From 594199ebaae5d77f025974dfcfa6651cc81325a8 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 16 Jan 2008 11:03:41 -0600
Subject: dlm: change error message to debug

The invalid lockspace messages are normal and can appear relatively
often.  They should be suppressed without debugging enabled.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index bc2e4ba4c1be..7ee7c7c55453 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3857,8 +3857,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
 
 	ls = dlm_find_lockspace_global(hd->h_lockspace);
 	if (!ls) {
-		log_print("invalid h_lockspace %x from %d cmd %d type %d",
-			  hd->h_lockspace, nodeid, hd->h_cmd, type);
+		if (dlm_config.ci_log_debug)
+			log_print("invalid lockspace %x from %d cmd %d type %d",
+				  hd->h_lockspace, nodeid, hd->h_cmd, type);
 
 		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
 			dlm_send_ls_not_ready(nodeid, rc);
-- 
cgit v1.2.3


From 85f0379aa0f9366bb6918e2e898a915231176fbd Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 16 Jan 2008 13:02:31 -0600
Subject: dlm: keep cached master rsbs during recovery

To prevent the master of an rsb from changing rapidly, an unused rsb is kept
on the "toss list" for a period of time to be reused.  The toss list was
being cleared completely for each recovery, which is unnecessary.  Much of
the benefit of the toss list can be maintained if nodes keep rsb's in their
toss list that they are the master of.  These rsb's need to be included
when the resource directory is rebuilt during recovery.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dir.c      | 66 +++++++++++++++++++++++++++----------------------------
 fs/dlm/lock.c     |  6 -----
 fs/dlm/lock.h     |  2 --
 fs/dlm/recover.c  | 25 +++++++++++++++++++--
 fs/dlm/recoverd.c | 11 +++++-----
 5 files changed, 61 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 600bb1d1a9b6..ff97ba924333 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
 	return get_entry(ls, nodeid, name, namelen, r_nodeid);
 }
 
-/* Copy the names of master rsb's into the buffer provided.
-   Only select names whose dir node is the given nodeid. */
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
+{
+	struct dlm_rsb *r;
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (len == r->res_length && !memcmp(name, r->res_name, len)) {
+			up_read(&ls->ls_root_sem);
+			return r;
+		}
+	}
+	up_read(&ls->ls_root_sem);
+	return NULL;
+}
+
+/* Find the rsb where we left off (or start again), then send rsb names
+   for rsb's we're master of and whose directory node matches the requesting
+   node.  inbuf is the rsb name last sent, inlen is the name's length */
 
 void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
  			   char *outbuf, int outlen, int nodeid)
 {
 	struct list_head *list;
-	struct dlm_rsb *start_r = NULL, *r = NULL;
-	int offset = 0, start_namelen, error, dir_nodeid;
-	char *start_name;
+	struct dlm_rsb *r;
+	int offset = 0, dir_nodeid;
 	uint16_t be_namelen;
 
-	/*
-	 * Find the rsb where we left off (or start again)
-	 */
-
-	start_namelen = inlen;
-	start_name = inbuf;
-
-	if (start_namelen > 1) {
-		/*
-		 * We could also use a find_rsb_root() function here that
-		 * searched the ls_root_list.
-		 */
-		error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
-				     &start_r);
-		DLM_ASSERT(!error && start_r,
-			   printk("error %d\n", error););
-		DLM_ASSERT(!list_empty(&start_r->res_root_list),
-			   dlm_print_rsb(start_r););
-		dlm_put_rsb(start_r);
-	}
-
-	/*
-	 * Send rsb names for rsb's we're master of and whose directory node
-	 * matches the requesting node.
-	 */
-
 	down_read(&ls->ls_root_sem);
-	if (start_r)
-		list = start_r->res_root_list.next;
-	else
+
+	if (inlen > 1) {
+		r = find_rsb_root(ls, inbuf, inlen);
+		if (!r) {
+			inbuf[inlen - 1] = '\0';
+			log_error(ls, "copy_master_names from %d start %d %s",
+				  nodeid, inlen, inbuf);
+			goto out;
+		}
+		list = r->res_root_list.next;
+	} else {
 		list = ls->ls_root_list.next;
+	}
 
 	for (offset = 0; list != &ls->ls_root_list; list = list->next) {
 		r = list_entry(list, struct dlm_rsb, res_root_list);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 7ee7c7c55453..ff4a198fa677 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -489,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
 	return error;
 }
 
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
-		 unsigned int flags, struct dlm_rsb **r_ret)
-{
-	return find_rsb(ls, name, namelen, flags, r_ret);
-}
-
 /* This is only called to add a reference when the code already holds
    a valid reference to the rsb, so there's no need for locking. */
 
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index ada04680a1e5..27b6ed302911 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb);
 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
 void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
 int dlm_modes_compat(int mode1, int mode2);
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
-	unsigned int flags, struct dlm_rsb **r_ret);
 void dlm_put_rsb(struct dlm_rsb *r);
 void dlm_hold_rsb(struct dlm_rsb *r);
 int dlm_put_lkb(struct dlm_lkb *lkb);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 2f9d9a30df97..df075dc300fa 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls)
 			list_add(&r->res_root_list, &ls->ls_root_list);
 			dlm_hold_rsb(r);
 		}
+
+		/* If we're using a directory, add tossed rsbs to the root
+		   list; they'll have entries created in the new directory,
+		   but no other recovery steps should do anything with them. */
+
+		if (dlm_no_directory(ls)) {
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+			continue;
+		}
+
+		list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
+			list_add(&r->res_root_list, &ls->ls_root_list);
+			dlm_hold_rsb(r);
+		}
 		read_unlock(&ls->ls_rsbtbl[i].lock);
 	}
  out:
@@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls)
 	up_write(&ls->ls_root_sem);
 }
 
+/* If not using a directory, clear the entire toss list, there's no benefit to
+   caching the master value since it's fixed.  If we are using a dir, keep the
+   rsb's we're the master of.  Recovery will add them to the root list and from
+   there they'll be entered in the rebuilt directory. */
+
 void dlm_clear_toss_list(struct dlm_ls *ls)
 {
 	struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
 		write_lock(&ls->ls_rsbtbl[i].lock);
 		list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
 					 res_hashchain) {
-			list_del(&r->res_hashchain);
-			dlm_free_rsb(r);
+			if (dlm_no_directory(ls) || !is_master(r)) {
+				list_del(&r->res_hashchain);
+				dlm_free_rsb(r);
+			}
 		}
 		write_unlock(&ls->ls_rsbtbl[i].lock);
 	}
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 4b89e20eebe7..997f9531d594 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 	dlm_astd_resume();
 
 	/*
-	 * This list of root rsb's will be the basis of most of the recovery
-	 * routines.
+	 * Free non-master tossed rsb's.  Master rsb's are kept on toss
+	 * list and put on root list to be included in resdir recovery.
 	 */
 
-	dlm_create_root_list(ls);
+	dlm_clear_toss_list(ls);
 
 	/*
-	 * Free all the tossed rsb's so we don't have to recover them.
+	 * This list of root rsb's will be the basis of most of the recovery
+	 * routines.
 	 */
 
-	dlm_clear_toss_list(ls);
+	dlm_create_root_list(ls);
 
 	/*
 	 * Add or remove nodes from the lockspace's ls_nodes list.
-- 
cgit v1.2.3


From 2a79289e87f3b6487b5fd23c8569f32097057fb4 Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Thu, 17 Jan 2008 10:25:28 +0000
Subject: dlm: Sanity check namelen before copying it

The 32/64 compatibility code in the DLM does not check the validity of
the lock name length passed into it, so it can easily overwrite memory
if the value is rubbish (as early versions of libdlm can cause with
unlock calls, because they don't zero the field).

This patch restricts the length of the name to the amount of data
actually passed into the call.

Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/user.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index eb6164816948..1acb4c5813cd 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -82,7 +82,8 @@ struct dlm_lock_result32 {
 };
 
 static void compat_input(struct dlm_write_request *kb,
-			 struct dlm_write_request32 *kb32)
+			 struct dlm_write_request32 *kb32,
+			 int max_namelen)
 {
 	kb->version[0] = kb32->version[0];
 	kb->version[1] = kb32->version[1];
@@ -112,7 +113,11 @@ static void compat_input(struct dlm_write_request *kb,
 		kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
 		kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
 		memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
-		memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+		if (kb->i.lock.namelen <= max_namelen)
+			memcpy(kb->i.lock.name, kb32->i.lock.name,
+			       kb->i.lock.namelen);
+		else
+			kb->i.lock.namelen = max_namelen;
 	}
 }
 
@@ -529,7 +534,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
 		if (proc)
 			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
-		compat_input(kbuf, k32buf);
+		compat_input(kbuf, k32buf,
+			     count - sizeof(struct dlm_write_request32));
 		kfree(k32buf);
 	}
 #endif
-- 
cgit v1.2.3


From dbcfc34733d1ae37e7a78c9e4e5325451223a5eb Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 29 Jan 2008 14:52:10 -0600
Subject: dlm: clean ups

A couple small clean-ups.  Remove unnecessary wrapper-functions in
rcom.c, and remove unnecessary casting and an unnecessary ASSERT in
util.c.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/rcom.c | 25 +++++--------------------
 fs/dlm/util.c | 16 +++++-----------
 2 files changed, 10 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index ae2fd97fa4ad..026824cd3acb 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	spin_unlock(&ls->ls_rcom_spin);
 }
 
-static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-	receive_sync_reply(ls, rc_in);
-}
-
 int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 {
 	struct dlm_rcom *rc;
@@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	send_rcom(ls, mh, rc);
 }
 
-static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-	receive_sync_reply(ls, rc_in);
-}
-
 int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
 {
 	struct dlm_rcom *rc;
@@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	send_rcom(ls, mh, rc);
 }
 
-static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-	dlm_recover_process_copy(ls, rc_in);
-}
-
 /* If the lockspace doesn't exist then still send a status message
    back; it's possible that it just doesn't have its global_id yet. */
 
@@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 		break;
 
 	case DLM_RCOM_STATUS_REPLY:
-		receive_rcom_status_reply(ls, rc);
+		receive_sync_reply(ls, rc);
 		break;
 
 	case DLM_RCOM_NAMES_REPLY:
-		receive_rcom_names_reply(ls, rc);
+		receive_sync_reply(ls, rc);
 		break;
 
 	case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 		break;
 
 	case DLM_RCOM_LOCK_REPLY:
-		receive_rcom_lock_reply(ls, rc);
+		dlm_recover_process_copy(ls, rc);
 		break;
 
 	default:
-		DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+		log_error(ls, "receive_rcom bad type %d", rc->rc_type);
 	}
  out:
 	return;
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 11c6a456309f..4d9c1f4e1bd1 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -85,9 +85,7 @@ static int from_dlm_errno(int err)
 
 void dlm_message_out(struct dlm_message *ms)
 {
-	struct dlm_header *hd = (struct dlm_header *) ms;
-
-	header_out(hd);
+	header_out(&ms->m_header);
 
 	ms->m_type		= cpu_to_le32(ms->m_type);
 	ms->m_nodeid		= cpu_to_le32(ms->m_nodeid);
@@ -111,9 +109,7 @@ void dlm_message_out(struct dlm_message *ms)
 
 void dlm_message_in(struct dlm_message *ms)
 {
-	struct dlm_header *hd = (struct dlm_header *) ms;
-
-	header_in(hd);
+	header_in(&ms->m_header);
 
 	ms->m_type		= le32_to_cpu(ms->m_type);
 	ms->m_nodeid		= le32_to_cpu(ms->m_nodeid);
@@ -179,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf)
 
 void dlm_rcom_out(struct dlm_rcom *rc)
 {
-	struct dlm_header *hd = (struct dlm_header *) rc;
 	int type = rc->rc_type;
 
-	header_out(hd);
+	header_out(&rc->rc_header);
 
 	rc->rc_type		= cpu_to_le32(rc->rc_type);
 	rc->rc_result		= cpu_to_le32(rc->rc_result);
@@ -199,10 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 
 void dlm_rcom_in(struct dlm_rcom *rc)
 {
-	struct dlm_header *hd = (struct dlm_header *) rc;
 	int type;
 
-	header_in(hd);
+	header_in(&rc->rc_header);
 
 	rc->rc_type		= le32_to_cpu(rc->rc_type);
 	rc->rc_result		= le32_to_cpu(rc->rc_result);
-- 
cgit v1.2.3


From 0fe410d3f3b1496190f37ef74cd089229cef97fa Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Tue, 29 Jan 2008 13:50:16 +0800
Subject: dlm: static initialization improvements

also change name_prefix from char pointer to char array.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/user.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 1acb4c5813cd..7cbc6826239b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,8 +24,7 @@
 #include "lvb_table.h"
 #include "user.h"
 
-static const char *name_prefix="dlm";
-static struct miscdevice ctl_device;
+static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
 
 #ifdef CONFIG_COMPAT
@@ -902,14 +901,16 @@ static const struct file_operations ctl_device_fops = {
 	.owner   = THIS_MODULE,
 };
 
+static struct miscdevice ctl_device = {
+	.name  = "dlm-control",
+	.fops  = &ctl_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
 int dlm_user_init(void)
 {
 	int error;
 
-	ctl_device.name = "dlm-control";
-	ctl_device.fops = &ctl_device_fops;
-	ctl_device.minor = MISC_DYNAMIC_MINOR;
-
 	error = misc_register(&ctl_device);
 	if (error)
 		log_print("misc_register failed for control device");
-- 
cgit v1.2.3


From 8084870854fe181996c4aa4f44cb2fabcebf164c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 30 Jan 2008 12:24:48 +0100
Subject: splice: always update atime in direct splice

Andre Majorel <aym-xunil@teaser.fr> points out that if we only update
the atime when we transfer some data, we deviate from the standard
of always updating the atime. So change splice to always call
file_accessed() even if splice_direct_to_actor() didn't transfer
any data.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 1577a7391d23..4ee49e86edde 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1033,9 +1033,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 
 done:
 	pipe->nrbufs = pipe->curbuf = 0;
-	if (bytes > 0)
-		file_accessed(in);
-
+	file_accessed(in);
 	return bytes;
 
 out_release:
-- 
cgit v1.2.3


From 0c11b9428f619ab377c92eff2f160a834a6585dd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Jan 2008 04:20:52 -0500
Subject: [PATCH] switch audit_get_loginuid() to task_struct *

all callers pass something->audit_context

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9fa9708cc715..33537487f5ab 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -984,7 +984,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 	if (!task)
 		return -ESRCH;
 	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
-				audit_get_loginuid(task->audit_context));
+				audit_get_loginuid(task));
 	put_task_struct(task);
 	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
-- 
cgit v1.2.3


From 46f8a64bae11f5c9b15b4401f6e9863281999b66 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 22 Nov 2007 13:54:18 -0500
Subject: nfsd4: probe callback channel only once

Our callback code doesn't actually handle concurrent attempts to probe
the callback channel.  Some rethinking of the locking may be required.
However, we can also just move the callback probing to this case.  Since
this is the only time a client is "confirmed" (and since that can only
happen once in the lifetime of a client), this ensures we only probe
once.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 3 +--
 fs/nfsd/nfs4state.c    | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 9d536a8cb379..a9735a672963 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -395,8 +395,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	};
 	struct task_struct *t;
 
-	if (atomic_read(&cb->cb_set))
-		return;
+	BUG_ON(atomic_read(&clp->cl_callback.cb_set));
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 31673cd251c3..9d81c7117ae6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -948,6 +948,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
+			nfsd4_probe_callback(conf);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -965,8 +966,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		status = nfserr_clid_inuse;
 	}
 out:
-	if (!status)
-		nfsd4_probe_callback(conf);
 	nfs4_unlock_state();
 	return status;
 }
-- 
cgit v1.2.3


From 63c86716ea34ad94d52e5b0abbda152574dc42b5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 25 Oct 2007 19:00:26 -0400
Subject: nfsd: move callback rpc_client creation into separate thread

The whole reason to move this callback-channel probe into a separate
thread was because (for now) we don't have an easy way to create the
rpc_client asynchronously.  But I forgot to move the rpc_create() to the
spawned thread.  Doh!  Fix that.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 78 +++++++++++++++++++++++++-------------------------
 1 file changed, 39 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a9735a672963..6eb5cd2381ab 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -350,30 +350,6 @@ static struct rpc_version *	nfs_cb_version[] = {
 static int do_probe_callback(void *data)
 {
 	struct nfs4_client *clp = data;
-	struct nfs4_callback *cb = &clp->cl_callback;
-	struct rpc_message msg = {
-		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp       = clp,
-	};
-	int status;
-
-	status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT);
-
-	if (status) {
-		rpc_shutdown_client(cb->cb_client);
-		cb->cb_client = NULL;
-	} else
-		atomic_set(&cb->cb_set, 1);
-	put_nfs4_client(clp);
-	return 0;
-}
-
-/*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
- */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
-{
 	struct sockaddr_in	addr;
 	struct nfs4_callback    *cb = &clp->cl_callback;
 	struct rpc_timeout	timeparms = {
@@ -390,12 +366,15 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 		.timeout	= &timeparms,
 		.program	= program,
 		.version	= nfs_cb_version[1]->number,
-		.authflavor	= RPC_AUTH_UNIX,	/* XXX: need AUTH_GSS... */
+		.authflavor	= RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
 		.flags		= (RPC_CLNT_CREATE_NOPING),
 	};
-	struct task_struct *t;
-
-	BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+	struct rpc_message msg = {
+		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp       = clp,
+	};
+	struct rpc_clnt *client;
+	int status;
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -415,29 +394,50 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	program->stats->program = program;
 
 	/* Create RPC client */
-	cb->cb_client = rpc_create(&args);
-	if (IS_ERR(cb->cb_client)) {
+	client = rpc_create(&args);
+	if (IS_ERR(client)) {
 		dprintk("NFSD: couldn't create callback client\n");
+		status = PTR_ERR(client);
 		goto out_err;
 	}
 
+	status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
+
+	if (status)
+		goto out_release_client;
+
+	cb->cb_client = client;
+	atomic_set(&cb->cb_set, 1);
+	put_nfs4_client(clp);
+	return 0;
+out_release_client:
+	rpc_shutdown_client(client);
+out_err:
+	put_nfs4_client(clp);
+	dprintk("NFSD: warning: no callback path to client %.*s\n",
+		(int)clp->cl_name.len, clp->cl_name.data);
+	return status;
+}
+
+/*
+ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ */
+void
+nfsd4_probe_callback(struct nfs4_client *clp)
+{
+	struct task_struct *t;
+
+	BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+
 	/* the task holds a reference to the nfs4_client struct */
 	atomic_inc(&clp->cl_count);
 
 	t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe");
 
 	if (IS_ERR(t))
-		goto out_release_clp;
+		atomic_dec(&clp->cl_count);
 
 	return;
-
-out_release_clp:
-	atomic_dec(&clp->cl_count);
-	rpc_shutdown_client(cb->cb_client);
-out_err:
-	cb->cb_client = NULL;
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
 }
 
 /*
-- 
cgit v1.2.3


From aefa89d178e6dd83889b66d4e800d4d77363900b Mon Sep 17 00:00:00 2001
From: Prasad P <pvp@us.ibm.com>
Date: Wed, 24 Oct 2007 15:14:32 -0500
Subject: nfsd: Fix inconsistent assignment

Dereferenced pointer "dentry" without checking and assigned to inode
in the declaration.

(We could just delete the NULL checks that follow instead, as we never
get to the encode function in this particular case.  But it takes a
little detective work to verify that fact, so it's probably safer to
leave the checks in place.)

Cc: Steve French <smfltc@us.ibm.com>
Signed-off-by: Prasad V Potluri <pvp@us.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs2acl.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 0e5fa11e6b44..1c3b7654e966 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -221,12 +221,17 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	struct kvec *head = rqstp->rq_res.head;
 	unsigned int base;
 	int n;
 	int w;
 
+	/*
+	 * Since this is version 2, the check for nfserr in
+	 * nfsd_dispatch actually ensures the following cannot happen.
+	 * However, it seems fragile to depend on that.
+	 */
 	if (dentry == NULL || dentry->d_inode == NULL)
 		return 0;
 	inode = dentry->d_inode;
-- 
cgit v1.2.3


From d4395e03fec0895d01451904b8a2276ceda663c9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 26 Oct 2007 13:32:50 -0400
Subject: knfsd: fix broken length check in nfs4idmap.c

Obviously at some point we thought "error" represented the length when
positive.  This appears to be a long-standing typo.

Thanks to Prasad Potluri <pvp@us.ibm.com> for finding the problem and
proposing an earlier version of this patch.

Cc: Steve French <smfltc@us.ibm.com>
Cc: Prasad V Potluri <pvp@us.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4idmap.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4c0c683ce07a..5b56c77c15c5 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -255,13 +255,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 		goto out;
 	if (len == 0)
 		set_bit(CACHE_NEGATIVE, &ent.h.flags);
-	else {
-		if (error >= IDMAP_NAMESZ) {
-			error = -EINVAL;
-			goto out;
-		}
+	else if (len >= IDMAP_NAMESZ)
+		goto out;
+	else
 		memcpy(ent.name, buf1, sizeof(ent.name));
-	}
 	error = -ENOMEM;
 	res = idtoname_update(&ent, res);
 	if (res == NULL)
-- 
cgit v1.2.3


From 48df020aa17ac95a012ff765b0086ede5996b320 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Nov 2007 16:56:53 -0400
Subject: NLM: Fix sign of length of NLM variable length strings

According to The Open Group's NLM specification, NLM callers are variable
length strings.  XDR variable length strings use an unsigned 32 bit length.
And internally, negative string lengths are not meaningful for the Linux
NLM implementation.

Clean up: Make nlm_lock.len and nlm_reboot.len unsigned integers.  This
makes the sign of NLM string lengths consistent with the sign of xdr_netobj
lengths.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-By: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 572601e98dcd..ebec0098efbf 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -34,10 +34,10 @@ static DEFINE_MUTEX(nlm_host_mutex);
 
 static void			nlm_gc_hosts(void);
 static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
-					const char *, int, int);
+					const char *, unsigned int, int);
 static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
 					 const char *hostname,
-					 int hostname_len);
+					 unsigned int hostname_len);
 
 /*
  * Common host lookup routine for server & client
@@ -45,7 +45,8 @@ static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
 static struct nlm_host *
 nlm_lookup_host(int server, const struct sockaddr_in *sin,
 		int proto, int version, const char *hostname,
-		int hostname_len, const struct sockaddr_in *ssin)
+		unsigned int hostname_len,
+		const struct sockaddr_in *ssin)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
@@ -176,7 +177,7 @@ nlm_destroy_host(struct nlm_host *host)
  */
 struct nlm_host *
 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
-			const char *hostname, int hostname_len)
+			const char *hostname, unsigned int hostname_len)
 {
 	struct sockaddr_in ssin = {0};
 
@@ -189,7 +190,7 @@ nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
  */
 struct nlm_host *
 nlmsvc_lookup_host(struct svc_rqst *rqstp,
-			const char *hostname, int hostname_len)
+			const char *hostname, unsigned int hostname_len)
 {
 	struct sockaddr_in ssin = {0};
 
@@ -307,7 +308,8 @@ void nlm_release_host(struct nlm_host *host)
  * Release all resources held by that peer.
  */
 void nlm_host_rebooted(const struct sockaddr_in *sin,
-				const char *hostname, int hostname_len,
+				const char *hostname,
+				unsigned int hostname_len,
 				u32 new_state)
 {
 	struct hlist_head *chain;
@@ -449,7 +451,7 @@ static DEFINE_MUTEX(nsm_mutex);
 
 static struct nsm_handle *
 __nsm_find(const struct sockaddr_in *sin,
-		const char *hostname, int hostname_len,
+		const char *hostname, unsigned int hostname_len,
 		int create)
 {
 	struct nsm_handle *nsm = NULL;
@@ -503,7 +505,8 @@ out:
 }
 
 static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+nsm_find(const struct sockaddr_in *sin, const char *hostname,
+	 unsigned int hostname_len)
 {
 	return __nsm_find(sin, hostname, hostname_len, 1);
 }
-- 
cgit v1.2.3


From ee1a95b3b3fccf3c825bd95f89a8e006901b03ed Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Nov 2007 16:56:58 -0400
Subject: NFSD: Use unsigned length argument for decode_filename

Clean up: file name lengths are unsigned on the wire, negative lengths
are not meaningful natively either.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-By: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs3xdr.c | 4 ++--
 fs/nfsd/nfsxdr.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f917fd25858a..c02b8d69297d 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -88,10 +88,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * no slashes or null bytes.
  */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
 	char		*name;
-	int		i;
+	unsigned int	i;
 
 	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
 		for (i = 0, name = *namp; i < *lenp; i++, name++) {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b86e3658a0af..50bd6187edfc 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -62,10 +62,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
  * no slashes or null bytes.
  */
 static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 {
 	char		*name;
-	int		i;
+	unsigned int	i;
 
 	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
 		for (i = 0, name = *namp; i < *lenp; i++, name++) {
-- 
cgit v1.2.3


From 5a022fc8700cadbac373766cf1b5c746ffec7164 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Nov 2007 16:57:09 -0400
Subject: NFSD: Adjust filename length argument of nfsd_lookup

Clean up: adjust the sign of the length argument of nfsd_lookup and
nfsd_lookup_dentry, for consistency with recent changes.  NFSD version
4 callers already pass an unsigned file name length.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-By: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d0199189924c..755ba43c13e1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -132,7 +132,7 @@ out:
 
 __be32
 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
-		   const char *name, int len,
+		   const char *name, unsigned int len,
 		   struct svc_export **exp_ret, struct dentry **dentry_ret)
 {
 	struct svc_export	*exp;
@@ -226,7 +226,7 @@ out_nfserr:
  */
 __be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
-					int len, struct svc_fh *resfh)
+				unsigned int len, struct svc_fh *resfh)
 {
 	struct svc_export	*exp;
 	struct dentry		*dentry;
-- 
cgit v1.2.3


From 9c7544d3a195cde33b3d1e46639b23c221f901db Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Nov 2007 16:57:14 -0400
Subject: NFSD: Use unsigned length argument for decode_pathname

Clean up: path name lengths are unsigned on the wire, negative lengths
are not meaningful natively either.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-By: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsxdr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 50bd6187edfc..7003c313272f 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -78,10 +78,10 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 }
 
 static __be32 *
-decode_pathname(__be32 *p, char **namp, int *lenp)
+decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
 {
 	char		*name;
-	int		i;
+	unsigned int	i;
 
 	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
 		for (i = 0, name = *namp; i < *lenp; i++, name++) {
-- 
cgit v1.2.3


From a628f6675861d979405f751418e924c4ec7d457d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Nov 2007 16:57:20 -0400
Subject: NFSD: Fix mixed sign comparison in nfs3svc_decode_symlinkargs

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Acked-By: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs3xdr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index c02b8d69297d..be515c5a8154 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -452,8 +452,7 @@ int
 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_symlinkargs *args)
 {
-	unsigned int len;
-	int avail;
+	unsigned int len, avail;
 	char *old, *new;
 	struct kvec *vec;
 
@@ -486,7 +485,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 	/* now copy next page if there is one */
 	if (len && !avail && rqstp->rq_arg.page_len) {
 		avail = rqstp->rq_arg.page_len;
-		if (avail > PAGE_SIZE) avail = PAGE_SIZE;
+		if (avail > PAGE_SIZE)
+			avail = PAGE_SIZE;
 		old = page_address(rqstp->rq_arg.pages[0]);
 	}
 	while (len && avail && *old) {
-- 
cgit v1.2.3


From ca2a05aa7c72309ee65164c78fa2be7a5038215e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Sun, 11 Nov 2007 15:43:12 -0500
Subject: nfsd: Fix handling of negative lengths in read_buf()

The length "nbytes" passed into read_buf should never be negative, but
we check only for too-large values of "nbytes", not for too-small
values.  Make nbytes unsigned, so it's clear that the former tests are
sufficient.  (Despite this, read_buf() currently correctly returns an xdr
error in the case of a negative length, thanks to an unsigned
comparison with sizeof() and bounds-checking in kmalloc().  This seems
very fragile, though.)

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4xdr.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 57333944af7f..bf1e792a65a0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -148,12 +148,12 @@ xdr_error:					\
 	}					\
 } while (0)
 
-static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 {
 	/* We want more bytes than seem to be available.
 	 * Maybe we need a new page, maybe we have just run out
 	 */
-	int avail = (char*)argp->end - (char*)argp->p;
+	unsigned int avail = (char *)argp->end - (char *)argp->p;
 	__be32 *p;
 	if (avail + argp->pagelen < nbytes)
 		return NULL;
@@ -169,6 +169,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
 			return NULL;
 		
 	}
+	/*
+	 * The following memcpy is safe because read_buf is always
+	 * called with nbytes > avail, and the two cases above both
+	 * guarantee p points to at least nbytes bytes.
+	 */
 	memcpy(p, argp->p, avail);
 	/* step to next page */
 	argp->p = page_address(argp->pagelist[0]);
-- 
cgit v1.2.3


From 46b25895767c606c630a97b03a895934a7a36a70 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 9 Nov 2007 12:31:55 -0500
Subject: knfsd: cleanup nfsd4 properly on module init failure

We forgot to shut down the nfs4 state and idmapping code in this case.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77dc9893b7ba..d8d50a773a5b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -695,12 +695,14 @@ static int __init init_nfsd(void)
 	}
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval) {
+		nfsd_idmap_shutdown();
 		nfsd_export_shutdown();
 		nfsd_cache_shutdown();
 		remove_proc_entry("fs/nfs/exports", NULL);
 		remove_proc_entry("fs/nfs", NULL);
 		nfsd_stat_shutdown();
 		nfsd_lockd_shutdown();
+		nfsd4_free_slabs();
 	}
 	return retval;
 }
-- 
cgit v1.2.3


From 26808d3f10b1213bbb6e27d441be40e20ab84611 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 9 Nov 2007 13:44:06 -0500
Subject: nfsd: cleanup nfsd module initialization cleanup

Handle the failure case here with something closer to the standard
kernel style.

Doesn't really matter for now, but I'd like to add a few more failure
cases, and then this'll help.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index d8d50a773a5b..ecf377944286 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -694,16 +694,18 @@ static int __init init_nfsd(void)
 			entry->proc_fops =  &exports_operations;
 	}
 	retval = register_filesystem(&nfsd_fs_type);
-	if (retval) {
-		nfsd_idmap_shutdown();
-		nfsd_export_shutdown();
-		nfsd_cache_shutdown();
-		remove_proc_entry("fs/nfs/exports", NULL);
-		remove_proc_entry("fs/nfs", NULL);
-		nfsd_stat_shutdown();
-		nfsd_lockd_shutdown();
-		nfsd4_free_slabs();
-	}
+	if (retval)
+		goto out_free_all;
+	return 0;
+out_free_all:
+	nfsd_idmap_shutdown();
+	nfsd_export_shutdown();
+	nfsd_cache_shutdown();
+	remove_proc_entry("fs/nfs/exports", NULL);
+	remove_proc_entry("fs/nfs", NULL);
+	nfsd_stat_shutdown();
+	nfsd_lockd_shutdown();
+	nfsd4_free_slabs();
 	return retval;
 }
 
-- 
cgit v1.2.3


From d5c3428b2cb26d605fddc4878f4fcc03c23df89f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 9 Nov 2007 14:10:56 -0500
Subject: nfsd: fail module init on reply cache init failure

If the reply cache initialization fails due to a kmalloc failure,
currently we try to soldier on with a reduced (or nonexistent) reply
cache.

Better to just fail immediately: the failure is then much easier to
understand and debug, and it could save us complexity in some later
code.  (But actually, it doesn't help currently because the cache is
also turned off in some odd failure cases; we should probably find a
better way to handle those failure cases some day.)

Fix some minor style problems while we're at it, and rename
nfsd_cache_init() to remove the need for a comment describing it.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfscache.c | 28 ++++++++++++----------------
 fs/nfsd/nfsctl.c   | 11 +++++++----
 2 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 578f2c9d56be..5bfc2ac60d54 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -44,17 +44,17 @@ static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
  */
 static DEFINE_SPINLOCK(cache_lock);
 
-void
-nfsd_cache_init(void)
+int nfsd_reply_cache_init(void)
 {
 	struct svc_cacherep	*rp;
 	int			i;
 
 	INIT_LIST_HEAD(&lru_head);
 	i = CACHESIZE;
-	while(i) {
+	while (i) {
 		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
-		if (!rp) break;
+		if (!rp)
+			goto out_nomem;
 		list_add(&rp->c_lru, &lru_head);
 		rp->c_state = RC_UNUSED;
 		rp->c_type = RC_NOCACHE;
@@ -62,23 +62,19 @@ nfsd_cache_init(void)
 		i--;
 	}
 
-	if (i)
-		printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n",
-			CACHESIZE, CACHESIZE-i);
-
 	hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
-	if (!hash_list) {
-		nfsd_cache_shutdown();
-		printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n",
-			HASHSIZE * sizeof(struct hlist_head));
-		return;
-	}
+	if (!hash_list)
+		goto out_nomem;
 
 	cache_disabled = 0;
+	return 0;
+out_nomem:
+	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+	nfsd_reply_cache_shutdown();
+	return -ENOMEM;
 }
 
-void
-nfsd_cache_shutdown(void)
+void nfsd_reply_cache_shutdown(void)
 {
 	struct svc_cacherep	*rp;
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ecf377944286..2bfda9b8f504 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -683,7 +683,9 @@ static int __init init_nfsd(void)
 	if (retval)
 		return retval;
 	nfsd_stat_init();	/* Statistics */
-	nfsd_cache_init();	/* RPC reply cache */
+	retval = nfsd_reply_cache_init();
+	if (retval)
+		goto out_free_stat;
 	nfsd_export_init();	/* Exports table */
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
 	nfsd_idmap_init();      /* Name to ID mapping */
@@ -700,11 +702,12 @@ static int __init init_nfsd(void)
 out_free_all:
 	nfsd_idmap_shutdown();
 	nfsd_export_shutdown();
-	nfsd_cache_shutdown();
+	nfsd_reply_cache_shutdown();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
-	nfsd_stat_shutdown();
 	nfsd_lockd_shutdown();
+out_free_stat:
+	nfsd_stat_shutdown();
 	nfsd4_free_slabs();
 	return retval;
 }
@@ -712,7 +715,7 @@ out_free_all:
 static void __exit exit_nfsd(void)
 {
 	nfsd_export_shutdown();
-	nfsd_cache_shutdown();
+	nfsd_reply_cache_shutdown();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
 	nfsd_stat_shutdown();
-- 
cgit v1.2.3


From df95a9d4fb91d819d3fb55dd437056df59e7f15e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 8 Nov 2007 16:09:59 -0500
Subject: knfsd: cache unregistration needn't return error

There's really nothing much the caller can do if cache unregistration
fails.  And indeed, all any caller does in this case is print an error
and continue.  So just return void and move the printk's inside
cache_unregister.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/export.c    | 6 ++----
 fs/nfsd/nfs4idmap.c | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 66d0aeb32a47..d29b70a28f2b 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1670,10 +1670,8 @@ nfsd_export_shutdown(void)
 
 	exp_writelock();
 
-	if (cache_unregister(&svc_expkey_cache))
-		printk(KERN_ERR "nfsd: failed to unregister expkey cache\n");
-	if (cache_unregister(&svc_export_cache))
-		printk(KERN_ERR "nfsd: failed to unregister export cache\n");
+	cache_unregister(&svc_expkey_cache);
+	cache_unregister(&svc_export_cache);
 	svcauth_unix_purge();
 
 	exp_writeunlock();
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b56c77c15c5..ef22179c49ad 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -474,10 +474,8 @@ nfsd_idmap_init(void)
 void
 nfsd_idmap_shutdown(void)
 {
-	if (cache_unregister(&idtoname_cache))
-		printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
-	if (cache_unregister(&nametoid_cache))
-		printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
+	cache_unregister(&idtoname_cache);
+	cache_unregister(&nametoid_cache);
 }
 
 /*
-- 
cgit v1.2.3


From 440bcc592052e42c7050a51489c65e18df4a0636 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 12 Nov 2007 17:09:49 -0500
Subject: nfsd: select CONFIG_PROC_FS in nfsv4 and gss server cases

The server depends on upcalls under /proc to support nfsv4 and gss.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 219ec06a8c7e..987b5d7cb21a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1674,6 +1674,8 @@ config NFSD
 	select CRYPTO_MD5 if NFSD_V4
 	select CRYPTO if NFSD_V4
 	select FS_POSIX_ACL if NFSD_V4
+	select PROC_FS if NFSD_V4
+	select PROC_FS if SUNRPC_GSS
 	help
 	  If you want your Linux box to act as an NFS *server*, so that other
 	  computers on your local network which support NFS can access certain
-- 
cgit v1.2.3


From e331f606a85a2a9e84e9c63c94d43c0517136139 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 12 Nov 2007 17:32:21 -0500
Subject: nfsd: fail init on /proc/fs/nfs/exports creation failure

I assume the reason failure of creation was ignored here was just to
continue to support embedded systems that want nfsd but not proc.

However, in cases where proc is supported it would be clearer to fail
entirely than to come up with some features disabled.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2bfda9b8f504..2b95597aa4a5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -674,6 +674,27 @@ static struct file_system_type nfsd_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
+#ifdef CONFIG_PROC_FS
+static int create_proc_exports_entry(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_mkdir("fs/nfs", NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry = create_proc_entry("fs/nfs/exports", 0, NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry->proc_fops =  &exports_operations;
+	return 0;
+}
+#else /* CONFIG_PROC_FS */
+static int create_proc_exports_entry(void)
+{
+	return 0;
+}
+#endif
+
 static int __init init_nfsd(void)
 {
 	int retval;
@@ -689,23 +710,21 @@ static int __init init_nfsd(void)
 	nfsd_export_init();	/* Exports table */
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
 	nfsd_idmap_init();      /* Name to ID mapping */
-	if (proc_mkdir("fs/nfs", NULL)) {
-		struct proc_dir_entry *entry;
-		entry = create_proc_entry("fs/nfs/exports", 0, NULL);
-		if (entry)
-			entry->proc_fops =  &exports_operations;
-	}
+	retval = create_proc_exports_entry();
+	if (retval)
+		goto out_free_idmap;
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval)
 		goto out_free_all;
 	return 0;
 out_free_all:
-	nfsd_idmap_shutdown();
-	nfsd_export_shutdown();
-	nfsd_reply_cache_shutdown();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
+	nfsd_idmap_shutdown();
+out_free_idmap:
 	nfsd_lockd_shutdown();
+	nfsd_export_shutdown();
+	nfsd_reply_cache_shutdown();
 out_free_stat:
 	nfsd_stat_shutdown();
 	nfsd4_free_slabs();
-- 
cgit v1.2.3


From dbf847ecb6318d3a22c6758fe39696d00f39063a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 8 Nov 2007 17:20:34 -0500
Subject: knfsd: allow cache_register to return error on failure

Newer server features such as nfsv4 and gss depend on proc to work, so a
failure to initialize the proc files they need should be treated as
fatal.

Thanks to Andrew Morton for style fix and compile fix in case where
CONFIG_NFSD_V4 is undefined.

Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/export.c    | 12 +++++++++---
 fs/nfsd/nfs4idmap.c | 13 ++++++++++---
 fs/nfsd/nfsctl.c    | 12 +++++++++---
 3 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index d29b70a28f2b..cbbc594ef592 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1637,13 +1637,19 @@ exp_verify_string(char *cp, int max)
 /*
  * Initialize the exports module.
  */
-void
+int
 nfsd_export_init(void)
 {
+	int rv;
 	dprintk("nfsd: initializing export module.\n");
 
-	cache_register(&svc_export_cache);
-	cache_register(&svc_expkey_cache);
+	rv = cache_register(&svc_export_cache);
+	if (rv)
+		return rv;
+	rv = cache_register(&svc_expkey_cache);
+	if (rv)
+		cache_unregister(&svc_export_cache);
+	return rv;
 
 }
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index ef22179c49ad..996bd88b75ba 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -464,11 +464,18 @@ nametoid_update(struct ent *new, struct ent *old)
  * Exported API
  */
 
-void
+int
 nfsd_idmap_init(void)
 {
-	cache_register(&idtoname_cache);
-	cache_register(&nametoid_cache);
+	int rv;
+
+	rv = cache_register(&idtoname_cache);
+	if (rv)
+		return rv;
+	rv = cache_register(&nametoid_cache);
+	if (rv)
+		cache_unregister(&idtoname_cache);
+	return rv;
 }
 
 void
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2b95597aa4a5..4aba92698581 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -707,9 +707,13 @@ static int __init init_nfsd(void)
 	retval = nfsd_reply_cache_init();
 	if (retval)
 		goto out_free_stat;
-	nfsd_export_init();	/* Exports table */
+	retval = nfsd_export_init();
+	if (retval)
+		goto out_free_cache;
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
-	nfsd_idmap_init();      /* Name to ID mapping */
+	retval = nfsd_idmap_init();
+	if (retval)
+		goto out_free_lockd;
 	retval = create_proc_exports_entry();
 	if (retval)
 		goto out_free_idmap;
@@ -720,10 +724,12 @@ static int __init init_nfsd(void)
 out_free_all:
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
-	nfsd_idmap_shutdown();
 out_free_idmap:
+	nfsd_idmap_shutdown();
+out_free_lockd:
 	nfsd_lockd_shutdown();
 	nfsd_export_shutdown();
+out_free_cache:
 	nfsd_reply_cache_shutdown();
 out_free_stat:
 	nfsd_stat_shutdown();
-- 
cgit v1.2.3


From 2e8138a274d81d87591db0803b1e81f4284ff935 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 15 Nov 2007 17:05:43 -0500
Subject: nfsd: move nfsd/auth.h into fs/nfsd

This header is used only in a few places in fs/nfsd, so there seems to
be little point to having it in include/.  (Thanks to Robert Day for
pointing this out.)

Cc: Robert P. J. Day <rpjday@crashcourse.ca>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.h    | 27 +++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c |  1 +
 fs/nfsd/nfsfh.c   |  1 +
 fs/nfsd/nfsxdr.c  |  1 +
 4 files changed, 30 insertions(+)
 create mode 100644 fs/nfsd/auth.h

(limited to 'fs')

diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
new file mode 100644
index 000000000000..0fb9f7212195
--- /dev/null
+++ b/fs/nfsd/auth.h
@@ -0,0 +1,27 @@
+/*
+ * include/linux/nfsd/auth.h
+ *
+ * nfsd-specific authentication stuff.
+ * uid/gid mapping not yet implemented.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_AUTH_H
+#define LINUX_NFSD_AUTH_H
+
+#ifdef __KERNEL__
+
+#define nfsd_luid(rq, uid)	((u32)(uid))
+#define nfsd_lgid(rq, gid)	((u32)(gid))
+#define nfsd_ruid(rq, uid)	((u32)(uid))
+#define nfsd_rgid(rq, gid)	((u32)(gid))
+
+/*
+ * Set the current process's fsuid/fsgid etc to those of the NFS
+ * client user
+ */
+int nfsd_setuser(struct svc_rqst *, struct svc_export *);
+
+#endif /* __KERNEL__ */
+#endif /* LINUX_NFSD_AUTH_H */
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index be515c5a8154..4b1ffe3be7e2 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -21,6 +21,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr3.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 468f17a78441..8fbd2dc08a92 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -22,6 +22,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/nfsd/nfsd.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_FH
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 7003c313272f..61ad61743d94 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -15,6 +15,7 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/xdr.h>
 #include <linux/mm.h>
+#include "auth.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
-- 
cgit v1.2.3


From 1f69f172c73a2bf0bf55da9346da8dccea9035cf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 15 Nov 2007 17:06:58 -0500
Subject: nfsd: minor fs/nfsd/auth.h cleanup

While we're here, let's remove the redundant (and now wrong) pathname in
the comment, and the #ifdef __KERNEL__'s.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
index 0fb9f7212195..78b3c0e93822 100644
--- a/fs/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -1,6 +1,4 @@
 /*
- * include/linux/nfsd/auth.h
- *
  * nfsd-specific authentication stuff.
  * uid/gid mapping not yet implemented.
  *
@@ -10,8 +8,6 @@
 #ifndef LINUX_NFSD_AUTH_H
 #define LINUX_NFSD_AUTH_H
 
-#ifdef __KERNEL__
-
 #define nfsd_luid(rq, uid)	((u32)(uid))
 #define nfsd_lgid(rq, gid)	((u32)(gid))
 #define nfsd_ruid(rq, uid)	((u32)(uid))
@@ -23,5 +19,4 @@
  */
 int nfsd_setuser(struct svc_rqst *, struct svc_export *);
 
-#endif /* __KERNEL__ */
 #endif /* LINUX_NFSD_AUTH_H */
-- 
cgit v1.2.3


From a186e767473bd329122f0229b91573b9b6fa43c1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 20 Nov 2007 16:11:27 -0500
Subject: nfsd4: kill some unneeded setclientid comments

Most of these comments just summarize the code.

The matching of code to the cases described in the RFC may still be
useful, though; add specific section references to make that easier to
follow.  Also update references to the outdated RFC 3010.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 136 ++++++++++++++++------------------------------------
 1 file changed, 40 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9d81c7117ae6..242fee7c1018 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -683,39 +683,6 @@ out_err:
 	return;
 }
 
-/*
- * RFC 3010 has a complex implmentation description of processing a 
- * SETCLIENTID request consisting of 5 bullets, labeled as 
- * CASE0 - CASE4 below.
- *
- * NOTES:
- * 	callback information will be processed in a future patch
- *
- *	an unconfirmed record is added when:
- *      NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record.
- *	CASE 1: confirmed record found with matching name, principal,
- *		verifier, and clientid.
- *	CASE 2: confirmed record found with matching name, principal,
- *		and there is no unconfirmed record with matching
- *		name and principal
- *
- *      an unconfirmed record is replaced when:
- *	CASE 3: confirmed record found with matching name, principal,
- *		and an unconfirmed record is found with matching 
- *		name, principal, and with clientid and
- *		confirm that does not match the confirmed record.
- *	CASE 4: there is no confirmed record with matching name and 
- *		principal. there is an unconfirmed record with 
- *		matching name, principal.
- *
- *	an unconfirmed record is deleted when:
- *	CASE 1: an unconfirmed record that matches input name, verifier,
- *		and confirmed clientid.
- *	CASE 4: any unconfirmed records with matching name and principal
- *		that exist after an unconfirmed record has been replaced
- *		as described above.
- *
- */
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -748,11 +715,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 	conf = find_confirmed_client_by_str(dname, strhashval);
 	if (conf) {
-		/* 
-		 * CASE 0:
-		 * clname match, confirmed, different principal
-		 * or different ip_address
-		 */
+		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
 				|| conf->cl_addr != sin->sin_addr.s_addr) {
@@ -761,12 +724,17 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		}
 	}
+	/*
+	 * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
+	 * has a description of SETCLIENTID request processing consisting
+	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
+	 */
 	unconf = find_unconfirmed_client_by_str(dname, strhashval);
 	status = nfserr_resource;
 	if (!conf) {
-		/* 
-		 * CASE 4:
-		 * placed first, because it is the normal case.
+		/*
+		 * RFC 3530 14.2.33 CASE 4:
+		 * placed first, because it is the normal case
 		 */
 		if (unconf)
 			expire_client(unconf);
@@ -776,17 +744,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		gen_clid(new);
 	} else if (same_verf(&conf->cl_verifier, &clverifier)) {
 		/*
-		 * CASE 1:
-		 * cl_name match, confirmed, principal match
-		 * verifier match: probable callback update
-		 *
-		 * remove any unconfirmed nfs4_client with 
-		 * matching cl_name, cl_verifier, and cl_clientid
-		 *
-		 * create and insert an unconfirmed nfs4_client with same 
-		 * cl_name, cl_verifier, and cl_clientid as existing 
-		 * nfs4_client,  but with the new callback info and a 
-		 * new cl_confirm
+		 * RFC 3530 14.2.33 CASE 1:
+		 * probable callback update
 		 */
 		if (unconf) {
 			/* Note this is removing unconfirmed {*x***},
@@ -802,32 +761,19 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		copy_clid(new, conf);
 	} else if (!unconf) {
 		/*
-		 * CASE 2:
-		 * clname match, confirmed, principal match
-		 * verfier does not match
-		 * no unconfirmed. create a new unconfirmed nfs4_client
-		 * using input clverifier, clname, and callback info
-		 * and generate a new cl_clientid and cl_confirm.
+		 * RFC 3530 14.2.33 CASE 2:
+		 * probable client reboot; state will be removed if
+		 * confirmed.
 		 */
 		new = create_client(clname, dname);
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
 	} else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
-		/*	
-		 * CASE3:
-		 * confirmed found (name, principal match)
-		 * confirmed verifier does not match input clverifier
-		 *
-		 * unconfirmed found (name match)
-		 * confirmed->cl_confirm != unconfirmed->cl_confirm
-		 *
-		 * remove unconfirmed.
-		 *
-		 * create an unconfirmed nfs4_client 
-		 * with same cl_name as existing confirmed nfs4_client, 
-		 * but with new callback info, new cl_clientid,
-		 * new cl_verifier and a new cl_confirm
+		/*
+		 * RFC 3530 14.2.33 CASE 3:
+		 * probable client reboot; state will be removed if
+		 * confirmed.
 		 */
 		expire_client(unconf);
 		new = create_client(clname, dname);
@@ -857,11 +803,9 @@ out:
 
 
 /*
- * RFC 3010 has a complex implmentation description of processing a 
- * SETCLIENTID_CONFIRM request consisting of 4 bullets describing
- * processing on a DRC miss, labeled as CASE1 - CASE4 below.
- *
- * NOTE: callback information will be processed here in a future patch
+ * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
+ * a description of SETCLIENTID_CONFIRM request processing consisting of 4
+ * bullets, labeled as CASE1 - CASE4 below.
  */
 __be32
 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
@@ -892,16 +836,20 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
 		goto out;
 
+	/*
+	 * section 14.2.34 of RFC 3530 has a description of
+	 * SETCLIENTID_CONFIRM request processing consisting
+	 * of 4 bullet points, labeled as CASE1 - CASE4 below.
+	 */
 	if ((conf && unconf) && 
 	    (same_verf(&unconf->cl_confirm, &confirm)) &&
 	    (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
 	    (same_name(conf->cl_recdir,unconf->cl_recdir))  &&
 	    (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
-		/* CASE 1:
-		* unconf record that matches input clientid and input confirm.
-		* conf record that matches input clientid.
-		* conf and unconf records match names, verifiers
-		*/
+		/*
+		 * RFC 3530 14.2.34 CASE 1:
+		 * callback update
+		 */
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
@@ -918,11 +866,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	    ((conf && unconf) && 
 	     (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
 	      !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
-		/* CASE 2:
-		 * conf record that matches input clientid.
-		 * if unconf record matches input clientid, then
-		 * unconf->cl_name or unconf->cl_verifier don't match the
-		 * conf record.
+		/*
+		 * RFC 3530 14.2.34 CASE 2:
+		 * probable retransmitted request; play it safe and
+		 * do nothing.
 		 */
 		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
 			status = nfserr_clid_inuse;
@@ -930,10 +877,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfs_ok;
 	} else if (!conf && unconf
 			&& same_verf(&unconf->cl_confirm, &confirm)) {
-		/* CASE 3:
-		 * conf record not found.
-		 * unconf record found.
-		 * unconf->cl_confirm matches input confirm
+		/*
+		 * RFC 3530 14.2.34 CASE 3:
+		 * Normal case; new or rebooted client:
 		 */
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
 			status = nfserr_clid_inuse;
@@ -954,11 +900,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
 	    && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
 				    				&confirm)))) {
-		/* CASE 4:
-		 * conf record not found, or if conf, conf->cl_confirm does not
-		 * match input confirm.
-		 * unconf record not found, or if unconf, unconf->cl_confirm
-		 * does not match input confirm.
+		/*
+		 * RFC 3530 14.2.34 CASE 4:
+		 * Client probably hasn't noticed that we rebooted yet.
 		 */
 		status = nfserr_stale_clientid;
 	} else {
-- 
cgit v1.2.3


From 49ba87811f34a0219dc7a373cd24aa68450f2058 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 19 Nov 2007 19:09:50 -0500
Subject: nfsd: eliminate final bogus case from setclientid logic

We're supposed to generate a different cl_confirm verifier for each new
client, so these two cl_confirm values should never be the same.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 242fee7c1018..035e70a01027 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -769,7 +769,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
-	} else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
+	} else {
 		/*
 		 * RFC 3530 14.2.33 CASE 3:
 		 * probable client reboot; state will be removed if
@@ -780,11 +780,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (new == NULL)
 			goto out;
 		gen_clid(new);
-	} else {
-		/* No cases hit !!! */
-		status = nfserr_inval;
-		goto out;
-
 	}
 	copy_verf(new, &clverifier);
 	new->cl_addr = sin->sin_addr.s_addr;
-- 
cgit v1.2.3


From deda2faa8e71474c828d8eefc8bc0f19d02062ef Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 19 Nov 2007 20:31:04 -0500
Subject: nfsd: uniquify cl_confirm values

Using a counter instead of the nanoseconds value seems more likely to
produce a unique cl_confirm.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 035e70a01027..9f6322e830fa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -491,15 +491,14 @@ gen_clid(struct nfs4_client *clp) {
 	clp->cl_clientid.cl_id = current_clientid++; 
 }
 
-static void
-gen_confirm(struct nfs4_client *clp) {
-	struct timespec 	tv;
-	u32 *			p;
+static void gen_confirm(struct nfs4_client *clp)
+{
+	static u32 i;
+	u32 *p;
 
-	tv = CURRENT_TIME;
 	p = (u32 *)clp->cl_confirm.data;
-	*p++ = tv.tv_sec;
-	*p++ = tv.tv_nsec;
+	*p++ = get_seconds();
+	*p++ = i++;
 }
 
 static int
-- 
cgit v1.2.3


From f394baad139f8a67a40b4246d53d3b818af2eb88 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 20 Nov 2007 15:39:07 -0500
Subject: nfsd4: kill unnecessary same_name() in setclientid_confirm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If conf and unconf are both found in the lookup by cl_clientid, then
they share the same cl_clientid.  We always create a unique new
cl_clientid field when creating a new client--the only exception is the
"probable callback update" case in setclientid, where we copy the old
cl_clientid from another clientid with the same name.

Therefore two clients with the same cl_clientid field also always share
the same cl_name field, and a couple of the checks here are redundant.

Thanks to Simon Holm Thøgersen for a compile fix.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Simon Holm Thøgersen <odie@cs.aau.dk>
---
 fs/nfsd/nfs4state.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9f6322e830fa..df3e7a7ad31e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -838,7 +838,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	if ((conf && unconf) && 
 	    (same_verf(&unconf->cl_confirm, &confirm)) &&
 	    (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
-	    (same_name(conf->cl_recdir,unconf->cl_recdir))  &&
 	    (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
 		/*
 		 * RFC 3530 14.2.34 CASE 1:
@@ -858,8 +857,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		}
 	} else if ((conf && !unconf) ||
 	    ((conf && unconf) && 
-	     (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
-	      !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
+	     !same_verf(&conf->cl_verifier, &unconf->cl_verifier))) {
 		/*
 		 * RFC 3530 14.2.34 CASE 2:
 		 * probable retransmitted request; play it safe and
-- 
cgit v1.2.3


From f3aba4e5a1b963c8bd43394cb15fb9fb6a229cd2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 20 Nov 2007 16:52:07 -0500
Subject: nfsd4: remove unnecessary cl_verifier check from setclientid_confirm

Again, the only way conf and unconf can have the same clientid is if
they were created in the "probable callback update" case of setclientid,
in which case we already know that the cl_verifier fields must agree.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index df3e7a7ad31e..23b5fc71f9fb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -837,7 +837,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	 */
 	if ((conf && unconf) && 
 	    (same_verf(&unconf->cl_confirm, &confirm)) &&
-	    (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
 	    (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
 		/*
 		 * RFC 3530 14.2.34 CASE 1:
@@ -855,9 +854,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			status = nfs_ok;
 
 		}
-	} else if ((conf && !unconf) ||
-	    ((conf && unconf) && 
-	     !same_verf(&conf->cl_verifier, &unconf->cl_verifier))) {
+	} else if (conf && !unconf) {
 		/*
 		 * RFC 3530 14.2.34 CASE 2:
 		 * probable retransmitted request; play it safe and
-- 
cgit v1.2.3


From 366e0c1d9116ed03320779ecf9c162204f4c712e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 20 Nov 2007 15:54:10 -0500
Subject: nfsd4: kill unneeded cl_confirm check

We generate a unique cl_confirm for every new client; so if we've
already checked that this cl_confirm agrees with the cl_confirm of
unconf, then we already know that it does not agree with the cl_confirm
of conf.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 23b5fc71f9fb..60cc937b7076 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -835,9 +835,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 	 * SETCLIENTID_CONFIRM request processing consisting
 	 * of 4 bullet points, labeled as CASE1 - CASE4 below.
 	 */
-	if ((conf && unconf) && 
-	    (same_verf(&unconf->cl_confirm, &confirm)) &&
-	    (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
+	if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
 		/*
 		 * RFC 3530 14.2.34 CASE 1:
 		 * callback update
-- 
cgit v1.2.3


From 99d965eda736b839a63fe85438ee03a0f660053c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 21 Nov 2007 14:10:07 -0500
Subject: nfsd: fix encode_entryplus_baggage() indentation

Fix bizarre indentation.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs3xdr.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 4b1ffe3be7e2..d7647f70e02b 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -817,11 +817,11 @@ static __be32 *
 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
 		struct svc_fh *fhp)
 {
-		p = encode_post_op_attr(cd->rqstp, p, fhp);
-		*p++ = xdr_one;			/* yes, a file handle follows */
-		p = encode_fh(p, fhp);
-		fh_put(fhp);
-		return p;
+	p = encode_post_op_attr(cd->rqstp, p, fhp);
+	*p++ = xdr_one;			/* yes, a file handle follows */
+	p = encode_fh(p, fhp);
+	fh_put(fhp);
+	return p;
 }
 
 static int
-- 
cgit v1.2.3


From 5ec7b46c2f4a6f5e136188d598a3f9912ca922e9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 21 Nov 2007 21:58:56 -0500
Subject: nfsd4: make current_clientid local

Declare this variable in the one function where it's used, and clean up
some minor style problems.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 60cc937b7076..78b9139cdd0f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@ static time_t lease_time = 90;     /* default lease time */
 static time_t user_lease_time = 90;
 static time_t boot_time;
 static int in_grace = 1;
-static u32 current_clientid = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
@@ -485,8 +484,10 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
 	return cr1->cr_uid == cr2->cr_uid;
 }
 
-static void
-gen_clid(struct nfs4_client *clp) {
+static void gen_clid(struct nfs4_client *clp)
+{
+	static u32 current_clientid = 1;
+
 	clp->cl_clientid.cl_boot = boot_time;
 	clp->cl_clientid.cl_id = current_clientid++; 
 }
-- 
cgit v1.2.3


From 35bba9a37e68c68a820a1a772f016255c0838f79 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 21 Nov 2007 22:07:08 -0500
Subject: nfsd4: miscellaneous nfs4state.c style fixes

Fix various minor style violations.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 55 +++++++++++++++++++++++++----------------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 78b9139cdd0f..b9d395856b3a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -339,21 +339,20 @@ STALE_CLIENTID(clientid_t *clid)
  * This type of memory management is somewhat inefficient, but we use it
  * anyway since SETCLIENTID is not a common operation.
  */
-static inline struct nfs4_client *
-alloc_client(struct xdr_netobj name)
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
 {
 	struct nfs4_client *clp;
 
-	if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
-		if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
-			memcpy(clp->cl_name.data, name.data, name.len);
-			clp->cl_name.len = name.len;
-		}
-		else {
-			kfree(clp);
-			clp = NULL;
-		}
+	clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+	if (clp == NULL)
+		return NULL;
+	clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
+	if (clp->cl_name.data == NULL) {
+		kfree(clp);
+		return NULL;
 	}
+	memcpy(clp->cl_name.data, name.data, name.len);
+	clp->cl_name.len = name.len;
 	return clp;
 }
 
@@ -421,12 +420,13 @@ expire_client(struct nfs4_client *clp)
 	put_nfs4_client(clp);
 }
 
-static struct nfs4_client *
-create_client(struct xdr_netobj name, char *recdir) {
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
+{
 	struct nfs4_client *clp;
 
-	if (!(clp = alloc_client(name)))
-		goto out;
+	clp = alloc_client(name);
+	if (clp == NULL)
+		return NULL;
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_count, 1);
 	atomic_set(&clp->cl_callback.cb_set, 0);
@@ -435,32 +435,30 @@ create_client(struct xdr_netobj name, char *recdir) {
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
-out:
 	return clp;
 }
 
-static void
-copy_verf(struct nfs4_client *target, nfs4_verifier *source) {
-	memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data));
+static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+{
+	memcpy(target->cl_verifier.data, source->data,
+			sizeof(target->cl_verifier.data));
 }
 
-static void
-copy_clid(struct nfs4_client *target, struct nfs4_client *source) {
+static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
+{
 	target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; 
 	target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
 }
 
-static void
-copy_cred(struct svc_cred *target, struct svc_cred *source) {
-
+static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
 	target->cr_uid = source->cr_uid;
 	target->cr_gid = source->cr_gid;
 	target->cr_group_info = source->cr_group_info;
 	get_group_info(target->cr_group_info);
 }
 
-static inline int
-same_name(const char *n1, const char *n2)
+static int same_name(const char *n1, const char *n2)
 {
 	return 0 == memcmp(n1, n2, HEXDIR_LEN);
 }
@@ -502,9 +500,8 @@ static void gen_confirm(struct nfs4_client *clp)
 	*p++ = i++;
 }
 
-static int
-check_name(struct xdr_netobj name) {
-
+static int check_name(struct xdr_netobj name)
+{
 	if (name.len == 0) 
 		return 0;
 	if (name.len > NFS4_OPAQUE_LIMIT) {
-- 
cgit v1.2.3


From 404ec117be5d36e1a4c4582d0c518594333e32df Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 23 Nov 2007 22:26:18 -0500
Subject: nfsd4: recognize callback channel failure earlier

When the callback channel fails, we inform the client of that by
returning a cb_path_down error the next time it tries to renew its
lease.

If we wait most of a lease period before deciding that a callback has
failed and that the callback channel is down, then we decrease the
chances that the client will find out in time to do anything about it.

So, mark the channel down as soon as we recognize that an rpc has
failed.  However, continue trying to recall delegations anyway, in hopes
it will come back up.  This will prevent more delegations from being
given out, and ensure cb_path_down is returned to renew calls earlier,
while still making the best effort to deliver recalls of existing
delegations.

Also fix a couple comments and remove a dprintk that doesn't seem likely
to be useful.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 13 +++++--------
 fs/nfsd/nfs4state.c    |  5 ++++-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6eb5cd2381ab..aae2b29ae2c9 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -457,9 +457,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 	int retries = 1;
 	int status = 0;
 
-	if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt)
-		return;
-
 	cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
 	cbr->cbr_dp = dp;
 
@@ -468,6 +465,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 		switch (status) {
 			case -EIO:
 				/* Network partition? */
+				atomic_set(&clp->cl_callback.cb_set, 0);
 			case -EBADHANDLE:
 			case -NFS4ERR_BAD_STATEID:
 				/* Race: client probably got cb_recall
@@ -480,11 +478,10 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
 		status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
 	}
 out_put_cred:
-	if (status == -EIO)
-		atomic_set(&clp->cl_callback.cb_set, 0);
-	/* Success or failure, now we're either waiting for lease expiration
-	 * or deleg_return. */
-	dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count));
+	/*
+	 * Success or failure, now we're either waiting for lease expiration
+	 * or deleg_return.
+	 */
 	put_nfs4_client(clp);
 	nfs4_put_delegation(dp);
 	return;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b9d395856b3a..11aa4b6b4fa2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -361,8 +361,11 @@ shutdown_callback_client(struct nfs4_client *clp)
 {
 	struct rpc_clnt *clnt = clp->cl_callback.cb_client;
 
-	/* shutdown rpc client, ending any outstanding recall rpcs */
 	if (clnt) {
+		/*
+		 * Callback threads take a reference on the client, so there
+		 * should be no outstanding callbacks at this point.
+		 */
 		clp->cl_callback.cb_client = NULL;
 		rpc_shutdown_client(clnt);
 	}
-- 
cgit v1.2.3


From b7e6b86948df8d08d420558212e09eb449be9bfa Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Mon, 26 Nov 2007 13:35:11 -0500
Subject: lockd: fix reference count leaks in async locking case

In a number of places where we wish only to translate nlm_drop_reply to
rpc_drop_reply errors we instead return early with rpc_drop_reply,
skipping some important end-of-function cleanup.

This results in reference count leaks when lockd is doing posix locking
on GFS2.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc4proc.c | 20 ++++++++++++--------
 fs/lockd/svcproc.c  | 22 +++++++++++++---------
 2 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bf27b6c6cb6b..385437e3387d 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -84,6 +84,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	int rc = rpc_success;
 
 	dprintk("lockd: TEST4        called\n");
 	resp->cookie = argp->cookie;
@@ -91,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept test requests during grace period */
 	if (nlmsvc_grace_period) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -101,12 +102,13 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now check for conflicting locks */
 	resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
 
-	dprintk("lockd: TEST4          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
@@ -115,6 +117,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	int rc = rpc_success;
 
 	dprintk("lockd: LOCK          called\n");
 
@@ -123,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept new lock requests during grace period */
 	if (nlmsvc_grace_period && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -146,12 +149,13 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
 					argp->block, &argp->cookie);
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
-	dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 9cd5c8b37593..88379cc6e0b1 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -113,6 +113,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	int rc = rpc_success;
 
 	dprintk("lockd: TEST          called\n");
 	resp->cookie = argp->cookie;
@@ -120,7 +121,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept test requests during grace period */
 	if (nlmsvc_grace_period) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -130,13 +131,14 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Now check for conflicting locks */
 	resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: TEST          status %d vers %d\n",
+			ntohl(resp->status), rqstp->rq_vers);
 
-	dprintk("lockd: TEST          status %d vers %d\n",
-		ntohl(resp->status), rqstp->rq_vers);
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
@@ -145,6 +147,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	int rc = rpc_success;
 
 	dprintk("lockd: LOCK          called\n");
 
@@ -153,7 +156,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	/* Don't accept new lock requests during grace period */
 	if (nlmsvc_grace_period && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
-		return rpc_success;
+		return rc;
 	}
 
 	/* Obtain client and file */
@@ -176,12 +179,13 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
 					       argp->block, &argp->cookie));
 	if (resp->status == nlm_drop_reply)
-		return rpc_drop_reply;
+		rc = rpc_drop_reply;
+	else
+		dprintk("lockd: LOCK         status %d\n", ntohl(resp->status));
 
-	dprintk("lockd: LOCK          status %d\n", ntohl(resp->status));
 	nlm_release_host(host);
 	nlm_release_file(file);
-	return rpc_success;
+	return rc;
 }
 
 static __be32
-- 
cgit v1.2.3


From 39325bd03fc16d903f1e0f51104436d939899c8c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 26 Nov 2007 17:06:39 -0500
Subject: nfsd4: fix bad seqid on lock request incompatible with open mode

The failure to return a stateowner from nfs4_preprocess_seqid_op() in
the case where a lock request is of a type incompatible with an open
(due to, e.g., an application attempting a write lock on a file open for
read) means that fs/nfsd/nfs4xdr.c:ENCODE_SEQID_OP_TAIL() never bumps
the seqid as it should.  The client, attempting to close the file
afterwards, then gets an (incorrect) bad sequence id error.  Worse, this
prevents the open file from ever being closed, so we leak state.

Thanks to Benny Halevy and Trond Myklebust for analysis, and to Steven
Wilton for the report and extensive data-gathering.

Cc: Benny Halevy <bhalevy@panasas.com>
Cc: Steven Wilton <steven.wilton@team.eftel.com.au>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 11aa4b6b4fa2..c4b10a1e6c30 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2093,8 +2093,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		goto check_replay;
 	}
 
+	*stpp = stp;
+	*sopp = sop = stp->st_stateowner;
+
 	if (lock) {
-		struct nfs4_stateowner *sop = stp->st_stateowner;
 		clientid_t *lockclid = &lock->v.new.clientid;
 		struct nfs4_client *clp = sop->so_client;
 		int lkflg = 0;
@@ -2124,9 +2126,6 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		return nfserr_bad_stateid;
 	}
 
-	*stpp = stp;
-	*sopp = sop = stp->st_stateowner;
-
 	/*
 	*  We now validate the seqid and stateid generation numbers.
 	*  For the moment, we ignore the possibility of 
-- 
cgit v1.2.3


From 406a7ea97d9dc1a9348ba92c4cd0e7c678185c4c Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilzlnx@us.ibm.com>
Date: Tue, 27 Nov 2007 11:34:05 -0800
Subject: nfsd: Allow AIX client to read dir containing mountpoints

This patch addresses a compatibility issue with a Linux NFS server and
AIX NFS client.

I have exported /export as fsid=0 with sec=krb5:krb5i
I have mount --bind /home onto /export/home
I have exported /export/home with sec=krb5i

The AIX client mounts / -o sec=krb5:krb5i onto /mnt

If I do an ls /mnt, the AIX client gets a permission error. Looking at
the network trace, we see a READDIR looking for attributes
FATTR4_RDATTR_ERROR and FATTR4_MOUNTED_ON_FILEID. The response gives a
NFS4ERR_WRONGSEC which the AIX client is not expecting.

Since the AIX client is only asking for an attribute that is an
attribute of the parent file system (pseudo root in my example), it
seems reasonable that there should not be an error.

In discussing this issue with Bruce Fields, I initially proposed
ignoring the error in nfsd4_encode_dirent_fattr() if all that was being
asked for was FATTR4_RDATTR_ERROR and FATTR4_MOUNTED_ON_FILEID, however,
Bruce suggested that we avoid calling cross_mnt() if only these
attributes are requested.

The following patch implements bypassing cross_mnt() if only
FATTR4_RDATTR_ERROR and FATTR4_MOUNTED_ON_FILEID are requested. Since there
is some complexity in the code in nfsd4_encode_fattr(), I didn't want to
duplicate code (and introduce a maintenance nightmare), so I added a
parameter to nfsd4_encode_fattr() that indicates whether it should
ignore cross mounts and simply fill in the attribute using the passed in
dentry as opposed to its parent.

Signed-off-by: Frank Filz <ffilzlnx@us.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4proc.c |  2 +-
 fs/nfsd/nfs4xdr.c  | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 18ead1790bb3..c593db047d8b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -750,7 +750,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				    cstate->current_fh.fh_export,
 				    cstate->current_fh.fh_dentry, buf,
 				    &count, verify->ve_bmval,
-				    rqstp);
+				    rqstp, 0);
 
 	/* this means that nfsd4_encode_fattr() ran out of space */
 	if (status == nfserr_resource && count == 0)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index bf1e792a65a0..b0592e7c378d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1453,7 +1453,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
 __be32
 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
-		struct svc_rqst *rqstp)
+		struct svc_rqst *rqstp, int ignore_crossmnt)
 {
 	u32 bmval0 = bmval[0];
 	u32 bmval1 = bmval[1];
@@ -1833,7 +1833,12 @@ out_acl:
 	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
 		if ((buflen -= 8) < 0)
                 	goto out_resource;
-		if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
+		/*
+		 * Get parent's attributes if not ignoring crossmount
+		 * and this is the root of a cross-mounted filesystem.
+		 */
+		if (ignore_crossmnt == 0 &&
+		    exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
 			err = vfs_getattr(exp->ex_mnt->mnt_parent,
 				exp->ex_mnt->mnt_mountpoint, &stat);
 			if (err)
@@ -1869,13 +1874,25 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
 	struct svc_export *exp = cd->rd_fhp->fh_export;
 	struct dentry *dentry;
 	__be32 nfserr;
+	int ignore_crossmnt = 0;
 
 	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 
 	exp_get(exp);
-	if (d_mountpoint(dentry)) {
+	/*
+	 * In the case of a mountpoint, the client may be asking for
+	 * attributes that are only properties of the underlying filesystem
+	 * as opposed to the cross-mounted file system. In such a case,
+	 * we will not follow the cross mount and will fill the attribtutes
+	 * directly from the mountpoint dentry.
+	 */
+	if (d_mountpoint(dentry) &&
+	    (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 &&
+	    (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0)
+		ignore_crossmnt = 1;
+	else if (d_mountpoint(dentry)) {
 		int err;
 
 		/*
@@ -1894,7 +1911,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
 
 	}
 	nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
-					cd->rd_rqstp);
+					cd->rd_rqstp, ignore_crossmnt);
 out_put:
 	dput(dentry);
 	exp_put(exp);
@@ -2048,7 +2065,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 	buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
 	nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
 				    resp->p, &buflen, getattr->ga_bmval,
-				    resp->rqstp);
+				    resp->rqstp, 0);
 	if (!nfserr)
 		resp->p += buflen;
 	return nfserr;
-- 
cgit v1.2.3


From 29dbf546159f5701e11de26fa2da5c4a962e0f83 Mon Sep 17 00:00:00 2001
From: Oleg Drokin <Oleg.Drokin@Sun.COM>
Date: Thu, 29 Nov 2007 14:02:21 -0500
Subject: lockd: fix a leak in nlmsvc_testlock asynchronous request handling

Without the patch, there is a leak of the nlmblock structure refcount,
which holds a reference to the nlmfile structure, which in turn holds a
reference to struct file, when async GETFL is used (-EINPROGRESS return
from file_ops->lock()), and also in some error cases.

Fix up a style nit while we're here.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svclock.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d120ec39bcb0..84c4d5e04ebb 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -501,25 +501,29 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 			block, block->b_flags, block->b_fl);
 		if (block->b_flags & B_TIMED_OUT) {
 			nlmsvc_unlink_block(block);
-			return nlm_lck_denied;
+			ret = nlm_lck_denied;
+			goto out;
 		}
 		if (block->b_flags & B_GOT_CALLBACK) {
 			if (block->b_fl != NULL
 					&& block->b_fl->fl_type != F_UNLCK) {
 				lock->fl = *block->b_fl;
 				goto conf_lock;
-			}
-			else {
+			} else {
 				nlmsvc_unlink_block(block);
-				return nlm_granted;
+				ret = nlm_granted;
+				goto out;
 			}
 		}
-		return nlm_drop_reply;
+		ret = nlm_drop_reply;
+		goto out;
 	}
 
 	error = vfs_test_lock(file->f_file, &lock->fl);
-	if (error == -EINPROGRESS)
-		return nlmsvc_defer_lock_rqst(rqstp, block);
+	if (error == -EINPROGRESS) {
+		ret = nlmsvc_defer_lock_rqst(rqstp, block);
+		goto out;
+	}
 	if (error) {
 		ret = nlm_lck_denied_nolocks;
 		goto out;
-- 
cgit v1.2.3


From 5c002b3bb294a637312cab7ad92a0deafa05a758 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Nov 2007 16:55:23 -0500
Subject: nfsd: allow root to set uid and gid on create

The server silently ignores attempts to set the uid and gid on create.
Based on the comment, this appears to have been done to prevent some
overly-clever IRIX client from causing itself problems.

Perhaps we should remove that hack completely.  For now, at least, it
makes sense to allow root (when no_root_squash is set) to set uid and
gid.

While we're there, since nfsd_create and nfsd_create_v3 share the same
logic, pull that out into a separate function.  And spell out the
individual modifications of ia_valid instead of doing them both at once
inside a conditional.

Thanks to Roger Willcocks <roger@filmlight.ltd.uk> for the bug report
and original patch on which this is based.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 755ba43c13e1..cc75e4fcd02b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1151,6 +1151,26 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 #endif /* CONFIG_NFSD_V3 */
 
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+			struct iattr *iap)
+{
+	/*
+	 * Mode has already been set earlier in create:
+	 */
+	iap->ia_valid &= ~ATTR_MODE;
+	/*
+	 * Setting uid/gid works only for root.  Irix appears to
+	 * send along the gid on create when it tries to implement
+	 * setgid directories via NFS:
+	 */
+	if (current->fsuid != 0)
+		iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+	if (iap->ia_valid)
+		return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+	return 0;
+}
+
 /*
  * Create a file (regular, directory, device, fifo); UNIX sockets 
  * not yet implemented.
@@ -1167,6 +1187,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct dentry	*dentry, *dchild = NULL;
 	struct inode	*dirp;
 	__be32		err;
+	__be32		err2;
 	int		host_err;
 
 	err = nfserr_perm;
@@ -1257,16 +1278,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 
 
-	/* Set file attributes. Mode has already been set and
-	 * setting uid/gid works only for root. Irix appears to
-	 * send along the gid when it tries to implement setgid
-	 * directories via NFS.
-	 */
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
-		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
-		if (err2)
-			err = err2;
-	}
+	err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+	if (err2)
+		err = err2;
 	/*
 	 * Update the file handle to get the new inode info.
 	 */
@@ -1295,6 +1309,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct dentry	*dentry, *dchild = NULL;
 	struct inode	*dirp;
 	__be32		err;
+	__be32		err2;
 	int		host_err;
 	__u32		v_mtime=0, v_atime=0;
 
@@ -1399,16 +1414,10 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		iap->ia_atime.tv_nsec = 0;
 	}
 
-	/* Set file attributes.
-	 * Irix appears to send along the gid when it tries to
-	 * implement setgid directories via NFS. Clear out all that cruft.
-	 */
  set_attr:
-	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
- 		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
-		if (err2)
-			err = err2;
-	}
+	err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+	if (err2)
+		err = err2;
 
 	/*
 	 * Update the filehandle to get the new inode info.
-- 
cgit v1.2.3


From 8838dc43d6544570e8969a74ddc4a0d21abffde6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 14 Jan 2008 13:12:19 -0500
Subject: nfsd4: clean up access_valid, deny_valid checks.

Document these checks a little better and inline, as suggested by Neil
Brown (note both functions have two callers).  Remove an obviously bogus
check while we're there (checking whether unsigned value is negative).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Neil Brown <neilb@suse.de>
---
 fs/nfsd/nfs4state.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c4b10a1e6c30..f6744bc03dae 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1157,14 +1157,19 @@ find_file(struct inode *ino)
 	return NULL;
 }
 
-static int access_valid(u32 x)
+static inline int access_valid(u32 x)
 {
-	return (x > 0 && x < 4);
+	if (x < NFS4_SHARE_ACCESS_READ)
+		return 0;
+	if (x > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	return 1;
 }
 
-static int deny_valid(u32 x)
+static inline int deny_valid(u32 x)
 {
-	return (x >= 0 && x < 5);
+	/* Note: unlike access bits, deny bits may be zero. */
+	return x <= NFS4_SHARE_DENY_BOTH;
 }
 
 static void
-- 
cgit v1.2.3


From 54ca95eb362d6988a577965ffb77c08702adb890 Mon Sep 17 00:00:00 2001
From: Oleg Drokin <Oleg.Drokin@Sun.COM>
Date: Fri, 11 Jan 2008 21:57:35 -0500
Subject: Leak in nlmsvc_testlock for async GETFL case

Fix nlm_block leak for the case of supplied blocking lock info.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svclock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 84c4d5e04ebb..2f4d8fa66689 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -505,12 +505,12 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 		}
 		if (block->b_flags & B_GOT_CALLBACK) {
+			nlmsvc_unlink_block(block);
 			if (block->b_fl != NULL
 					&& block->b_fl->fl_type != F_UNLCK) {
 				lock->fl = *block->b_fl;
 				goto conf_lock;
 			} else {
-				nlmsvc_unlink_block(block);
 				ret = nlm_granted;
 				goto out;
 			}
-- 
cgit v1.2.3


From d7c9f1ed972b4a468dd24a2457721704dfe9ca70 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Sun, 30 Dec 2007 21:07:44 -0600
Subject: svc: Change services to use new svc_create_xprt service

Modify the various kernel RPC svcs to use the svc_create_xprt service.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Neil Brown <neilb@suse.de>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c    | 17 ++++++++---------
 fs/nfs/callback.c |  4 ++--
 fs/nfsd/nfssvc.c  |  4 ++--
 3 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 82e2192a0d5c..868691535115 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -219,13 +219,12 @@ lockd(struct svc_rqst *rqstp)
 	module_put_and_exit(0);
 }
 
-
-static int find_socket(struct svc_serv *serv, int proto)
+static int find_xprt(struct svc_serv *serv, char *proto)
 {
 	struct svc_sock *svsk;
 	int found = 0;
 	list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
-		if (svsk->sk_sk->sk_protocol == proto) {
+		if (strcmp(svsk->sk_xprt.xpt_class->xcl_name, proto) == 0) {
 			found = 1;
 			break;
 		}
@@ -243,13 +242,13 @@ static int make_socks(struct svc_serv *serv, int proto)
 	int err = 0;
 
 	if (proto == IPPROTO_UDP || nlm_udpport)
-		if (!find_socket(serv, IPPROTO_UDP))
-			err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport,
-						SVC_SOCK_DEFAULTS);
+		if (!find_xprt(serv, "udp"))
+			err = svc_create_xprt(serv, "udp", nlm_udpport,
+					      SVC_SOCK_DEFAULTS);
 	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
-		if (!find_socket(serv, IPPROTO_TCP))
-			err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport,
-						SVC_SOCK_DEFAULTS);
+		if (!find_xprt(serv, "tcp"))
+			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
+					      SVC_SOCK_DEFAULTS);
 
 	if (err >= 0) {
 		warned = 0;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 9b6bbf1b9787..bd185a572a23 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -119,8 +119,8 @@ int nfs_callback_up(void)
 	if (!serv)
 		goto out_err;
 
-	ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport,
-							SVC_SOCK_ANONYMOUS);
+	ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
+			      SVC_SOCK_ANONYMOUS);
 	if (ret <= 0)
 		goto out_destroy;
 	nfs_callback_tcpport = ret;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1190aeaa92be..a828b0b0fb67 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -236,7 +236,7 @@ static int nfsd_init_socks(int port)
 
 	error = lockd_up(IPPROTO_UDP);
 	if (error >= 0) {
-		error = svc_makesock(nfsd_serv, IPPROTO_UDP, port,
+		error = svc_create_xprt(nfsd_serv, "udp", port,
 					SVC_SOCK_DEFAULTS);
 		if (error < 0)
 			lockd_down();
@@ -247,7 +247,7 @@ static int nfsd_init_socks(int port)
 #ifdef CONFIG_NFSD_TCP
 	error = lockd_up(IPPROTO_TCP);
 	if (error >= 0) {
-		error = svc_makesock(nfsd_serv, IPPROTO_TCP, port,
+		error = svc_create_xprt(nfsd_serv, "tcp", port,
 					SVC_SOCK_DEFAULTS);
 		if (error < 0)
 			lockd_down();
-- 
cgit v1.2.3


From 7a18208383ab3f3ce4a1f4e0536acc9372523d81 Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Sun, 30 Dec 2007 21:07:53 -0600
Subject: svc: Make close transport independent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move sk_list and sk_ready to svc_xprt. This involves close because these
lists are walked by svcs when closing all their transports. So I combined
the moving of these lists to svc_xprt with making close transport independent.

The svc_force_sock_close has been changed to svc_close_all and takes a list
as an argument. This removes some svc internals knowledge from the svcs.

This code races with module removal and transport addition.

Thanks to Simon Holm Thøgersen for a compile fix.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Neil Brown <neilb@suse.de>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Simon Holm Thøgersen <odie@cs.aau.dk>
---
 fs/lockd/svc.c   | 6 +++---
 fs/nfsd/nfssvc.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 868691535115..a8e79a907202 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -221,10 +221,10 @@ lockd(struct svc_rqst *rqstp)
 
 static int find_xprt(struct svc_serv *serv, char *proto)
 {
-	struct svc_sock *svsk;
+	struct svc_xprt *xprt;
 	int found = 0;
-	list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
-		if (strcmp(svsk->sk_xprt.xpt_class->xcl_name, proto) == 0) {
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
+		if (strcmp(xprt->xpt_class->xcl_name, proto) == 0) {
 			found = 1;
 			break;
 		}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a828b0b0fb67..9647b0f7bc0c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -155,8 +155,8 @@ static int killsig;	/* signal that was used to kill last nfsd */
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
-	struct svc_sock *svsk;
-	list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
+	struct svc_xprt *xprt;
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
 		lockd_down();
 	nfsd_serv = NULL;
 	nfsd_racache_shutdown();
-- 
cgit v1.2.3


From 7fcb98d58cb4d18af6386f71025fc5192f25fbca Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Sun, 30 Dec 2007 21:08:33 -0600
Subject: svc: Add svc API that queries for a transport instance

Add a new svc function that allows a service to query whether a
transport instance has already been created. This is used in lockd
to determine whether or not a transport needs to be created when
a lockd instance is brought up.

Specifying 0 for the address family or port is effectively a wild-card,
and will result in matching the first transport in the service's list
that has a matching class name.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Neil Brown <neilb@suse.de>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a8e79a907202..470af0138bb5 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -219,18 +219,6 @@ lockd(struct svc_rqst *rqstp)
 	module_put_and_exit(0);
 }
 
-static int find_xprt(struct svc_serv *serv, char *proto)
-{
-	struct svc_xprt *xprt;
-	int found = 0;
-	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
-		if (strcmp(xprt->xpt_class->xcl_name, proto) == 0) {
-			found = 1;
-			break;
-		}
-	return found;
-}
-
 /*
  * Make any sockets that are needed but not present.
  * If nlm_udpport or nlm_tcpport were set as module
@@ -242,11 +230,11 @@ static int make_socks(struct svc_serv *serv, int proto)
 	int err = 0;
 
 	if (proto == IPPROTO_UDP || nlm_udpport)
-		if (!find_xprt(serv, "udp"))
+		if (!svc_find_xprt(serv, "udp", 0, 0))
 			err = svc_create_xprt(serv, "udp", nlm_udpport,
 					      SVC_SOCK_DEFAULTS);
 	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
-		if (!find_xprt(serv, "tcp"))
+		if (!svc_find_xprt(serv, "tcp", 0, 0))
 			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
 					      SVC_SOCK_DEFAULTS);
 
-- 
cgit v1.2.3


From a217813f9067b785241cb7f31956e51d2071703a Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Sun, 30 Dec 2007 21:08:35 -0600
Subject: knfsd: Support adding transports by writing portlist file

Update the write handler for the portlist file to allow creating new
listening endpoints on a transport. The general form of the string is:

<transport_name><space><port number>

For example:

echo "tcp 2049" > /proc/fs/nfsd/portlist

This is intended to support the creation of a listening endpoint for
RDMA transports without adding #ifdef code to the nfssvc.c file.

Transports can also be removed as follows:

'-'<transport_name><space><port number>

For example:

echo "-tcp 2049" > /proc/fs/nfsd/portlist

Attempting to add a listener with an invalid transport string results
in EPROTONOSUPPORT and a perror string of "Protocol not supported".

Attempting to remove a non-existent listener (e.g. bad proto or port)
results in ENOTCONN and a perror string of
"Transport endpoint is not connected"

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Neil Brown <neilb@suse.de>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c   | 18 +++++++++++++-----
 fs/nfsd/nfsctl.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 470af0138bb5..08226464e563 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -227,17 +227,25 @@ lockd(struct svc_rqst *rqstp)
 static int make_socks(struct svc_serv *serv, int proto)
 {
 	static int warned;
+	struct svc_xprt *xprt;
 	int err = 0;
 
-	if (proto == IPPROTO_UDP || nlm_udpport)
-		if (!svc_find_xprt(serv, "udp", 0, 0))
+	if (proto == IPPROTO_UDP || nlm_udpport) {
+		xprt = svc_find_xprt(serv, "udp", 0, 0);
+		if (!xprt)
 			err = svc_create_xprt(serv, "udp", nlm_udpport,
 					      SVC_SOCK_DEFAULTS);
-	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
-		if (!svc_find_xprt(serv, "tcp", 0, 0))
+		else
+			svc_xprt_put(xprt);
+	}
+	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+		xprt = svc_find_xprt(serv, "tcp", 0, 0);
+		if (!xprt)
 			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
 					      SVC_SOCK_DEFAULTS);
-
+		else
+			svc_xprt_put(xprt);
+	}
 	if (err >= 0) {
 		warned = 0;
 		err = 0;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4aba92698581..eff6a6b4c2f6 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -540,7 +540,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		}
 		return err < 0 ? err : 0;
 	}
-	if (buf[0] == '-') {
+	if (buf[0] == '-' && isdigit(buf[1])) {
 		char *toclose = kstrdup(buf+1, GFP_KERNEL);
 		int len = 0;
 		if (!toclose)
@@ -554,6 +554,53 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		kfree(toclose);
 		return len;
 	}
+	/*
+	 * Add a transport listener by writing its transport name
+	 */
+	if (isalpha(buf[0])) {
+		int err;
+		char transport[16];
+		int port;
+		if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+			err = nfsd_create_serv();
+			if (!err) {
+				err = svc_create_xprt(nfsd_serv,
+						      transport, port,
+						      SVC_SOCK_ANONYMOUS);
+				if (err == -ENOENT)
+					/* Give a reasonable perror msg for
+					 * bad transport string */
+					err = -EPROTONOSUPPORT;
+			}
+			return err < 0 ? err : 0;
+		}
+	}
+	/*
+	 * Remove a transport by writing its transport name and port number
+	 */
+	if (buf[0] == '-' && isalpha(buf[1])) {
+		struct svc_xprt *xprt;
+		int err = -EINVAL;
+		char transport[16];
+		int port;
+		if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
+			if (port == 0)
+				return -EINVAL;
+			lock_kernel();
+			if (nfsd_serv) {
+				xprt = svc_find_xprt(nfsd_serv, transport,
+						     AF_UNSPEC, port);
+				if (xprt) {
+					svc_close_xprt(xprt);
+					svc_xprt_put(xprt);
+					err = 0;
+				} else
+					err = -ENOTCONN;
+			}
+			unlock_kernel();
+			return err < 0 ? err : 0;
+		}
+	}
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From 9571af18fa1e4a431dc6f6023ddbd87d1112fd5d Mon Sep 17 00:00:00 2001
From: Tom Tucker <tom@opengridcomputing.com>
Date: Sun, 30 Dec 2007 21:08:37 -0600
Subject: svc: Add svc_xprt_names service to replace svc_sock_names

Create a transport independent version of the svc_sock_names function.

The toclose capability of the svc_sock_names service can be implemented
using the svc_find_xprt and svc_close_xprt services.

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Neil Brown <neilb@suse.de>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index eff6a6b4c2f6..bc22e0b0343a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -503,7 +503,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		int len = 0;
 		lock_kernel();
 		if (nfsd_serv)
-			len = svc_sock_names(buf, nfsd_serv, NULL);
+			len = svc_xprt_names(nfsd_serv, buf, 0);
 		unlock_kernel();
 		return len;
 	}
-- 
cgit v1.2.3


From f7b8066f9ff68016489ff6f9fb358aa59bd14e1b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 21 Jan 2008 12:20:45 -0500
Subject: knfsd: don't bother mapping putrootfh enoent to eperm

Neither EPERM nor ENOENT maps to a valid error for PUTROOTFH according to
rfc 3530, and, if anything, ENOENT is likely to be slightly more
informative; so don't bother mapping ENOENT to EPERM.  (Probably this
was originally done because one likely cause was that there is an fsid=0
export but that it isn't permitted to this particular client.  Now that
we allow WRONGSEC returns, this is somewhat less likely.)

In the long term we should work to make this situation less likely,
perhaps by turning off nfsv4 service entirely in the absence of the
pseudofs root, or constructing a pseudofilesystem root ourselves in the
kernel as necessary.

Thanks to Benny Halevy <bhalevy@panasas.com> for pointing out this
problem.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Benny Halevy <bhalevy@panasas.com>
---
 fs/nfsd/export.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cbbc594ef592..79b4bf812960 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1357,8 +1357,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
 	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
-	if (PTR_ERR(exp) == -ENOENT)
-		return nfserr_perm;
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
-- 
cgit v1.2.3


From 50431d94e732ba71b66a83c5435890728e313095 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 31 Aug 2007 17:09:33 -0400
Subject: lockd: minor log message fix

Wendy Cheng noticed that the function name printed in the debug message
doesn't match the name of the function it is in, nlm_lookup_file.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Wendy Cheng <wcheng@redhat.com>
---
 fs/lockd/svcsubs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 84ebba33b98d..dbbefbcd6712 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -87,7 +87,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 	unsigned int	hash;
 	__be32		nfserr;
 
-	nlm_debug_print_fh("nlm_file_lookup", f);
+	nlm_debug_print_fh("nlm_lookup_file", f);
 
 	hash = file_hash(f);
 
-- 
cgit v1.2.3


From 87d26ea7771ad637035e6bd5a2700d81ee9162da Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 22 Jan 2008 17:40:42 -0500
Subject: nfsd: more careful input validation in nfsctl write methods

Neil Brown points out that we're checking buf[size-1] in a couple places
without first checking whether size is zero.

Actually, given the implementation of simple_transaction_get(), buf[-1]
is zero, so in both of these cases the subsequent check of the value of
buf[size-1] will catch this case.

But it seems fragile to depend on that, so add explicit checks for this
case.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Acked-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfsctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc22e0b0343a..8516137cdbb0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -304,6 +304,9 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	struct auth_domain *dom;
 	struct knfsd_fh fh;
 
+	if (size == 0)
+		return -EINVAL;
+
 	if (buf[size-1] != '\n')
 		return -EINVAL;
 	buf[size-1] = 0;
@@ -663,7 +666,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 	char *recdir;
 	int len, status;
 
-	if (size > PATH_MAX || buf[size-1] != '\n')
+	if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
 		return -EINVAL;
 	buf[size-1] = 0;
 
-- 
cgit v1.2.3


From d801b861681116ea23a7fb87a70bf463d29c8b9c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 29 Jan 2008 10:30:55 -0500
Subject: NLM: tear down RPC clients in nlm_shutdown_hosts

It's possible for an RPC to outlive the lockd daemon that created it, so
we need to make sure that all RPCs are killed when lockd is coming
down. When nlm_shutdown_hosts is called, kill off all RPC tasks
associated with the host. Since we need to wait until they have all gone
away, we might as well just shut down the RPC client altogether.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ebec0098efbf..ca6b16fc3101 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -379,8 +379,13 @@ nlm_shutdown_hosts(void)
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash)
+		hlist_for_each_entry(host, pos, chain, h_hash) {
 			host->h_expires = jiffies - 1;
+			if (host->h_rpcclnt) {
+				rpc_shutdown_client(host->h_rpcclnt);
+				host->h_rpcclnt = NULL;
+			}
+		}
 	}
 
 	/* Then, perform a garbage collection pass */
-- 
cgit v1.2.3