exofs: Write sbi->s_nextid as part of the Create command

Before when creating a new inode, we'd set the sb->s_dirt flag, and sometime later the system would write out s_nextid as part of the sb_info. Also on inode sync we would force the sb sync as well. Define the s_nextid as a new partition attribute and set it every time we create a new object. At mount we read it from it's new place. We now never set sb->s_dirt anywhere in exofs. write_super is actually never called. The call to exofs_write_super from exofs_put_super is also removed because the VFS always calls ->sync_fs before calling ->put_super twice. To stay backward-and-forward compatible we also write the old s_nextid in the super_block object at unmount, and support zero length attribute on mount. This also fixes a BUG where in layouts when group_width was not a divisor of EXOFS_SUPER_ID (0x10000) the s_nextid was not read from the device it was written to. Because of the sliding window layout trick, and because the read was always done from the 0 device but the write was done via the raid engine that might slide the device view. Now we read and write through the raid engine. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
author: Boaz Harrosh <bharrosh@panasas.com> 2011-02-03 17:53:25 +0200
committer: Boaz Harrosh <bharrosh@panasas.com> 2011-03-15 15:02:51 +0200
commit: 1cea312ad49d9cb964179a784fedb1fcfe396283 (patch)
tree: 27c45af006b48b1a079698605ea9007398f652b5
parent: 9ed96484311b89360b80a4181d856cbdb21630fd (diff)
5 files changed, 141 insertions, 31 deletions
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8b..5e74ad3d4009 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
 #define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
 
 /* exofs Application specific page/attribute */
+/* Inode attrs */
 # define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
 # define EXOFS_ATTR_INODE_DATA	1
 # define EXOFS_ATTR_INODE_FILE_LAYOUT	2
 # define EXOFS_ATTR_INODE_DIR_LAYOUT	3
+/* Partition attrs */
+# define EXOFS_APAGE_SB_DATA	(0xF0000000U + 3)
+# define EXOFS_ATTR_SB_STATS	1
 
 /*
  * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
  */
 enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
 struct exofs_fscb {
-	__le64  s_nextid;	/* Highest object ID used */
-	__le64  s_numfiles;	/* Number of files on fs */
+	__le64  s_nextid;	/* Only used after mkfs */
+	__le64  s_numfiles;	/* Only used after mkfs */
 	__le32	s_version;	/* == EXOFS_FSCB_VER */
 	__le16  s_magic;	/* Magic signature */
 	__le16  s_newfs;	/* Non-zero if this is a new fs */
@@ -98,6 +102,16 @@ struct exofs_fscb {
 } __packed;
 
 /*
+ * This struct is set on the FS partition's attributes.
+ * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
+ * with the create command, to atomically persist the sb writeable information.
+ */
+struct exofs_sb_stats {
+	__le64  s_nextid;	/* Highest object ID used */
+	__le64  s_numfiles;	/* Number of files on fs */
+} __packed;
+
+/*
  * Describes the raid used in the FS. It is part of the device table.
  * This here is taken from the pNFS-objects definition. In exofs we
  * use one raid policy through-out the filesystem. (NOTE: the funny
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 99fcb9126a97..c965806c2821 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
-	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
+	struct exofs_sb_stats s_ess;		/* Written often, pre-allocate*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
 	uint32_t	s_numfiles;		/* number of files on fs      */
@@ -281,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
 		    struct inode *);
 
 /* super.c               */
-int exofs_sync_fs(struct super_block *sb, int wait);
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
 
 /*********************
  * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 4c0d6bac9143..45ca323d8363 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,17 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
 static int exofs_file_fsync(struct file *filp, int datasync)
 {
 	int ret;
-	struct inode *inode = filp->f_mapping->host;
-	struct super_block *sb;
-
-	ret = sync_inode_metadata(inode, 1);
-
-	/* This is a good place to write the sb */
-	/* TODO: Sechedule an sb-sync on create */
-	sb = inode->i_sb;
-	if (sb->s_dirt)
-		exofs_sync_fs(sb, 1);
 
+	ret = sync_inode_metadata(filp->f_mapping->host, 1);
 	return ret;
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 681b3cb9b4d8..0c713cfbebf0 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1102,6 +1102,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
 	}
 	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
 }
+
 /*
  * Callback function from exofs_new_inode().  The important thing is that we
  * set the obj_created flag so that other methods know that the object exists on
@@ -1160,7 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	sbi = sb->s_fs_info;
 
 	inode->i_mapping->backing_dev_info = sb->s_bdi;
-	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1171,6 +1171,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	spin_unlock(&sbi->s_next_gen_lock);
 	insert_inode_hash(inode);
 
+	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
+
 	mark_inode_dirty(inode);
 
 	ret = exofs_get_io_state(&sbi->layout, &ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 474989eeb7d6..5eb0851e5481 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -213,6 +213,101 @@ static void destroy_inodecache(void)
 static const struct super_operations exofs_sops;
 static const struct export_operations exofs_export_ops;
 
+static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
+	EXOFS_APAGE_SB_DATA,
+	EXOFS_ATTR_SB_STATS,
+	sizeof(struct exofs_sb_stats));
+
+static int __sbi_read_stats(struct exofs_sb_info *sbi)
+{
+	struct osd_attr attrs[] = {
+		[0] = g_attr_sb_stats,
+	};
+	struct exofs_io_state *ios;
+	int ret;
+
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+		return ret;
+	}
+
+	ios->cred = sbi->s_cred;
+
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_read(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("Error reading super_block stats => %d\n", ret);
+		goto out;
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
+		goto out;
+	}
+	if (attrs[0].len) {
+		struct exofs_sb_stats *ess;
+
+		if (unlikely(attrs[0].len != sizeof(*ess))) {
+			EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
+				  "size(%d) != expected(%zd)\n",
+				  __func__, attrs[0].len, sizeof(*ess));
+			goto out;
+		}
+
+		ess = attrs[0].val_ptr;
+		sbi->s_nextid = le64_to_cpu(ess->s_nextid);
+		sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
+	}
+
+out:
+	exofs_put_io_state(ios);
+	return ret;
+}
+
+static void stats_done(struct exofs_io_state *ios, void *p)
+{
+	exofs_put_io_state(ios);
+	/* Good thanks nothing to do anymore */
+}
+
+/* Asynchronously write the stats attribute */
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
+{
+	struct osd_attr attrs[] = {
+		[0] = g_attr_sb_stats,
+	};
+	struct exofs_io_state *ios;
+	int ret;
+
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+		return ret;
+	}
+
+	sbi->s_ess.s_nextid   = cpu_to_le64(sbi->s_nextid);
+	sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
+	attrs[0].val_ptr = &sbi->s_ess;
+
+	ios->cred = sbi->s_cred;
+	ios->done = stats_done;
+	ios->private = sbi;
+	ios->out_attr = attrs;
+	ios->out_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_write(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
+		exofs_put_io_state(ios);
+	}
+
+	return ret;
+}
+
 /*
  * Write the superblock to the OSD
  */
@@ -223,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	struct exofs_io_state *ios;
 	int ret = -ENOMEM;
 
-	lock_super(sb);
+	fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
+	if (unlikely(!fscb))
+		return -ENOMEM;
+
 	sbi = sb->s_fs_info;
-	fscb = &sbi->s_fscb;
 
+	/* NOTE: We no longer dirty the super_block anywhere in exofs. The
+	 * reason we write the fscb here on unmount is so we can stay backwards
+	 * compatible with fscb->s_version == 1. (What we are not compatible
+	 * with is if a new version FS crashed and then we try to mount an old
+	 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
+	 * the writeable info is set in exofs_sbi_write_stats() above.
+	 */
 	ret = exofs_get_io_state(&sbi->layout, &ios);
-	if (ret)
+	if (unlikely(ret))
 		goto out;
 
-	/* Note: We only write the changing part of the fscb. .i.e upto the
-	 *       the fscb->s_dev_table_oid member. There is no read-modify-write
-	 *       here.
-	 */
+	lock_super(sb);
+
 	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
 	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -249,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	ios->cred = sbi->s_cred;
 
 	ret = exofs_sbi_write(ios);
-	if (unlikely(ret)) {
+	if (unlikely(ret))
 		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
-		goto out;
-	}
-	sb->s_dirt = 0;
+	else
+		sb->s_dirt = 0;
 
+
+	unlock_super(sb);
 out:
 	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
 	exofs_put_io_state(ios);
-	unlock_super(sb);
+	kfree(fscb);
 	return ret;
 }
 
@@ -302,9 +405,6 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
-	if (sb->s_dirt)
-		exofs_write_super(sb);
-
 	/* make sure there are no pending commands */
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -629,6 +729,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 
 	sb->s_magic = le16_to_cpu(fscb.s_magic);
+	/* NOTE: we read below to be backward compatible with old versions */
 	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
 	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
 
@@ -639,7 +740,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		ret = -EINVAL;
 		goto free_sbi;
 	}
-	if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+	if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
 		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
 			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
 		ret = -EINVAL;
@@ -657,6 +758,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 			goto free_sbi;
 	}
 
+	__sbi_read_stats(sbi);
+
 	/* set up operation vectors */
 	sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
 	sb->s_bdi = &sbi->bdi;
author	Boaz Harrosh <bharrosh@panasas.com>	2011-02-03 17:53:25 +0200
committer	Boaz Harrosh <bharrosh@panasas.com>	2011-03-15 15:02:51 +0200
commit	1cea312ad49d9cb964179a784fedb1fcfe396283 (patch)
tree	27c45af006b48b1a079698605ea9007398f652b5
parent	9ed96484311b89360b80a4181d856cbdb21630fd (diff)