From dc17ff8f11d129db9e83ab7244769e4eae05e14d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:30 -0500
Subject: Btrfs: Add data=ordered support

This forces file data extents down to the disk along with the metadata that
references them.  The current implementation is fairly simple, and just
writes out all of the dirty pages in an inode before the commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 fs/btrfs/ordered-data.h

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..aaf9eb142719
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+struct btrfs_ordered_inode_tree {
+	rwlock_t lock;
+	struct rb_root tree;
+};
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+	rwlock_init(&t->lock);
+	t->tree.rb_node = NULL;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode);
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+#endif
-- 
cgit v1.2.3


From cee36a03e8f7c6e14aefd497d3acf01bcd3ef153 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 15 Jan 2008 08:40:48 -0500
Subject: Rework btrfs_drop_inode to avoid scheduling

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index aaf9eb142719..26b26212865b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -36,4 +36,5 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid);
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid);
+int btrfs_del_ordered_inode(struct inode *inode);
 #endif
-- 
cgit v1.2.3


From 4d5e74bc0aec3f54b7e429d77b7c35de042c507d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Jan 2008 16:09:22 -0500
Subject: Btrfs: Fix data=ordered vs wait_on_inode deadlock on older kernels

Using ilookup5 during data=ordered writeback could deadlock on I_LOCK.  This
saves a pointer to the inode instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 26b26212865b..f25c6771ec64 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -33,8 +33,10 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 
 int btrfs_add_ordered_inode(struct inode *inode);
 int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid);
+				       u64 *root_objectid, u64 *objectid,
+				       struct inode **inode);
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid);
+				       u64 *root_objectid, u64 *objectid,
+				       struct inode **inode);
 int btrfs_del_ordered_inode(struct inode *inode);
 #endif
-- 
cgit v1.2.3


From 81d7ed29ff6bdec903c36c26b386e16c014993b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 08:51:48 -0400
Subject: Btrfs: Throttle file_write when data=ordered is flushing the inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f25c6771ec64..29047e0abaab 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,4 +39,5 @@ int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
 int btrfs_del_ordered_inode(struct inode *inode);
+int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
-- 
cgit v1.2.3


From e1b81e6761bd8419146d4bbe0aadd8b2d348a01c Mon Sep 17 00:00:00 2001
From: Mingming <cmm@us.ibm.com>
Date: Tue, 27 May 2008 10:55:43 -0400
Subject: btrfs delete ordered inode handling fix

Use btrfs_release_file instead of a put_inode call

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 29047e0abaab..c515c4b39996 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -38,6 +38,6 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-int btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
-- 
cgit v1.2.3


From 594a24eb0e7fa8413f8b443863be4b7c72bfde9f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Fix btrfs_del_ordered_inode to allow forcing the drop during unlinks

This allows us to delete an unlinked inode with dirty pages from the list
instead of forcing commit to write these out before deleting the inode.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c515c4b39996..4fa78736423e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -38,6 +38,6 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-void btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode, int force);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
-- 
cgit v1.2.3


From e6dcd2dc9c489108648e2ed543315dd134d50a9a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:53:50 -0400
Subject: Btrfs: New data=ordered implementation

The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated and inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, the previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 71 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4fa78736423e..33292c5fe90c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -20,24 +20,73 @@
 #define __BTRFS_ORDERED_DATA__
 
 struct btrfs_ordered_inode_tree {
-	rwlock_t lock;
+	struct mutex mutex;
 	struct rb_root tree;
+	struct rb_node *last;
 };
 
+struct btrfs_sector_sum {
+	u64 offset;
+	u32 sum;
+};
+
+struct btrfs_ordered_sum {
+	u64 file_offset;
+	u64 len;
+	struct list_head list;
+	struct btrfs_sector_sum sums;
+};
+
+/* bits for the flags field */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+#define BTRFS_ORDERED_START 2 /* set when tree setup */
+
+struct btrfs_ordered_extent {
+	u64 file_offset;
+	u64 start;
+	u64 len;
+	unsigned long flags;
+	atomic_t refs;
+	struct list_head list;
+	struct inode *inode;
+	wait_queue_head_t wait;
+	struct rb_node rb_node;
+};
+
+
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
+{
+	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
+		root->sectorsize;
+	return sizeof(struct btrfs_ordered_sum) +
+		num_sectors * sizeof(struct btrfs_sector_sum);
+}
+
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
-	rwlock_init(&t->lock);
+	mutex_init(&t->mutex);
 	t->tree.rb_node = NULL;
+	t->last = NULL;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode);
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-void btrfs_del_ordered_inode(struct inode *inode, int force);
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				       u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len);
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset);
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len);
 #endif
-- 
cgit v1.2.3


From dbe674a99c8af088faa4c95eddaeb271a3140ab6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:05 -0400
Subject: Btrfs: Update on disk i_size only after pending ordered extents are
 done

This changes the ordered data code to update i_size after the extent
is on disk.  An on disk i_size is maintained in the in-memory btrfs inode
structures, and this is updated as extents finish.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 33292c5fe90c..40e9126ad954 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,4 +89,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_add_ordered_pending(struct inode *inode,
 			      struct btrfs_ordered_extent *ordered,
 			      u64 start, u64 len);
+int btrfs_ordered_update_i_size(struct inode *inode,
+				struct btrfs_ordered_extent *ordered);
 #endif
-- 
cgit v1.2.3


From ba1da2f442ec91a1534afa893f9bef7e33056ace Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:15 -0400
Subject: Btrfs: Don't pin pages in ram until the entire ordered extent is on
 disk.

Checksum items are not inserted until the entire ordered extent is on disk,
but individual pages might be clean and available for reclaim long before
the whole extent is on disk.

In order to allow those pages to be freed, we need to be able to search
the list of ordered extents to find the checksum that is going to be inserted
in the tree.  This way if the page needs to be read back in before
the checksums are in the btree, we'll be able to verify the checksum on
the page.

This commit adds the ability to search the pending ordered extents for
a given offset in the file, and changes btrfs_releasepage to allow
ordered pages to be freed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 40e9126ad954..33f0d9e91b11 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -91,4 +91,5 @@ int btrfs_add_ordered_pending(struct inode *inode,
 			      u64 start, u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
 #endif
-- 
cgit v1.2.3


From eb84ae039e10f1f80443d846ba1350122bbdc753 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 13:53:27 -0400
Subject: Btrfs: Cleanup and comment ordered-data.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 49 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 33f0d9e91b11..98f491d1022b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -19,12 +19,19 @@
 #ifndef __BTRFS_ORDERED_DATA__
 #define __BTRFS_ORDERED_DATA__
 
+/* one of these per inode */
 struct btrfs_ordered_inode_tree {
 	struct mutex mutex;
 	struct rb_root tree;
 	struct rb_node *last;
 };
 
+/*
+ * these are used to collect checksums done just before bios submission.
+ * They are attached via a list into the ordered extent, and
+ * checksum items are inserted into the tree after all the blocks in
+ * the ordered extent are on disk
+ */
 struct btrfs_sector_sum {
 	u64 offset;
 	u32 sum;
@@ -34,27 +41,56 @@ struct btrfs_ordered_sum {
 	u64 file_offset;
 	u64 len;
 	struct list_head list;
+	/* last field is a variable length array of btrfs_sector_sums */
 	struct btrfs_sector_sum sums;
 };
 
-/* bits for the flags field */
+/*
+ * bits for the flags field:
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters.  It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
 #define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+
 #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
-#define BTRFS_ORDERED_START 2 /* set when tree setup */
 
 struct btrfs_ordered_extent {
+	/* logical offset in the file */
 	u64 file_offset;
+
+	/* disk byte number */
 	u64 start;
+
+	/* length of the extent in bytes */
 	u64 len;
+
+	/* flags (described above) */
 	unsigned long flags;
+
+	/* reference count */
 	atomic_t refs;
+
+	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
-	struct inode *inode;
+
+	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
+
+	/* our friendly rbtree entry */
 	struct rb_node rb_node;
 };
 
 
+/*
+ * calculates the total size you need to allocate for an ordered sum
+ * structure spanning 'bytes' in the file
+ */
 static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
 {
 	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
@@ -81,14 +117,11 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
-void btrfs_wait_ordered_extent(struct inode *inode,
-			       struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry, int wait);
 void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
-int btrfs_add_ordered_pending(struct inode *inode,
-			      struct btrfs_ordered_extent *ordered,
-			      u64 start, u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
-- 
cgit v1.2.3


From 3edf7d33f4edb1e4a9bb0a4c0a84d95fb4d22a09 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Jul 2008 06:17:13 -0400
Subject: Btrfs: Handle data checksumming on bios that span multiple ordered
 extents

Data checksumming is done right before the bio is sent down the IO stack,
which means a single bio might span more than one ordered extent.  In
this case, the checksumming data is split between two ordered extents.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 98f491d1022b..1794efd13ca3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,7 +39,11 @@ struct btrfs_sector_sum {
 
 struct btrfs_ordered_sum {
 	u64 file_offset;
-	u64 len;
+	/*
+	 * this is the length in bytes covered by the sums array below.
+	 * But, the sums array may not be contiguous in the file.
+	 */
+	unsigned long len;
 	struct list_head list;
 	/* last field is a variable length array of btrfs_sector_sums */
 	struct btrfs_sector_sum sums;
@@ -95,6 +99,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
 {
 	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
 		root->sectorsize;
+	num_sectors++;
 	return sizeof(struct btrfs_ordered_sum) +
 		num_sectors * sizeof(struct btrfs_sector_sum);
 }
@@ -114,7 +119,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len);
-int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
+int btrfs_add_ordered_sum(struct inode *inode,
+			  struct btrfs_ordered_extent *entry,
+			  struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
-- 
cgit v1.2.3


From f421950f86bf96a11fef932e167ab2e70d4c43a0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 11:18:09 -0400
Subject: Btrfs: Fix some data=ordered related data corruptions

Stress testing was showing data checksum errors, most of which were caused
by a lookup bug in the extent_map tree.  The tree was caching the last
pointer returned, and searches would check the last pointer first.

But, search callers also expect the search to return the very first
matching extent in the range, which wasn't always true with the last
pointer usage.

For now, the code to cache the last return value is just removed.  It is
easy to fix, but I think lookups are rare enough that it isn't required anymore.

This commit also replaces do_sync_mapping_range with a local copy of the
related functions.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1794efd13ca3..8e8e3c0404f3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -132,4 +132,8 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+				       pgoff_t start, pgoff_t end);
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+			   loff_t end, int sync_mode);
 #endif
-- 
cgit v1.2.3


From ed98b56a6393c5e150fd5095b9eb7fd7d3cfb041 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 23:06:42 -0400
Subject: Btrfs: Take the csum mutex while reading checksums

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8e8e3c0404f3..36e63f1f79b3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -46,7 +46,7 @@ struct btrfs_ordered_sum {
 	unsigned long len;
 	struct list_head list;
 	/* last field is a variable length array of btrfs_sector_sums */
-	struct btrfs_sector_sum sums;
+	struct btrfs_sector_sum sums[];
 };
 
 /*
-- 
cgit v1.2.3


From 9ba4611a3a7902c6bad70c5c205de5161fcfc17b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 23 Jul 2008 09:26:26 -0400
Subject: Btrfs: Fix 32 bit compiles by using an unsigned long byte count in
 the ordered extent

The ordered extents have to fit in memory, so an unsigned long is sufficient.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 36e63f1f79b3..199cb0b4f1d9 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -95,7 +95,8 @@ struct btrfs_ordered_extent {
  * calculates the total size you need to allocate for an ordered sum
  * structure spanning 'bytes' in the file
  */
-static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+					 unsigned long bytes)
 {
 	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
 		root->sectorsize;
-- 
cgit v1.2.3


From 3eaa2885276fd6dac7b076a793932428b7168e74 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jul 2008 11:57:52 -0400
Subject: Btrfs: Fix the defragmentation code and the block relocation code for
 data=ordered

Before setting an extent to delalloc, the code needs to wait for
pending ordered extents.

Also, the relocation code needs to wait for ordered IO before scanning
the block group again.  This is because the extents are not removed
until the IO for the new extents is finished.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 199cb0b4f1d9..5efe6b63c74c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -80,6 +80,9 @@ struct btrfs_ordered_extent {
 	/* reference count */
 	atomic_t refs;
 
+	/* the inode we belong to */
+	struct inode *inode;
+
 	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
 
@@ -88,6 +91,9 @@ struct btrfs_ordered_extent {
 
 	/* our friendly rbtree entry */
 	struct rb_node rb_node;
+
+	/* a per root list of all the pending ordered extents */
+	struct list_head root_extent_list;
 };
 
 
@@ -137,4 +143,5 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From 7ea394f1192bee1af67ea4762c88ef4b7b0487a8 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 5 Aug 2008 13:05:02 -0400
Subject: Btrfs: Fix nodatacow for the new data=ordered mode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5efe6b63c74c..fd45519f30a8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -64,6 +64,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
 
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -125,7 +127,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len);
+			     u64 start, u64 len, int nocow);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
@@ -143,5 +145,5 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
-int btrfs_wait_ordered_extents(struct btrfs_root *root);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
 #endif
-- 
cgit v1.2.3


From cb843a6f513a1a91c54951005e60bd9b95bdf973 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Oct 2008 12:30:02 -0400
Subject: Btrfs: O_DIRECT writes via buffered writes + invalidate

This reworks the btrfs O_DIRECT write code a bit.  It had always fallen
back to buffered IO and done an invalidate, but needed to be updated
for the data=ordered code.  The invalidate wasn't actually removing pages
because they were still inside an ordered extent.

This also combines the O_DIRECT/O_SYNC paths where possible, and kicks
off IO in the main btrfs_file_write loop to keep the pipe down to the
disk full as we process long writes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index fd45519f30a8..f50f8870a144 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -135,7 +135,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
-- 
cgit v1.2.3


From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:59 -0400
Subject: Btrfs: Add zlib compression support

This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large
surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a
software only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread.  This makes it tricky to
spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a144..1ef464145d22 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
 	/* disk byte number */
 	u64 start;
 
-	/* length of the extent in bytes */
+	/* ram length of the extent in bytes */
 	u64 len;
 
+	/* extent length on disk */
+	u64 disk_len;
+
 	/* flags (described above) */
 	unsigned long flags;
 
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow);
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
-- 
cgit v1.2.3


From 80ff385665b7fca29fefe358a60ab0d09f9b8e87 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:20:02 -0400
Subject: Btrfs: update nodatacow code v2

This patch simplifies the nodatacow checker. If all references
were created after the latest snapshot, then we can avoid COW
safely. This patch also updates run_delalloc_nocow to do more
fine-grained checking.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ordered-data.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1ef464145d22..e6d9bc54c2b1 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -132,8 +132,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int nocow,
-			     int compressed);
+			     u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
-- 
cgit v1.2.3


From d899e05215178fed903ad0e7fc1cb4d8e0cc0a88 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:25:28 -0400
Subject: Btrfs: Add fallocate support v2 This patch updates btrfs-progs for
 fallocate support.

fallocate is a little different in Btrfs because we need to tell the
COW system that a given preallocated extent doesn't need to be
cow'd as long as there are no snapshots of it.  This leverages the
-o nodatacow checks.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ordered-data.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e6d9bc54c2b1..260bf95dfe0c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,6 +68,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
 
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -132,7 +134,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int type);
+			     u64 start, u64 len, u64 disk_len, int tyep);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
-- 
cgit v1.2.3


From d20f7043fa65659136c1a7c3c456eeeb5c6f431f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:58:54 -0500
Subject: Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/ordered-data.h')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 260bf95dfe0c..ab66d5e8d6d6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -33,15 +33,17 @@ struct btrfs_ordered_inode_tree {
  * the ordered extent are on disk
  */
 struct btrfs_sector_sum {
-	u64 offset;
+	/* bytenr on disk */
+	u64 bytenr;
 	u32 sum;
 };
 
 struct btrfs_ordered_sum {
-	u64 file_offset;
+	/* bytenr is the start of this extent on disk */
+	u64 bytenr;
+
 	/*
 	 * this is the length in bytes covered by the sums array below.
-	 * But, the sums array may not be contiguous in the file.
 	 */
 	unsigned long len;
 	struct list_head list;
@@ -147,7 +149,7 @@ struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
 int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
-- 
cgit v1.2.3