Btrfs: faster file extent item replace operations

When writing to a file we drop existing file extent items that cover the write range and then add a new file extent item that represents that write range. Before this change we were doing a tree lookup to remove the file extent items, and then after we did another tree lookup to insert the new file extent item. Most of the time all the file extent items we need to drop are located within a single leaf - this is the leaf where our new file extent item ends up at. Therefore, in this common case just combine these 2 operations into a single one. By avoiding the second btree navigation for insertion of the new file extent item, we reduce btree node/leaf lock acquisitions/releases, btree block/leaf COW operations, CPU time on btree node/leaf key binary searches, etc. Besides for file writes, this is an operation that happens for file fsync's as well. However log btrees are much less likely to big as big as regular fs btrees, therefore the impact of this change is smaller. The following benchmark was performed against an SSD drive and a HDD drive, both for random and sequential writes: sysbench --test=fileio --file-num=4096 --file-total-size=8G \ --file-test-mode=[rndwr|seqwr] --num-threads=512 \ --file-block-size=8192 \ --max-requests=1000000 \ --file-fsync-freq=0 --file-io-mode=sync [prepare|run] All results below are averages of 10 runs of the respective test. ** SSD sequential writes Before this change: 225.88 Mb/sec After this change: 277.26 Mb/sec ** SSD random writes Before this change: 49.91 Mb/sec After this change: 56.39 Mb/sec ** HDD sequential writes Before this change: 68.53 Mb/sec After this change: 69.87 Mb/sec ** HDD random writes Before this change: 13.04 Mb/sec After this change: 14.39 Mb/sec Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
author: Filipe David Borba Manana <fdmanana@gmail.com> 2014-01-07 11:42:27 +0000
committer: Chris Mason <clm@fb.com> 2014-01-28 13:20:23 -0800
commit: 1acae57b161ef1282f565ef907f72aeed0eb71d9 (patch)
tree: 7234dacef63e67640d780cfb82742762b5e00bfd /fs/btrfs/file.c
parent: 90515e7f5d7d24cbb2a4038a3f1b5cfa2921aa17 (diff)
1 files changed, 40 insertions, 4 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 35bf83876f3f..030012e1710c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -692,7 +692,10 @@ next:
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *drop_end, int drop_cache)
+			 u64 *drop_end, int drop_cache,
+			 int replace_extent,
+			 u32 extent_item_size,
+			 int *key_inserted)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int modify_tree = -1;
 	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 	int found = 0;
+	int leafs_visited = 0;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 				path->slots[0]--;
 		}
 		ret = 0;
+		leafs_visited++;
 next_slot:
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -744,6 +749,7 @@ next_slot:
 				ret = 0;
 				break;
 			}
+			leafs_visited++;
 			leaf = path->nodes[0];
 			recow = 1;
 		}
@@ -927,14 +933,44 @@ next_slot:
 	}
 
 	if (!ret && del_nr > 0) {
+		/*
+		 * Set path->slots[0] to first slot, so that after the delete
+		 * if items are move off from our leaf to its immediate left or
+		 * right neighbor leafs, we end up with a correct and adjusted
+		 * path->slots[0] for our insertion.
+		 */
+		path->slots[0] = del_slot;
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		if (ret)
 			btrfs_abort_transaction(trans, root, ret);
+
+		leaf = path->nodes[0];
+		/*
+		 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
+		 * is, its contents got pushed to its neighbors), in which case
+		 * it means path->locks[0] == 0
+		 */
+		if (!ret && replace_extent && leafs_visited == 1 &&
+		    path->locks[0] &&
+		    btrfs_leaf_free_space(root, leaf) >=
+		    sizeof(struct btrfs_item) + extent_item_size) {
+
+			key.objectid = ino;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = start;
+			setup_items_for_insert(root, path, &key,
+					       &extent_item_size,
+					       extent_item_size,
+					       sizeof(struct btrfs_item) +
+					       extent_item_size, 1);
+			*key_inserted = 1;
+		}
 	}
 
+	if (!replace_extent || !(*key_inserted))
+		btrfs_release_path(path);
 	if (drop_end)
 		*drop_end = found ? min(end, extent_end) : end;
-	btrfs_release_path(path);
 	return ret;
 }
 
@@ -949,7 +985,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
-				   drop_cache);
+				   drop_cache, 0, 0, NULL);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2219,7 +2255,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	while (cur_offset < lockend) {
 		ret = __btrfs_drop_extents(trans, root, inode, path,
 					   cur_offset, lockend + 1,
-					   &drop_end, 1);
+					   &drop_end, 1, 0, 0, NULL);
 		if (ret != -ENOSPC)
 			break;
author	Filipe David Borba Manana <fdmanana@gmail.com>	2014-01-07 11:42:27 +0000
committer	Chris Mason <clm@fb.com>	2014-01-28 13:20:23 -0800
commit	1acae57b161ef1282f565ef907f72aeed0eb71d9 (patch)
tree	7234dacef63e67640d780cfb82742762b5e00bfd /fs/btrfs/file.c
parent	90515e7f5d7d24cbb2a4038a3f1b5cfa2921aa17 (diff)