From d04c6b88320debb403bff8d8b634a1efa48b8d3d Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:14 -0400
Subject: btrfs: make btrfs_issue_discard return bytes discarded

Initially this will just be the length argument passed to it,
but the following patches will adjust that to reflect re-alignment
and skipped blocks.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf601ed..16655bb5f293 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1883,10 +1883,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-				u64 start, u64 len)
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+			       u64 *discarded_bytes)
 {
-	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+	int ret = 0;
+
+	*discarded_bytes = 0;
+	ret = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+	if (!ret)
+		*discarded_bytes = len;
+
+	return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1907,14 +1914,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+			u64 bytes;
 			if (!stripe->dev->can_discard)
 				continue;
 
 			ret = btrfs_issue_discard(stripe->dev->bdev,
 						  stripe->physical,
-						  stripe->length);
+						  stripe->length,
+						  &bytes);
 			if (!ret)
-				discarded_bytes += stripe->length;
+				discarded_bytes += bytes;
 			else if (ret != -EOPNOTSUPP)
 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
-- 
cgit v1.2.3


From 4d89d377bbb0e34cd1571b57a984c2326cab69b5 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:15 -0400
Subject: btrfs: btrfs_issue_discard ensure offset/length are aligned to sector
 boundaries

It's possible, though unexpected, to pass unaligned offsets and lengths
to btrfs_issue_discard.  We then shift the offset/length values to sector
units.  If an unaligned offset has been passed, it will result in the
entire sector being discarded, possibly losing data.  An unaligned
length is safe but we'll end up returning an inaccurate number of
discarded bytes.

This patch aligns the offset to the 512B boundary, adjusts the length,
and warns, since we shouldn't be discarding on an offset that isn't
aligned with our sector size.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 16655bb5f293..7aa6ad1f014d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1887,12 +1887,21 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 			       u64 *discarded_bytes)
 {
 	int ret = 0;
+	u64 aligned_start = ALIGN(start, 1 << 9);
 
-	*discarded_bytes = 0;
-	ret = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
-	if (!ret)
-		*discarded_bytes = len;
+	if (WARN_ON(start != aligned_start)) {
+		len -= aligned_start - start;
+		len = round_down(len, 1 << 9);
+		start = aligned_start;
+	}
 
+	*discarded_bytes = 0;
+	if (len) {
+		ret = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+					   GFP_NOFS, 0);
+		if (!ret)
+			*discarded_bytes = len;
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From 86557861dfe4f8defde0df40620b97cc60285aa4 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:16 -0400
Subject: btrfs: skip superblocks during discard

Btrfs doesn't track superblocks with extent records so there is nothing
persistent on-disk to indicate that those blocks are in use.  We track
the superblocks in memory to ensure they don't get used by removing them
from the free space cache when we load a block group from disk.  Prior
to 47ab2a6c6a (Btrfs: remove empty block groups automatically), that
was fine since the block group would never be reclaimed so the superblock
was always safe.  Once we started removing the empty block groups, we
were protected by the fact that discards weren't being properly issued
for unused space either via FITRIM or -odiscard.  The block groups were
still being released, but the blocks remained on disk.

In order to properly discard unused block groups, we need to filter out
the superblocks from the discard range.  Superblocks are located at fixed
locations on each device, so it makes sense to filter them out in
btrfs_issue_discard, which is used by both -odiscard and FITRIM.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7aa6ad1f014d..d763457b3cce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1883,10 +1883,12 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 			       u64 *discarded_bytes)
 {
-	int ret = 0;
+	int j, ret = 0;
+	u64 bytes_left, end;
 	u64 aligned_start = ALIGN(start, 1 << 9);
 
 	if (WARN_ON(start != aligned_start)) {
@@ -1896,11 +1898,60 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 	}
 
 	*discarded_bytes = 0;
-	if (len) {
-		ret = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+
+	if (!len)
+		return 0;
+
+	end = start + len;
+	bytes_left = len;
+
+	/* Skip any superblocks on this device. */
+	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+		u64 sb_start = btrfs_sb_offset(j);
+		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+		u64 size = sb_start - start;
+
+		if (!in_range(sb_start, start, bytes_left) &&
+		    !in_range(sb_end, start, bytes_left) &&
+		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+			continue;
+
+		/*
+		 * Superblock spans beginning of range.  Adjust start and
+		 * try again.
+		 */
+		if (sb_start <= start) {
+			start += sb_end - start;
+			if (start > end) {
+				bytes_left = 0;
+				break;
+			}
+			bytes_left = end - start;
+			continue;
+		}
+
+		if (size) {
+			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+						   GFP_NOFS, 0);
+			if (!ret)
+				*discarded_bytes += size;
+			else if (ret != -EOPNOTSUPP)
+				return ret;
+		}
+
+		start = sb_end;
+		if (start > end) {
+			bytes_left = 0;
+			break;
+		}
+		bytes_left = end - start;
+	}
+
+	if (bytes_left) {
+		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
 					   GFP_NOFS, 0);
 		if (!ret)
-			*discarded_bytes = len;
+			*discarded_bytes += bytes_left;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 499f377f49f085ee4aa214c738e948e88626f39b Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:17 -0400
Subject: btrfs: iterate over unused chunk space in FITRIM

Since we now clean up block groups automatically as they become
empty, iterating over block groups is no longer sufficient to discard
unused space.

This patch iterates over the unused chunk space and discards any regions
that are unallocated, regardless of whether they were ever used.  This is
a change for btrfs but is consistent with other file systems.

We do this in a transactionless manner since the discard process can take
a substantial amount of time and a transaction would need to be started
before the acquisition of the device list lock.  That would mean a
transaction would be held open across /all/ of the discards collectively.
In order to prevent other threads from allocating or freeing chunks, we
hold the chunks lock across the search and discard calls.  We release it
between searches to allow the file system to perform more-or-less
normally.  Since the running transaction can commit and disappear while
we're using the transaction pointer, we take a reference to it and
release it after the search.  This is safe since it would happen normally
at the end of the transaction commit after any locks are released anyway.
We also take the commit_root_sem to protect against a transaction starting
and committing while we're running.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d763457b3cce..15411aefbfa0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10135,10 +10135,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 	return unpin_extent_range(root, start, end, false);
 }
 
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space.  Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time.  We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses.  For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+				   u64 minlen, u64 *trimmed)
+{
+	u64 start = 0, len = 0;
+	int ret;
+
+	*trimmed = 0;
+
+	/* Not writeable = nothing to do. */
+	if (!device->writeable)
+		return 0;
+
+	/* No free space = nothing to do. */
+	if (device->total_bytes <= device->bytes_used)
+		return 0;
+
+	ret = 0;
+
+	while (1) {
+		struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+		struct btrfs_transaction *trans;
+		u64 bytes;
+
+		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+		if (ret)
+			return ret;
+
+		down_read(&fs_info->commit_root_sem);
+
+		spin_lock(&fs_info->trans_lock);
+		trans = fs_info->running_transaction;
+		if (trans)
+			atomic_inc(&trans->use_count);
+		spin_unlock(&fs_info->trans_lock);
+
+		ret = find_free_dev_extent_start(trans, device, minlen, start,
+						 &start, &len);
+		if (trans)
+			btrfs_put_transaction(trans);
+
+		if (ret) {
+			up_read(&fs_info->commit_root_sem);
+			mutex_unlock(&fs_info->chunk_mutex);
+			if (ret == -ENOSPC)
+				ret = 0;
+			break;
+		}
+
+		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+		up_read(&fs_info->commit_root_sem);
+		mutex_unlock(&fs_info->chunk_mutex);
+
+		if (ret)
+			break;
+
+		start += len;
+		*trimmed += bytes;
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_device *device;
+	struct list_head *devices;
 	u64 group_trimmed;
 	u64 start;
 	u64 end;
@@ -10193,6 +10282,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 		cache = next_block_group(fs_info->tree_root, cache);
 	}
 
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	devices = &root->fs_info->fs_devices->alloc_list;
+	list_for_each_entry(device, devices, dev_alloc_list) {
+		ret = btrfs_trim_free_extents(device, range->minlen,
+					      &group_trimmed);
+		if (ret)
+			break;
+
+		trimmed += group_trimmed;
+	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	range->len = trimmed;
 	return ret;
 }
-- 
cgit v1.2.3


From e33e17ee1098d8d751552ac11c111e1c1a3db014 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 15 Jun 2015 09:41:19 -0400
Subject: btrfs: add missing discards when unpinning extents with -o discard

When we clear the dirty bits in btrfs_delete_unused_bgs for extents
in the empty block group, it results in btrfs_finish_extent_commit being
unable to discard the freed extents.

The block group removal patch added an alternate path to forget extents
other than btrfs_finish_extent_commit.  As a result, any extents that
would be freed when the block group is removed aren't discarded.  In my
test run, with a large copy of mixed sized files followed by removal, it
left nearly 2/3 of extents undiscarded.

To clean up the block groups, we add the removed block group onto a list
that will be discarded after transaction commit.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15411aefbfa0..6b791f394698 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6131,20 +6131,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_group_cache *block_group, *tmp;
+	struct list_head *deleted_bgs;
 	struct extent_io_tree *unpin;
 	u64 start;
 	u64 end;
 	int ret;
 
-	if (trans->aborted)
-		return 0;
-
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
 		unpin = &fs_info->freed_extents[1];
 	else
 		unpin = &fs_info->freed_extents[0];
 
-	while (1) {
+	while (!trans->aborted) {
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY, NULL);
@@ -6163,6 +6162,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 
+	/*
+	 * Transaction is finished.  We don't need the lock anymore.  We
+	 * do need to clean up the block groups in case of a transaction
+	 * abort.
+	 */
+	deleted_bgs = &trans->transaction->deleted_bgs;
+	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+		u64 trimmed = 0;
+
+		ret = -EROFS;
+		if (!trans->aborted)
+			ret = btrfs_discard_extent(root,
+						   block_group->key.objectid,
+						   block_group->key.offset,
+						   &trimmed);
+
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group_trimming(block_group);
+		btrfs_put_block_group(block_group);
+
+		if (ret) {
+			const char *errstr = btrfs_decode_error(ret);
+			btrfs_warn(fs_info,
+				   "Discard failed while removing blockgroup: errno=%d %s\n",
+				   ret, errstr);
+		}
+	}
+
 	return 0;
 }
 
@@ -9903,6 +9930,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * currently running transaction might finish and a new one start,
 	 * allowing for new block groups to be created that can reuse the same
 	 * physical device locations unless we take this special care.
+	 *
+	 * There may also be an implicit trim operation if the file system
+	 * is mounted with -odiscard. The same protections must remain
+	 * in place until the extents have been discarded completely when
+	 * the transaction commit has completed.
 	 */
 	remove_em = (atomic_read(&block_group->trimming) == 0);
 	/*
@@ -9977,6 +10009,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->unused_bgs)) {
 		u64 start, end;
+		int trimming;
 
 		block_group = list_first_entry(&fs_info->unused_bgs,
 					       struct btrfs_block_group_cache,
@@ -10076,12 +10109,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		spin_unlock(&block_group->lock);
 		spin_unlock(&space_info->lock);
 
+		/* DISCARD can flip during remount */
+		trimming = btrfs_test_opt(root, DISCARD);
+
+		/* Implicit trim during transaction commit. */
+		if (trimming)
+			btrfs_get_block_group_trimming(block_group);
+
 		/*
 		 * Btrfs_remove_chunk will abort the transaction if things go
 		 * horribly wrong.
 		 */
 		ret = btrfs_remove_chunk(trans, root,
 					 block_group->key.objectid);
+
+		if (ret) {
+			if (trimming)
+				btrfs_put_block_group_trimming(block_group);
+			goto end_trans;
+		}
+
+		/*
+		 * If we're not mounted with -odiscard, we can just forget
+		 * about this block group. Otherwise we'll need to wait
+		 * until transaction commit to do the actual discard.
+		 */
+		if (trimming) {
+			WARN_ON(!list_empty(&block_group->bg_list));
+			spin_lock(&trans->transaction->deleted_bgs_lock);
+			list_move(&block_group->bg_list,
+				  &trans->transaction->deleted_bgs);
+			spin_unlock(&trans->transaction->deleted_bgs_lock);
+			btrfs_get_block_group(block_group);
+		}
 end_trans:
 		btrfs_end_transaction(trans, root);
 next:
-- 
cgit v1.2.3