summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author jinbaohong <jinbaohong@synology.com> 2026-01-28 07:06:41 +0000
committer David Sterba <dsterba@suse.com> 2026-02-03 07:56:25 +0100
commit b291ad4458df8311626dfa0a089918f6a542d6bc (patch)
tree fdfbe0532b6445ed7aa1fb609286403bba243f53
parent bfb670b9183b0e4ba660aff2e396ec1cc01d0761 (diff)
btrfs: fix transaction commit blocking during trim of unallocated space
When trimming unallocated space, btrfs_trim_fs() holds the device_list_mutex
for the entire duration while iterating through all devices. On large
filesystems with significant unallocated space, this operation can take
minutes to hours on large storage systems.

This causes a problem because btrfs_run_dev_stats(), which is called during
transaction commit, also requires device_list_mutex:

  btrfs_trim_fs()
    mutex_lock(&fs_devices->device_list_mutex)
    list_for_each_entry(device, ...)
      btrfs_trim_free_extents(device)
    mutex_unlock(&fs_devices->device_list_mutex)

  commit_transaction()
    btrfs_run_dev_stats()
      mutex_lock(&fs_devices->device_list_mutex)  // blocked!
      ...

While trim is running, all transaction commits are blocked waiting for the
mutex.

Fix this by refactoring btrfs_trim_free_extents() to process devices in
bounded chunks (up to 2GB per iteration) and release device_list_mutex
between chunks.

Signed-off-by: robbieko <robbieko@synology.com>
Signed-off-by: jinbaohong <jinbaohong@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r-- fs/btrfs/extent-tree.c 156
-rw-r--r-- fs/btrfs/fs.h 6
2 files changed, 140 insertions, 22 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 87fd94449f11..03cf9f242c70 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6513,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
* it while performing the free space search since we have already
* held back allocations.
*/
-static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
+static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
+ u64 *trimmed, u64 pos, u64 *ret_next_pos)
{
- u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
+ u64 start = pos;
+ u64 trim_len = 0;
*trimmed = 0;
@@ -6536,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
while (1) {
struct btrfs_fs_info *fs_info = device->fs_info;
+ u64 cur_start;
+ u64 end;
+ u64 len;
u64 bytes;
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
break;
+ cur_start = start;
btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ start = max(start, cur_start);
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
@@ -6570,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
end = min(end, device->total_bytes - 1);
len = end - start + 1;
+ len = min(len, BTRFS_MAX_TRIM_LENGTH);
/* We didn't find any extents */
if (!len) {
@@ -6590,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
start += len;
*trimmed += bytes;
+ trim_len += len;
+ if (trim_len >= BTRFS_MAX_TRIM_LENGTH) {
+ *ret_next_pos = start;
+ ret = -EAGAIN;
+ break;
+ }
if (btrfs_trim_interrupted()) {
ret = -ERESTARTSYS;
@@ -6602,6 +6616,122 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
return ret;
}
+static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed,
+ u64 *dev_failed, int *dev_ret)
+{
+ struct btrfs_device *dev;
+ struct btrfs_device *working_dev = NULL;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u8 uuid[BTRFS_UUID_SIZE];
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED;
+
+ *trimmed = 0;
+ *dev_failed = 0;
+ *dev_ret = 0;
+
+ /* Start with the non-missing device that has the smallest UUID. */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!working_dev ||
+ memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
+ working_dev = dev;
+ }
+ if (working_dev)
+ memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ if (!working_dev)
+ return 0;
+
+ while (1) {
+ u64 group_trimmed = 0;
+ u64 next_pos = 0;
+ int ret = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ /* Re-find the current device under the mutex and trim one chunk of it. */
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (dev == working_dev) {
+ ret = btrfs_trim_free_extents_throttle(working_dev,
+ &group_trimmed, start, &next_pos);
+ break;
+ }
+ }
+
+ /* Throttled with progress: drop the mutex, resume this device at next_pos. */
+ if (ret == -EAGAIN && next_pos > start) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ *trimmed += group_trimmed;
+ start = next_pos;
+ cond_resched();
+ continue;
+ }
+
+ /* Interrupted: account the bytes trimmed so far and return the error. */
+ if (ret == -ERESTARTSYS || ret == -EINTR) {
+ mutex_unlock(&fs_devices->device_list_mutex);
+ *trimmed += group_trimmed;
+ return ret;
+ }
+
+ /*
+ * The device completed (ret == 0), failed, or returned -EAGAIN without
+ * advancing. Record the first error, if any, then move to the next device.
+ */
+ if (ret == -EAGAIN) {
+ /* No progress: warn and skip the device instead of retrying the same offset. */
+ btrfs_warn(fs_info,
+ "trim throttle: no progress, offset=%llu device %s, skipping",
+ start, btrfs_dev_name(working_dev));
+ (*dev_failed)++;
+ if (!*dev_ret)
+ *dev_ret = ret;
+ } else if (ret) {
+ /* The device failed with an error. */
+ (*dev_failed)++;
+ if (!*dev_ret)
+ *dev_ret = ret;
+ }
+
+ /*
+ * Find the next device: the smallest UUID larger than the current one.
+ * Devices added during the trim with a smaller UUID are skipped.
+ */
+ working_dev = NULL;
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ /* Must be larger than the current UUID. */
+ if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0)
+ continue;
+ /* Keep the smallest candidate seen so far. */
+ if (!working_dev ||
+ memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
+ working_dev = dev;
+ }
+ if (working_dev)
+ memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ *trimmed += group_trimmed;
+ start = BTRFS_DEVICE_RANGE_RESERVED;
+
+ /* No devices left to trim. */
+ if (!working_dev)
+ break;
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
/*
* Trim the whole filesystem by:
* 1) trimming the free space in each block group
@@ -6613,9 +6743,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
- struct btrfs_device *device;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
@@ -6686,24 +6814,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (ret == -ERESTARTSYS || ret == -EINTR)
return ret;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- continue;
-
- ret = btrfs_trim_free_extents(device, &group_trimmed);
-
- trimmed += group_trimmed;
- if (ret == -ERESTARTSYS || ret == -EINTR)
- break;
- if (ret) {
- dev_failed++;
- if (!dev_ret)
- dev_ret = ret;
- continue;
- }
- }
- mutex_unlock(&fs_devices->device_list_mutex);
+ ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret);
+ trimmed += group_trimmed;
if (dev_failed)
btrfs_warn(fs_info,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index d3762fbe7267..3de3b517810e 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -65,6 +65,12 @@ struct btrfs_space_info;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+/*
+ * Maximum length to trim on a device in a single iteration, to avoid holding
+ * the device list mutex for too long.
+ */
+#define BTRFS_MAX_TRIM_LENGTH SZ_2G
+
#define BTRFS_OLDEST_GENERATION 0ULL
#define BTRFS_EMPTY_DIR_SIZE 0