diff options
author | Josef Bacik <jbacik@fb.com> | 2014-01-23 10:54:11 -0500 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2014-01-28 13:20:26 -0800 |
commit | 0a2b2a844af616addc87cac3cc18dcaba2a9d0fb (patch) | |
tree | d81e13b3388df4a66e3a2af6ff2df82f532d5c9e /fs/btrfs/extent-tree.c | |
parent | d7df2c796d7eedd72a334dc89c65e1fec8171431 (diff) |
Btrfs: throttle delayed refs better
On one of our gluster clusters we noticed some pretty big lag spikes. This
turned out to be because our transaction commit was taking about 3 minutes to
complete. This is because we have about 30 GB of metadata, so our global
reserve would end up being the max, which is about 512 MB. So our throttling code
would allow a ridiculous number of delayed refs to build up and then they'd all
get run at transaction commit time, and for a cold mounted file system that
could take up to 3 minutes to run. So fix the throttling to be based on both
the size of the global reserve and how long it takes us to run delayed refs.
This patch tracks the time it takes to run delayed refs and then only allows 1
second's worth of outstanding delayed refs at a time. This way it will auto-tune
itself from cold cache up to when everything is in memory and it no longer has
to go to disk. This makes our transaction commits take much less time to run.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 41 |
1 files changed, 40 insertions, 1 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c77156c77de7..b5322596d60b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, &delayed_refs->href_root); spin_unlock(&delayed_refs->lock); } else { + actual_count++; ref->in_tree = 0; rb_erase(&ref->rb_node, &locked_ref->ref_root); } @@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, count++; cond_resched(); } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. 
+ */ + spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); + } return 0; } @@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_rsv *global_rsv; @@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, return ret; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be |