From 34d52cb6c50b5a43901709998f59fb1c5a43dc4a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Mar 2011 13:46:06 +0800 Subject: Btrfs: Make free space cache code generic So we can re-use the code to cache free inode numbers. The change is quite straightforward. Two new structures are introduced. - struct btrfs_free_space_ctl We move those variables that are used for caching free space from struct btrfs_block_group_cache to this new struct. - struct btrfs_free_space_op We do block group specific work (e.g. calculation of extents threshold) through functions registered in this struct. And then we can remove references to struct btrfs_block_group_cache. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 31f33ba56fe8..904eae10ec65 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -105,6 +105,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); WARN_ON(cache->reserved_pinned > 0); + kfree(cache->free_space_ctl); kfree(cache); } } @@ -4893,7 +4894,7 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, return 0; wait_event(caching_ctl->wait, block_group_cache_done(cache) || - (cache->free_space >= num_bytes)); + (cache->free_space_ctl->free_space >= num_bytes)); put_caching_control(caching_ctl); return 0; @@ -8551,10 +8552,16 @@ int btrfs_read_block_groups(struct btrfs_root *root) ret = -ENOMEM; goto error; } + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + ret = -ENOMEM; + goto error; + } atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - spin_lock_init(&cache->tree_lock); cache->fs_info = info; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -8562,14 +8569,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (need_clear) cache->disk_cache_state = BTRFS_DC_CLEAR; - /* - * we only want to have 32k of ram per block group for keeping - * track of free space, and if we pass 1/2 of that we want to - * start converting things over to using bitmaps - */ - cache->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); - read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -8580,6 +8579,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; + btrfs_init_free_space_ctl(cache); + /* * We need to exclude the super stripes now so that the space * info has super bytes accounted for, otherwise we'll think @@ -8666,6 +8667,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) return -ENOMEM; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return -ENOMEM; + } cache->key.objectid = chunk_offset; cache->key.offset = size; @@ -8673,19 +8680,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->sectorsize = root->sectorsize; cache->fs_info = root->fs_info; - /* - * we only want to have 32k of ram per block group for keeping track - * of free space, and if we pass 1/2 of that we want to start - * converting things over to using bitmaps - */ - 
cache->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - spin_lock_init(&cache->tree_lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + btrfs_init_free_space_ctl(cache); + btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); cache->flags = type; -- cgit v1.2.3 From 33345d01522f8152f99dc84a3e7a1a45707f387f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 20 Apr 2011 10:31:50 +0800 Subject: Btrfs: Always use 64bit inode number There's a potential problem on 32bit systems when we exhaust the 32bit inode numbers and start to allocate big inode numbers, because btrfs uses inode->i_ino in many places. So here we always use BTRFS_I(inode)->location.objectid, which is a u64 variable. There are two exceptions where BTRFS_I(inode)->location.objectid != inode->i_ino: the btree inode (0 vs 1) and empty subvol dirs (256 vs 2); inode->i_ino is still used in those cases. Another reason to make this change is that I'm going to use a special inode to save the free ino cache, and the inode number must be > (u64)-256. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 904eae10ec65..a0e818cb0401 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7009,8 +7009,8 @@ static noinline int get_new_locations(struct inode *reloc_inode, cur_pos = extent_key->objectid - offset; last_byte = extent_key->objectid + extent_key->offset; - ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, - cur_pos, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, + btrfs_ino(reloc_inode), cur_pos, 0); if (ret < 0) goto out; if (ret > 0) { @@ -7033,7 +7033,7 @@ static noinline int get_new_locations(struct inode *reloc_inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.offset != cur_pos || found_key.type != BTRFS_EXTENT_DATA_KEY || - found_key.objectid != reloc_inode->i_ino) + found_key.objectid != btrfs_ino(reloc_inode)) break; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -7179,7 +7179,7 @@ next: break; } - if (inode && key.objectid != inode->i_ino) { + if (inode && key.objectid != btrfs_ino(inode)) { BUG_ON(extent_locked); btrfs_release_path(root, path); mutex_unlock(&inode->i_mutex); @@ -7488,7 +7488,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root, continue; if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) continue; - if (!inode || inode->i_ino != key.objectid) { + if (!inode || btrfs_ino(inode) != key.objectid) { iput(inode); inode = btrfs_ilookup(target_root->fs_info->sb, key.objectid, target_root, 1); -- cgit v1.2.3 From 82d5902d9c681be37ffa9d70482907f9f0b7ec1f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 20 Apr 2011 10:33:24 +0800 Subject: Btrfs: Support reading/writing on disk free ino cache This is similar to block group caching. We dedicate a special inode in the fs tree to save the free ino cache. The first time we create/delete a file after mount, the free ino cache will be loaded from disk into memory. When the fs tree is committed, the cache will be written back to disk. To keep compatibility, we check the root generation against the generation of the special inode when loading the cache, so loading will fail if the btrfs filesystem was previously mounted by an older kernel.
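As a minimal sketch of that compatibility check (the helper name and the exact fields read here are illustrative assumptions, not the verbatim kernel code), the load path trusts the on-disk cache only when the generation recorded in the special inode matches the current generation of the fs tree root:

    /*
     * Minimal sketch, assuming illustrative names: a cache written by
     * an older kernel (or left stale) never had its generation updated,
     * so the mismatch below makes us ignore it and rebuild the free ino
     * cache by scanning the fs tree instead.
     */
    static int load_free_ino_cache(struct btrfs_root *root,
                                   struct inode *cache_inode)
    {
            u64 root_gen = btrfs_root_generation(&root->root_item);

            if (BTRFS_I(cache_inode)->generation != root_gen)
                    return 0;       /* don't trust it, fall back to a scan */

            /* ... read the cached free inode numbers from the inode ... */
            return 1;
    }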
Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a0e818cb0401..95ce8da63b28 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3145,7 +3145,8 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - if (root == root->fs_info->tree_root) { + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { alloc_chunk = 0; committed = 1; } -- cgit v1.2.3 From 8d413713ca744fa00cf4e05d4054d80727b84789 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 25 Apr 2011 19:43:52 -0400 Subject: Btrfs: check return value of kmalloc() Checks on the return value of kmalloc() are added in several places. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 31f33ba56fe8..cd52f7f556ef 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8059,6 +8059,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, u64 group_start = group->key.objectid; new_extents = kmalloc(sizeof(*new_extents), GFP_NOFS); + if (!new_extents) { + ret = -ENOMEM; + goto out; + } nr_extents = 1; ret = get_new_locations(reloc_inode, extent_key, -- cgit v1.2.3 From 62a45b60923a576170a1a0c309c240d9f40d193d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 20 Apr 2011 15:52:26 +0200 Subject: btrfs: make functions static when possible Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd52f7f556ef..7cdce82e03e7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -94,7 +94,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) { atomic_inc(&cache->count); } @@ -3651,8 +3651,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes) +static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; -- cgit v1.2.3 From 172ddd60a662c4d8bf2809462866ddddd6431ea5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 21 Apr 2011 00:48:27 +0200 Subject: btrfs: drop gfp parameter from alloc_extent_map Pass GFP_NOFS directly to kmem_cache_alloc. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7cdce82e03e7..6a3d53783d55 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6694,7 +6694,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, u64 start = extent_key->objectid - offset; u64 end = start + extent_key->offset - 1; - em = 
alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); BUG_ON(!em); em->start = start; -- cgit v1.2.3 From b3b4aa74b58bded927f579fff787fb6fa1c0393c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 21 Apr 2011 01:20:15 +0200 Subject: btrfs: drop unused parameter from btrfs_release_path The tree root parameter has been unused since commit 5f39d397dfbe140a14edecd4e73c34ce23c4f9ee ("Btrfs: Create extent_buffer interface for large blocksizes"). Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6a3d53783d55..a160f11465f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -379,7 +379,7 @@ again: break; caching_ctl->progress = last; - btrfs_release_path(extent_root, path); + btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); mutex_unlock(&caching_ctl->mutex); if (btrfs_transaction_in_commit(fs_info)) @@ -754,7 +754,7 @@ again: atomic_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); - btrfs_release_path(root->fs_info->extent_root, path); + btrfs_release_path(path); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); @@ -934,7 +934,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, break; } } - btrfs_release_path(root, path); + btrfs_release_path(path); if (owner < BTRFS_FIRST_FREE_OBJECTID) new_size += sizeof(*bi); @@ -1042,7 +1042,7 @@ again: return 0; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 key.type = BTRFS_EXTENT_REF_V0_KEY; - btrfs_release_path(root, path); + btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { err = ret; @@ -1080,7 +1080,7 @@ again: if (match_extent_data_ref(leaf, ref, root_objectid, owner, offset)) { if (recow) { - btrfs_release_path(root, path); + btrfs_release_path(path); goto again; } err = 0; @@ -1141,7 +1141,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, if (match_extent_data_ref(leaf, ref, root_objectid, owner, offset)) break; - btrfs_release_path(root, path); + btrfs_release_path(path); key.offset++; ret = btrfs_insert_empty_item(trans, root, path, &key, size); @@ -1167,7 +1167,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); ret = 0; fail: - btrfs_release_path(root, path); + btrfs_release_path(path); return ret; } @@ -1293,7 +1293,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, ret = -ENOENT; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (ret == -ENOENT && parent) { - btrfs_release_path(root, path); + btrfs_release_path(path); key.type = BTRFS_EXTENT_REF_V0_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) @@ -1322,7 +1322,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - btrfs_release_path(root, path); + btrfs_release_path(path); return ret; } @@ -1608,7 +1608,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, if (ret != -ENOENT) return ret; - btrfs_release_path(root, path); + btrfs_release_path(path); *ref_ret = NULL; if (owner < BTRFS_FIRST_FREE_OBJECTID) { @@ -1862,7 +1862,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, __run_delayed_extent_op(extent_op, leaf, item); btrfs_mark_buffer_dirty(leaf); - 
btrfs_release_path(path); path->reada = 1; path->leave_spinning = 1; @@ -2361,7 +2361,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, atomic_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); - btrfs_release_path(root->fs_info->extent_root, path); + btrfs_release_path(path); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); @@ -2732,7 +2732,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(extent_root, path); + btrfs_release_path(path); fail: if (ret) return ret; @@ -2785,7 +2785,7 @@ again: inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - btrfs_release_path(root, path); + btrfs_release_path(path); goto out; } @@ -2854,7 +2854,7 @@ again: out_put: iput(inode); out_free: - btrfs_release_path(root, path); + btrfs_release_path(path); out: spin_lock(&block_group->lock); block_group->disk_cache_state = dcs; @@ -4541,7 +4541,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data); BUG_ON(ret); - btrfs_release_path(extent_root, path); + btrfs_release_path(path); path->leave_spinning = 1; key.objectid = bytenr; @@ -4580,7 +4580,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, owner_objectid, 0); BUG_ON(ret < 0); - btrfs_release_path(extent_root, path); + btrfs_release_path(path); path->leave_spinning = 1; key.objectid = bytenr; @@ -4650,7 +4650,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); - btrfs_release_path(extent_root, path); + btrfs_release_path(path); if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); @@ -6480,7 +6480,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, trans->block_rsv = block_rsv; } } - btrfs_release_path(root, path); + btrfs_release_path(path); BUG_ON(err); ret = btrfs_del_root(trans, tree_root, &root->root_key); @@ -8580,7 +8580,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) memcpy(&cache->key, &found_key, sizeof(found_key)); key.objectid = found_key.objectid + found_key.offset; - btrfs_release_path(root, path); + btrfs_release_path(path); cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; @@ -8802,12 +8802,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret < 0) goto out; if (ret > 0) - btrfs_release_path(tree_root, path); + btrfs_release_path(path); if (ret == 0) { ret = btrfs_del_item(trans, tree_root, path); if (ret) goto out; - btrfs_release_path(tree_root, path); + btrfs_release_path(path); } spin_lock(&root->fs_info->block_group_cache_lock); -- cgit v1.2.3 From 8cc33e5c19bf01c7617608669be8c1b4f663eb2f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 2 May 2011 15:29:25 +0200 Subject: btrfs: Document a mutex lock/unlock sequence --- fs/btrfs/extent-tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a160f11465f8..fba1348cb2a0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -756,6 +756,10 @@ again: btrfs_release_path(path); + /* + * Mutex was contended, block until it's released and try + * again + */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); 
btrfs_put_delayed_ref(&head->node); @@ -2297,6 +2301,10 @@ again: atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); + /* + * Mutex was contended, block until it's + * released and try again + */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); @@ -2363,6 +2371,10 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_release_path(path); + /* + * Mutex was contended, block until it's released and let + * caller try again + */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); -- cgit v1.2.3 From 182608c8294b5fe90d7bbd4b026c82bf0a24b736 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 5 May 2011 13:13:16 +0200 Subject: btrfs: remove old unused commented out code Remove code which has been #if0-ed out for a very long time and does not seem to be related to current codebase anymore. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1661 +----------------------------------------------- 1 file changed, 1 insertion(+), 1660 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fba1348cb2a0..b457f195636e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2522,126 +2522,6 @@ out: return ret; } -#if 0 -int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, u32 nr_extents) -{ - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u64 root_gen; - u32 nritems; - int i; - int level; - int ret = 0; - int shared = 0; - - if (!root->ref_cows) - return 0; - - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - shared = 0; - root_gen = root->root_key.offset; - } else { - shared = 1; - root_gen = trans->transid - 1; - } - - level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); - - if (level == 0) { - struct btrfs_leaf_ref *ref; - struct btrfs_extent_info *info; - - ref = btrfs_alloc_leaf_ref(root, nr_extents); - if (!ref) { - ret = -ENOMEM; - goto out; - } - - ref->root_gen = root_gen; - ref->bytenr = buf->start; - ref->owner = btrfs_header_owner(buf); - ref->generation = btrfs_header_generation(buf); - ref->nritems = nr_extents; - info = ref->extents; - - for (i = 0; nr_extents > 0 && i < nritems; i++) { - u64 disk_bytenr; - btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(buf, i, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(buf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (disk_bytenr == 0) - continue; - - info->bytenr = disk_bytenr; - info->num_bytes = - btrfs_file_extent_disk_num_bytes(buf, fi); - info->objectid = key.objectid; - info->offset = key.offset; - info++; - } - - ret = btrfs_add_leaf_ref(root, ref, shared); - if (ret == -EEXIST && shared) { - struct btrfs_leaf_ref *old; - old = btrfs_lookup_leaf_ref(root, ref->bytenr); - BUG_ON(!old); - btrfs_remove_leaf_ref(root, old); - btrfs_free_leaf_ref(root, old); - ret = btrfs_add_leaf_ref(root, ref, shared); - } - WARN_ON(ret); - btrfs_free_leaf_ref(root, ref); - } -out: - return ret; -} - -/* when a block goes through cow, we update the reference counts of - * everything that block points to. The internal pointers of the block - * can be in just about any order, and it is likely to have clusters of - * things that are close together and clusters of things that are not. 
- * - * To help reduce the seeks that come with updating all of these reference - * counts, sort them by byte number before actual updates are done. - * - * struct refsort is used to match byte number to slot in the btree block. - * we sort based on the byte number and then use the slot to actually - * find the item. - * - * struct refsort is smaller than strcut btrfs_item and smaller than - * struct btrfs_key_ptr. Since we're currently limited to the page size - * for a btree block, there's no way for a kmalloc of refsorts for a - * single node to be bigger than a page. - */ -struct refsort { - u64 bytenr; - u32 slot; -}; - -/* - * for passing into sort() - */ -static int refsort_cmp(const void *a_void, const void *b_void) -{ - const struct refsort *a = a_void; - const struct refsort *b = b_void; - - if (a->bytenr < b->bytenr) - return -1; - if (a->bytenr > b->bytenr) - return 1; - return 0; -} -#endif - static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -3223,18 +3103,6 @@ commit_trans: goto again; } -#if 0 /* I hope we never need this code again, just in case */ - printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " - "%llu bytes_reserved, " "%llu bytes_pinned, " - "%llu bytes_readonly, %llu may use %llu total\n", - (unsigned long long)bytes, - (unsigned long long)data_sinfo->bytes_used, - (unsigned long long)data_sinfo->bytes_reserved, - (unsigned long long)data_sinfo->bytes_pinned, - (unsigned long long)data_sinfo->bytes_readonly, - (unsigned long long)data_sinfo->bytes_may_use, - (unsigned long long)data_sinfo->total_bytes); -#endif return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3867,23 +3735,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 meta_used; u64 data_used; int csum_size = btrfs_super_csum_size(&fs_info->super_copy); -#if 0 - /* - * per tree used space accounting can be inaccuracy, so we - * can't rely on it. 
- */ - spin_lock(&fs_info->extent_root->accounting_lock); - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); - spin_unlock(&fs_info->extent_root->accounting_lock); - spin_lock(&fs_info->csum_root->accounting_lock); - num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); - spin_unlock(&fs_info->csum_root->accounting_lock); - - spin_lock(&fs_info->tree_root->accounting_lock); - num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); - spin_unlock(&fs_info->tree_root->accounting_lock); -#endif sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); data_used = sinfo->bytes_used; @@ -3936,10 +3788,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } -#if 0 - printk(KERN_INFO"global block rsv size %llu reserved %llu\n", - block_rsv->size, block_rsv->reserved); -#endif + spin_unlock(&sinfo->lock); spin_unlock(&block_rsv->lock); } @@ -6596,1514 +6445,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -#if 0 -static unsigned long calc_ra(unsigned long start, unsigned long last, - unsigned long nr) -{ - return min(last, start + nr - 1); -} - -static noinline int relocate_inode_pages(struct inode *inode, u64 start, - u64 len) -{ - u64 page_start; - u64 page_end; - unsigned long first_index; - unsigned long last_index; - unsigned long i; - struct page *page; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - unsigned int total_read = 0; - unsigned int total_dirty = 0; - int ret = 0; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - - mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - - /* make sure the dirty trick played by the caller work */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); - - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - calc_ra(i, last_index, ra->ra_pages)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); - if (!page) { - ret = -ENOMEM; - goto out_unlock; - } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - ret = -EIO; - goto out_unlock; - } - } - wait_on_page_writeback(page); - - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - set_page_extent_mapped(page); - - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); - btrfs_set_extent_delalloc(inode, page_start, page_end); - - set_page_dirty(page); - total_dirty++; - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - } - -out_unlock: - kfree(ra); - mutex_unlock(&inode->i_mutex); - 
balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); - return ret; -} - -static noinline int relocate_data_extent(struct inode *reloc_inode, - struct btrfs_key *extent_key, - u64 offset) -{ - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - offset; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(); - BUG_ON(!em); - - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(reloc_inode, start, end, 0); - } - unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); - - return relocate_inode_pages(reloc_inode, start, extent_key->offset); -} - -struct btrfs_ref_path { - u64 extent_start; - u64 nodes[BTRFS_MAX_LEVEL]; - u64 root_objectid; - u64 root_generation; - u64 owner_objectid; - u32 num_refs; - int lowest_level; - int current_level; - int shared_level; - - struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; - u64 new_nodes[BTRFS_MAX_LEVEL]; -}; - -struct disk_extent { - u64 ram_bytes; - u64 disk_bytenr; - u64 disk_num_bytes; - u64 offset; - u64 num_bytes; - u8 compression; - u8 encryption; - u16 other_encoding; -}; - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) - return 1; - return 0; -} - -static noinline int __next_ref_path(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_ref_path *ref_path, - int first_time) -{ - struct extent_buffer *leaf; - struct btrfs_path *path; - struct btrfs_extent_ref *ref; - struct btrfs_key key; - struct btrfs_key found_key; - u64 bytenr; - u32 nritems; - int level; - int ret = 1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - if (first_time) { - ref_path->lowest_level = -1; - ref_path->current_level = -1; - ref_path->shared_level = -1; - goto walk_up; - } -walk_down: - level = ref_path->current_level - 1; - while (level >= -1) { - u64 parent; - if (level < ref_path->lowest_level) - break; - - if (level >= 0) - bytenr = ref_path->nodes[level]; - else - bytenr = ref_path->extent_start; - BUG_ON(bytenr == 0); - - parent = ref_path->nodes[level + 1]; - ref_path->nodes[level + 1] = 0; - ref_path->current_level = level; - BUG_ON(parent == 0); - - key.objectid = bytenr; - key.offset = parent + 1; - key.type = BTRFS_EXTENT_REF_KEY; - - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret > 0) - goto next; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid == 
bytenr && - found_key.type == BTRFS_EXTENT_REF_KEY) { - if (level < ref_path->shared_level) - ref_path->shared_level = level; - goto found; - } -next: - level--; - btrfs_release_path(extent_root, path); - cond_resched(); - } - /* reached lowest level */ - ret = 1; - goto out; -walk_up: - level = ref_path->current_level; - while (level < BTRFS_MAX_LEVEL - 1) { - u64 ref_objectid; - - if (level >= 0) - bytenr = ref_path->nodes[level]; - else - bytenr = ref_path->extent_start; - - BUG_ON(bytenr == 0); - - key.objectid = bytenr; - key.offset = 0; - key.type = BTRFS_EXTENT_REF_KEY; - - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret > 0) { - /* the extent was freed by someone */ - if (ref_path->lowest_level == level) - goto out; - btrfs_release_path(extent_root, path); - goto walk_down; - } - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != bytenr || - found_key.type != BTRFS_EXTENT_REF_KEY) { - /* the extent was freed by someone */ - if (ref_path->lowest_level == level) { - ret = 1; - goto out; - } - btrfs_release_path(extent_root, path); - goto walk_down; - } -found: - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - ref_objectid = btrfs_ref_objectid(leaf, ref); - if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { - if (first_time) { - level = (int)ref_objectid; - BUG_ON(level >= BTRFS_MAX_LEVEL); - ref_path->lowest_level = level; - ref_path->current_level = level; - ref_path->nodes[level] = bytenr; - } else { - WARN_ON(ref_objectid != level); - } - } else { - WARN_ON(level != -1); - } - first_time = 0; - - if (ref_path->lowest_level == level) { - ref_path->owner_objectid = ref_objectid; - ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); - } - - /* - * the block is tree root or the block isn't in reference - * counted tree. - */ - if (found_key.objectid == found_key.offset || - is_cowonly_root(btrfs_ref_root(leaf, ref))) { - ref_path->root_objectid = btrfs_ref_root(leaf, ref); - ref_path->root_generation = - btrfs_ref_generation(leaf, ref); - if (level < 0) { - /* special reference from the tree log */ - ref_path->nodes[0] = found_key.offset; - ref_path->current_level = 0; - } - ret = 0; - goto out; - } - - level++; - BUG_ON(ref_path->nodes[level] != 0); - ref_path->nodes[level] = found_key.offset; - ref_path->current_level = level; - - /* - * the reference was created in the running transaction, - * no need to continue walking up. - */ - if (btrfs_ref_generation(leaf, ref) == trans->transid) { - ref_path->root_objectid = btrfs_ref_root(leaf, ref); - ref_path->root_generation = - btrfs_ref_generation(leaf, ref); - ret = 0; - goto out; - } - - btrfs_release_path(extent_root, path); - cond_resched(); - } - /* reached max tree level, but no tree root found. 
*/ - BUG(); -out: - btrfs_free_path(path); - return ret; -} - -static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_ref_path *ref_path, - u64 extent_start) -{ - memset(ref_path, 0, sizeof(*ref_path)); - ref_path->extent_start = extent_start; - - return __next_ref_path(trans, extent_root, ref_path, 1); -} - -static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct btrfs_ref_path *ref_path) -{ - return __next_ref_path(trans, extent_root, ref_path, 0); -} - -static noinline int get_new_locations(struct inode *reloc_inode, - struct btrfs_key *extent_key, - u64 offset, int no_fragment, - struct disk_extent **extents, - int *nr_extents) -{ - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - struct disk_extent *exts = *extents; - struct btrfs_key found_key; - u64 cur_pos; - u64 last_byte; - u32 nritems; - int nr = 0; - int max = *nr_extents; - int ret; - - WARN_ON(!no_fragment && *extents); - if (!exts) { - max = 1; - exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); - if (!exts) - return -ENOMEM; - } - - path = btrfs_alloc_path(); - if (!path) { - if (exts != *extents) - kfree(exts); - return -ENOMEM; - } - - cur_pos = extent_key->objectid - offset; - last_byte = extent_key->objectid + extent_key->offset; - ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, - cur_pos, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.offset != cur_pos || - found_key.type != BTRFS_EXTENT_DATA_KEY || - found_key.objectid != reloc_inode->i_ino) - break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != - BTRFS_FILE_EXTENT_REG || - btrfs_file_extent_disk_bytenr(leaf, fi) == 0) - break; - - if (nr == max) { - struct disk_extent *old = exts; - max *= 2; - exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); - if (!exts) { - ret = -ENOMEM; - goto out; - } - memcpy(exts, old, sizeof(*exts) * nr); - if (old != *extents) - kfree(old); - } - - exts[nr].disk_bytenr = - btrfs_file_extent_disk_bytenr(leaf, fi); - exts[nr].disk_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, fi); - exts[nr].offset = btrfs_file_extent_offset(leaf, fi); - exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - exts[nr].compression = btrfs_file_extent_compression(leaf, fi); - exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); - exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, - fi); - BUG_ON(exts[nr].offset > 0); - BUG_ON(exts[nr].compression || exts[nr].encryption); - BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); - - cur_pos += exts[nr].num_bytes; - nr++; - - if (cur_pos + offset >= last_byte) - break; - - if (no_fragment) { - ret = 1; - goto out; - } - path->slots[0]++; - } - - BUG_ON(cur_pos + offset > last_byte); - if (cur_pos + offset < last_byte) { - ret = -ENOENT; - goto out; - } - ret = 0; -out: - btrfs_free_path(path); - if (ret) { - if (exts != *extents) - kfree(exts); - } else { - *extents = exts; - 
*nr_extents = nr; - } - return ret; -} - -static noinline int replace_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *extent_key, - struct btrfs_key *leaf_key, - struct btrfs_ref_path *ref_path, - struct disk_extent *new_extents, - int nr_extents) -{ - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; - struct btrfs_key key; - u64 lock_start = 0; - u64 lock_end = 0; - u64 num_bytes; - u64 ext_offset; - u64 search_end = (u64)-1; - u32 nritems; - int nr_scaned = 0; - int extent_locked = 0; - int extent_type; - int ret; - - memcpy(&key, leaf_key, sizeof(key)); - if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { - if (key.objectid < ref_path->owner_objectid || - (key.objectid == ref_path->owner_objectid && - key.type < BTRFS_EXTENT_DATA_KEY)) { - key.objectid = ref_path->owner_objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - } - } - - while (1) { - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); -next: - if (extent_locked && ret > 0) { - /* - * the file extent item was modified by someone - * before the extent got locked. - */ - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - extent_locked = 0; - } - - if (path->slots[0] >= nritems) { - if (++nr_scaned > 2) - break; - - BUG_ON(extent_locked); - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { - if ((key.objectid > ref_path->owner_objectid) || - (key.objectid == ref_path->owner_objectid && - key.type > BTRFS_EXTENT_DATA_KEY) || - key.offset >= search_end) - break; - } - - if (inode && key.objectid != inode->i_ino) { - BUG_ON(extent_locked); - btrfs_release_path(root, path); - mutex_unlock(&inode->i_mutex); - iput(inode); - inode = NULL; - continue; - } - - if (key.type != BTRFS_EXTENT_DATA_KEY) { - path->slots[0]++; - ret = 1; - goto next; - } - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - if ((extent_type != BTRFS_FILE_EXTENT_REG && - extent_type != BTRFS_FILE_EXTENT_PREALLOC) || - (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid)) { - path->slots[0]++; - ret = 1; - goto next; - } - - num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - ext_offset = btrfs_file_extent_offset(leaf, fi); - - if (search_end == (u64)-1) { - search_end = key.offset - ext_offset + - btrfs_file_extent_ram_bytes(leaf, fi); - } - - if (!extent_locked) { - lock_start = key.offset; - lock_end = lock_start + num_bytes - 1; - } else { - if (lock_start > key.offset || - lock_end + 1 < key.offset + num_bytes) { - unlock_extent(&BTRFS_I(inode)->io_tree, - lock_start, lock_end, GFP_NOFS); - extent_locked = 0; - } - } - - if (!inode) { - btrfs_release_path(root, path); - - inode = btrfs_iget_locked(root->fs_info->sb, - key.objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = - key.objectid; - BTRFS_I(inode)->location.type = - BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - /* - * some code call btrfs_commit_transaction while - * holding the 
i_mutex, so we can't use mutex_lock - * here. - */ - if (is_bad_inode(inode) || - !mutex_trylock(&inode->i_mutex)) { - iput(inode); - inode = NULL; - key.offset = (u64)-1; - goto skip; - } - } - - if (!extent_locked) { - struct btrfs_ordered_extent *ordered; - - btrfs_release_path(root, path); - - lock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - lock_end); - if (ordered && - ordered->file_offset <= lock_end && - ordered->file_offset + ordered->len > lock_start) { - unlock_extent(&BTRFS_I(inode)->io_tree, - lock_start, lock_end, GFP_NOFS); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - key.offset += num_bytes; - goto skip; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - - extent_locked = 1; - continue; - } - - if (nr_extents == 1) { - /* update extent pointer in place */ - btrfs_set_file_extent_disk_bytenr(leaf, fi, - new_extents[0].disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, - new_extents[0].disk_num_bytes); - btrfs_mark_buffer_dirty(leaf); - - btrfs_drop_extent_cache(inode, key.offset, - key.offset + num_bytes - 1, 0); - - ret = btrfs_inc_extent_ref(trans, root, - new_extents[0].disk_bytenr, - new_extents[0].disk_num_bytes, - leaf->start, - root->root_key.objectid, - trans->transid, - key.objectid); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, - extent_key->objectid, - extent_key->offset, - leaf->start, - btrfs_header_owner(leaf), - btrfs_header_generation(leaf), - key.objectid, 0); - BUG_ON(ret); - - btrfs_release_path(root, path); - key.offset += num_bytes; - } else { - BUG_ON(1); -#if 0 - u64 alloc_hint; - u64 extent_len; - int i; - /* - * drop old extent pointer at first, then insert the - * new pointers one bye one - */ - btrfs_release_path(root, path); - ret = btrfs_drop_extents(trans, root, inode, key.offset, - key.offset + num_bytes, - key.offset, &alloc_hint); - BUG_ON(ret); - - for (i = 0; i < nr_extents; i++) { - if (ext_offset >= new_extents[i].num_bytes) { - ext_offset -= new_extents[i].num_bytes; - continue; - } - extent_len = min(new_extents[i].num_bytes - - ext_offset, num_bytes); - - ret = btrfs_insert_empty_item(trans, root, - path, &key, - sizeof(*fi)); - BUG_ON(ret); - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, - trans->transid); - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_disk_bytenr(leaf, fi, - new_extents[i].disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, - new_extents[i].disk_num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, fi, - new_extents[i].ram_bytes); - - btrfs_set_file_extent_compression(leaf, fi, - new_extents[i].compression); - btrfs_set_file_extent_encryption(leaf, fi, - new_extents[i].encryption); - btrfs_set_file_extent_other_encoding(leaf, fi, - new_extents[i].other_encoding); - - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_len); - ext_offset += new_extents[i].offset; - btrfs_set_file_extent_offset(leaf, fi, - ext_offset); - btrfs_mark_buffer_dirty(leaf); - - btrfs_drop_extent_cache(inode, key.offset, - key.offset + extent_len - 1, 0); - - ret = btrfs_inc_extent_ref(trans, root, - new_extents[i].disk_bytenr, - new_extents[i].disk_num_bytes, - leaf->start, - root->root_key.objectid, - trans->transid, key.objectid); - BUG_ON(ret); - btrfs_release_path(root, path); - - inode_add_bytes(inode, extent_len); - - ext_offset = 0; - 
num_bytes -= extent_len; - key.offset += extent_len; - - if (num_bytes == 0) - break; - } - BUG_ON(i >= nr_extents); -#endif - } - - if (extent_locked) { - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - extent_locked = 0; - } -skip: - if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && - key.offset >= search_end) - break; - - cond_resched(); - } - ret = 0; -out: - btrfs_release_path(root, path); - if (inode) { - mutex_unlock(&inode->i_mutex); - if (extent_locked) { - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, - lock_end, GFP_NOFS); - } - iput(inode); - } - return ret; -} - -int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, u64 orig_start) -{ - int level; - int ret; - - BUG_ON(btrfs_header_generation(buf) != trans->transid); - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - - level = btrfs_header_level(buf); - if (level == 0) { - struct btrfs_leaf_ref *ref; - struct btrfs_leaf_ref *orig_ref; - - orig_ref = btrfs_lookup_leaf_ref(root, orig_start); - if (!orig_ref) - return -ENOENT; - - ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); - if (!ref) { - btrfs_free_leaf_ref(root, orig_ref); - return -ENOMEM; - } - - ref->nritems = orig_ref->nritems; - memcpy(ref->extents, orig_ref->extents, - sizeof(ref->extents[0]) * ref->nritems); - - btrfs_free_leaf_ref(root, orig_ref); - - ref->root_gen = trans->transid; - ref->bytenr = buf->start; - ref->owner = btrfs_header_owner(buf); - ref->generation = btrfs_header_generation(buf); - - ret = btrfs_add_leaf_ref(root, ref, 0); - WARN_ON(ret); - btrfs_free_leaf_ref(root, ref); - } - return 0; -} - -static noinline int invalidate_extent_cache(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_block_group_cache *group, - struct btrfs_root *target_root) -{ - struct btrfs_key key; - struct inode *inode = NULL; - struct btrfs_file_extent_item *fi; - struct extent_state *cached_state = NULL; - u64 num_bytes; - u64 skip_objectid = 0; - u32 nritems; - u32 i; - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(leaf, &key, i); - if (key.objectid == skip_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) - continue; - if (!inode || inode->i_ino != key.objectid) { - iput(inode); - inode = btrfs_ilookup(target_root->fs_info->sb, - key.objectid, target_root, 1); - } - if (!inode) { - skip_objectid = key.objectid; - continue; - } - num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset, - key.offset + num_bytes - 1, 0, &cached_state, - GFP_NOFS); - btrfs_drop_extent_cache(inode, key.offset, - key.offset + num_bytes - 1, 1); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset, - key.offset + num_bytes - 1, &cached_state, - GFP_NOFS); - cond_resched(); - } - iput(inode); - return 0; -} - -static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_block_group_cache *group, - struct inode *reloc_inode) -{ - struct btrfs_key key; - struct btrfs_key extent_key; - struct btrfs_file_extent_item *fi; - struct btrfs_leaf_ref *ref; - struct disk_extent *new_extent; - u64 bytenr; - u64 num_bytes; - u32 nritems; - u32 i; - int 
ext_index; - int nr_extent; - int ret; - - new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); - if (!new_extent) - return -ENOMEM; - - ref = btrfs_lookup_leaf_ref(root, leaf->start); - BUG_ON(!ref); - - ext_index = -1; - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(leaf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (bytenr == 0) - continue; - - ext_index++; - if (bytenr >= group->key.objectid + group->key.offset || - bytenr + num_bytes <= group->key.objectid) - continue; - - extent_key.objectid = bytenr; - extent_key.offset = num_bytes; - extent_key.type = BTRFS_EXTENT_ITEM_KEY; - nr_extent = 1; - ret = get_new_locations(reloc_inode, &extent_key, - group->key.objectid, 1, - &new_extent, &nr_extent); - if (ret > 0) - continue; - BUG_ON(ret < 0); - - BUG_ON(ref->extents[ext_index].bytenr != bytenr); - BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); - ref->extents[ext_index].bytenr = new_extent->disk_bytenr; - ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; - - btrfs_set_file_extent_disk_bytenr(leaf, fi, - new_extent->disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, - new_extent->disk_num_bytes); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_inc_extent_ref(trans, root, - new_extent->disk_bytenr, - new_extent->disk_num_bytes, - leaf->start, - root->root_key.objectid, - trans->transid, key.objectid); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, - bytenr, num_bytes, leaf->start, - btrfs_header_owner(leaf), - btrfs_header_generation(leaf), - key.objectid, 0); - BUG_ON(ret); - cond_resched(); - } - kfree(new_extent); - BUG_ON(ext_index + 1 != ref->nritems); - btrfs_free_leaf_ref(root, ref); - return 0; -} - -int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - int ret; - - if (root->reloc_root) { - reloc_root = root->reloc_root; - root->reloc_root = NULL; - list_add(&reloc_root->dead_list, - &root->fs_info->dead_reloc_roots); - - btrfs_set_root_bytenr(&reloc_root->root_item, - reloc_root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(reloc_root->node)); - memset(&reloc_root->root_item.drop_progress, 0, - sizeof(struct btrfs_disk_key)); - reloc_root->root_item.drop_level = 0; - - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &reloc_root->root_key, - &reloc_root->root_item); - BUG_ON(ret); - } - return 0; -} - -int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *reloc_root; - struct btrfs_root *prev_root = NULL; - struct list_head dead_roots; - int ret; - unsigned long nr; - - INIT_LIST_HEAD(&dead_roots); - list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); - - while (!list_empty(&dead_roots)) { - reloc_root = list_entry(dead_roots.prev, - struct btrfs_root, dead_list); - list_del_init(&reloc_root->dead_list); - - BUG_ON(reloc_root->commit_root != NULL); - while (1) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - - mutex_lock(&root->fs_info->drop_mutex); - ret = btrfs_drop_snapshot(trans, reloc_root); - if (ret != -EAGAIN) - break; - mutex_unlock(&root->fs_info->drop_mutex); - - nr = 
trans->blocks_used; - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - btrfs_btree_balance_dirty(root, nr); - } - - free_extent_buffer(reloc_root->node); - - ret = btrfs_del_root(trans, root->fs_info->tree_root, - &reloc_root->root_key); - BUG_ON(ret); - mutex_unlock(&root->fs_info->drop_mutex); - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); - btrfs_btree_balance_dirty(root, nr); - - kfree(prev_root); - prev_root = reloc_root; - } - if (prev_root) { - btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); - kfree(prev_root); - } - return 0; -} - -int btrfs_add_dead_reloc_root(struct btrfs_root *root) -{ - list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); - return 0; -} - -int btrfs_cleanup_reloc_trees(struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct btrfs_trans_handle *trans; - struct btrfs_key location; - int found; - int ret; - - mutex_lock(&root->fs_info->tree_reloc_mutex); - ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); - BUG_ON(ret); - found = !list_empty(&root->fs_info->dead_reloc_roots); - mutex_unlock(&root->fs_info->tree_reloc_mutex); - - if (found) { - trans = btrfs_start_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - } - - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - location.offset = (u64)-1; - location.type = BTRFS_ROOT_ITEM_KEY; - - reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - BUG_ON(!reloc_root); - ret = btrfs_orphan_cleanup(reloc_root); - BUG_ON(ret); - return 0; -} - -static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb; - struct btrfs_root_item *root_item; - struct btrfs_key root_key; - int ret; - - BUG_ON(!root->ref_cows); - if (root->reloc_root) - return 0; - - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - if (!root_item) - return -ENOMEM; - - ret = btrfs_copy_root(trans, root, root->commit_root, - &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); - - root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; - root_key.offset = root->root_key.objectid; - root_key.type = BTRFS_ROOT_ITEM_KEY; - - memcpy(root_item, &root->root_item, sizeof(root_item)); - btrfs_set_root_refs(root_item, 0); - btrfs_set_root_bytenr(root_item, eb->start); - btrfs_set_root_level(root_item, btrfs_header_level(eb)); - btrfs_set_root_generation(root_item, trans->transid); - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - - ret = btrfs_insert_root(trans, root->fs_info->tree_root, - &root_key, root_item); - BUG_ON(ret); - kfree(root_item); - - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); - BUG_ON(IS_ERR(reloc_root)); - reloc_root->last_trans = trans->transid; - reloc_root->commit_root = NULL; - reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; - - root->reloc_root = reloc_root; - return 0; -} - -/* - * Core function of space balance. - * - * The idea is using reloc trees to relocate tree blocks in reference - * counted roots. There is one reloc tree for each subvol, and all - * reloc trees share same root key objectid. Reloc trees are snapshots - * of the latest committed roots of subvols (root->commit_root). - * - * To relocate a tree block referenced by a subvol, there are two steps. - * COW the block through subvol's reloc tree, then update block pointer - * in the subvol to point to the new block. 
Since all reloc trees share - * same root key objectid, doing special handing for tree blocks owned - * by them is easy. Once a tree block has been COWed in one reloc tree, - * we can use the resulting new block directly when the same block is - * required to COW again through other reloc trees. By this way, relocated - * tree blocks are shared between reloc trees, so they are also shared - * between subvols. - */ -static noinline int relocate_one_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *first_key, - struct btrfs_ref_path *ref_path, - struct btrfs_block_group_cache *group, - struct inode *reloc_inode) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb = NULL; - struct btrfs_key *keys; - u64 *nodes; - int level; - int shared_level; - int lowest_level = 0; - int ret; - - if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - lowest_level = ref_path->owner_objectid; - - if (!root->ref_cows) { - path->lowest_level = lowest_level; - ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); - BUG_ON(ret < 0); - path->lowest_level = 0; - btrfs_release_path(root, path); - return 0; - } - - mutex_lock(&root->fs_info->tree_reloc_mutex); - ret = init_reloc_tree(trans, root); - BUG_ON(ret); - reloc_root = root->reloc_root; - - shared_level = ref_path->shared_level; - ref_path->shared_level = BTRFS_MAX_LEVEL - 1; - - keys = ref_path->node_keys; - nodes = ref_path->new_nodes; - memset(&keys[shared_level + 1], 0, - sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); - memset(&nodes[shared_level + 1], 0, - sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); - - if (nodes[lowest_level] == 0) { - path->lowest_level = lowest_level; - ret = btrfs_search_slot(trans, reloc_root, first_key, path, - 0, 1); - BUG_ON(ret); - for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { - eb = path->nodes[level]; - if (!eb || eb == reloc_root->node) - break; - nodes[level] = eb->start; - if (level == 0) - btrfs_item_key_to_cpu(eb, &keys[level], 0); - else - btrfs_node_key_to_cpu(eb, &keys[level], 0); - } - if (nodes[0] && - ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { - eb = path->nodes[0]; - ret = replace_extents_in_leaf(trans, reloc_root, eb, - group, reloc_inode); - BUG_ON(ret); - } - btrfs_release_path(reloc_root, path); - } else { - ret = btrfs_merge_path(trans, reloc_root, keys, nodes, - lowest_level); - BUG_ON(ret); - } - - /* - * replace tree blocks in the fs tree with tree blocks in - * the reloc tree. 
-	 */
-	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
-	BUG_ON(ret < 0);
-
-	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
-					0, 0);
-		BUG_ON(ret);
-		extent_buffer_get(path->nodes[0]);
-		eb = path->nodes[0];
-		btrfs_release_path(reloc_root, path);
-		ret = invalidate_extent_cache(reloc_root, eb, group, root);
-		BUG_ON(ret);
-		free_extent_buffer(eb);
-	}
-
-	mutex_unlock(&root->fs_info->tree_reloc_mutex);
-	path->lowest_level = 0;
-	return 0;
-}
-
-static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_path *path,
-					struct btrfs_key *first_key,
-					struct btrfs_ref_path *ref_path)
-{
-	int ret;
-
-	ret = relocate_one_path(trans, root, path, first_key,
-				ref_path, NULL, NULL);
-	BUG_ON(ret);
-
-	return 0;
-}
-
-static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *extent_root,
-				    struct btrfs_path *path,
-				    struct btrfs_key *extent_key)
-{
-	int ret;
-
-	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
-	if (ret)
-		goto out;
-	ret = btrfs_del_item(trans, extent_root, path);
-out:
-	btrfs_release_path(extent_root, path);
-	return ret;
-}
-
-static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
-						 struct btrfs_ref_path *ref_path)
-{
-	struct btrfs_key root_key;
-
-	root_key.objectid = ref_path->root_objectid;
-	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	if (is_cowonly_root(ref_path->root_objectid))
-		root_key.offset = 0;
-	else
-		root_key.offset = (u64)-1;
-
-	return btrfs_read_fs_root_no_name(fs_info, &root_key);
-}
-
-static noinline int relocate_one_extent(struct btrfs_root *extent_root,
-					struct btrfs_path *path,
-					struct btrfs_key *extent_key,
-					struct btrfs_block_group_cache *group,
-					struct inode *reloc_inode, int pass)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *found_root;
-	struct btrfs_ref_path *ref_path = NULL;
-	struct disk_extent *new_extents = NULL;
-	int nr_extents = 0;
-	int loops;
-	int ret;
-	int level;
-	struct btrfs_key first_key;
-	u64 prev_block = 0;
-
-
-	trans = btrfs_start_transaction(extent_root, 1);
-	BUG_ON(IS_ERR(trans));
-
-	if (extent_key->objectid == 0) {
-		ret = del_extent_zero(trans, extent_root, path, extent_key);
-		goto out;
-	}
-
-	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
-	if (!ref_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	for (loops = 0; ; loops++) {
-		if (loops == 0) {
-			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
-						   extent_key->objectid);
-		} else {
-			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
-		}
-		if (ret < 0)
-			goto out;
-		if (ret > 0)
-			break;
-
-		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
-		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-			continue;
-
-		found_root = read_ref_root(extent_root->fs_info, ref_path);
-		BUG_ON(!found_root);
-		/*
-		 * for reference counted tree, only process reference paths
-		 * rooted at the latest committed root.
-		 */
-		if (found_root->ref_cows &&
-		    ref_path->root_generation != found_root->root_key.offset)
-			continue;
-
-		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-			if (pass == 0) {
-				/*
-				 * copy data extents to new locations
-				 */
-				u64 group_start = group->key.objectid;
-				ret = relocate_data_extent(reloc_inode,
-							   extent_key,
-							   group_start);
-				if (ret < 0)
-					goto out;
-				break;
-			}
-			level = 0;
-		} else {
-			level = ref_path->owner_objectid;
-		}
-
-		if (prev_block != ref_path->nodes[level]) {
-			struct extent_buffer *eb;
-			u64 block_start = ref_path->nodes[level];
-			u64 block_size = btrfs_level_size(found_root, level);
-
-			eb = read_tree_block(found_root, block_start,
-					     block_size, 0);
-			if (!eb) {
-				ret = -EIO;
-				goto out;
-			}
-			btrfs_tree_lock(eb);
-			BUG_ON(level != btrfs_header_level(eb));
-
-			if (level == 0)
-				btrfs_item_key_to_cpu(eb, &first_key, 0);
-			else
-				btrfs_node_key_to_cpu(eb, &first_key, 0);
-
-			btrfs_tree_unlock(eb);
-			free_extent_buffer(eb);
-			prev_block = block_start;
-		}
-
-		mutex_lock(&extent_root->fs_info->trans_mutex);
-		btrfs_record_root_in_trans(found_root);
-		mutex_unlock(&extent_root->fs_info->trans_mutex);
-		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-			/*
-			 * try to update data extent references while
-			 * keeping metadata shared between snapshots.
-			 */
-			if (pass == 1) {
-				ret = relocate_one_path(trans, found_root,
-						path, &first_key, ref_path,
-						group, reloc_inode);
-				if (ret < 0)
-					goto out;
-				continue;
-			}
-			/*
-			 * use fallback method to process the remaining
-			 * references.
-			 */
-			if (!new_extents) {
-				u64 group_start = group->key.objectid;
-				new_extents = kmalloc(sizeof(*new_extents),
-						      GFP_NOFS);
-				if (!new_extents) {
-					ret = -ENOMEM;
-					goto out;
-				}
-				nr_extents = 1;
-				ret = get_new_locations(reloc_inode,
-							extent_key,
-							group_start, 1,
-							&new_extents,
-							&nr_extents);
-				if (ret)
-					goto out;
-			}
-			ret = replace_one_extent(trans, found_root,
-						path, extent_key,
-						&first_key, ref_path,
-						new_extents, nr_extents);
-		} else {
-			ret = relocate_tree_block(trans, found_root, path,
-						  &first_key, ref_path);
-		}
-		if (ret < 0)
-			goto out;
-	}
-	ret = 0;
-out:
-	btrfs_end_transaction(trans, extent_root);
-	kfree(new_extents);
-	kfree(ref_path);
-	return ret;
-}
-#endif
-
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices;
--
cgit v1.2.3

From 1aba86d67f340a8001d67183ec32e8a62e3ec658 Mon Sep 17 00:00:00 2001
From: liubo
Date: Fri, 8 Apr 2011 08:44:37 +0000
Subject: Btrfs: fix easily get into ENOSPC in mixed case

When a btrfs disk is created with the mixed data & metadata option, it
will have no pure data or pure metadata space info.

In btrfs's for-linus branch, commit 78b1ea13838039cd88afdd62519b40b344d6c920
(Btrfs: fix OOPS of empty filesystem after balance) initializes space
infos at the very beginning.

The problem is that this initialization does not take the mixed case into
account, which causes btrfs to easily get into ENOSPC in the mixed case.
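To make the intent concrete, here is a minimal userspace sketch (not the
kernel code; the flag values below are invented placeholders, not the real
on-disk constants) of the decision the fixed btrfs_init_space_info() makes:

    /* Sketch of the mixed/non-mixed space-info setup decision. */
    #include <stdio.h>
    #include <stdint.h>

    #define FEAT_MIXED_GROUPS (1u << 0)  /* stands in for the incompat bit */

    static void init_space_infos(uint32_t incompat_flags)
    {
        printf("space info: SYSTEM\n");
        if (incompat_flags & FEAT_MIXED_GROUPS) {
            /* one combined info: data and metadata share the same pool */
            printf("space info: METADATA|DATA (mixed)\n");
        } else {
            printf("space info: METADATA\n");
            printf("space info: DATA\n");
        }
    }

    int main(void)
    {
        init_space_infos(0);                  /* regular filesystem */
        init_space_infos(FEAT_MIXED_GROUPS);  /* mkfs.btrfs --mixed */
        return 0;
    }

The point of the fix is visible in the branch: on a mixed filesystem there
must never be separate pure-data and pure-metadata space infos, or later
accounting looks at the wrong (empty) pool and reports ENOSPC.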
Signed-off-by: Liu Bo
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd52f7f556ef..9ee6bd55e16c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8856,23 +8856,38 @@ out:
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *space_info;
+	struct btrfs_super_block *disk_super;
+	u64 features;
+	u64 flags;
+	int mixed = 0;
 	int ret;

-	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0,
-				&space_info);
-	if (ret)
-		return ret;
+	disk_super = &fs_info->super_copy;
+	if (!btrfs_super_root(disk_super))
+		return 1;

-	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0,
-				&space_info);
-	if (ret)
-		return ret;
+	features = btrfs_super_incompat_flags(disk_super);
+	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = 1;

-	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0,
-				&space_info);
+	flags = BTRFS_BLOCK_GROUP_SYSTEM;
+	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
 	if (ret)
-		return ret;
+		goto out;

+	if (mixed) {
+		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+	} else {
+		flags = BTRFS_BLOCK_GROUP_METADATA;
+		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+		if (ret)
+			goto out;
+
+		flags = BTRFS_BLOCK_GROUP_DATA;
+		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+	}
+out:
 	return ret;
 }
--
cgit v1.2.3

From 16cdcec736cd214350cdb591bf1091f8beedefa0 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Fri, 22 Apr 2011 18:12:22 +0800
Subject: btrfs: implement delayed inode items operation

Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into
  the root's radix tree, and letting btrfs inodes go.

Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which was spotted
  by Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix the deadlock between readdir() and memory fault, which was reported
  by Itaru Kitayama.

Changelog V3 -> V4:
- Fix the nested lock, which was reported by Itaru Kitayama, by updating
  the space cache inode in time.

Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does the
  delayed items balance, which was reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason.

Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
  which are created for every directory and file, and used to manage the
  delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.

Compared with ext3/4, the performance of file creation and deletion on
btrfs is very poor. The reason is that btrfs must do a lot of b+ tree
insertions, such as inode item, directory name item, directory name index
and so on. If we can do some delayed b+ tree insertions or deletions, we
can improve the performance, so we made this patch, which implements
delayed directory name index insertion/deletion and delayed inode updates.

Implementation:
- introduce a delayed root object into the filesystem, which uses two lists
  to manage the delayed nodes that are created for every file/directory.
  One list is used to manage all the delayed nodes that have delayed items.
  And the other is used to manage the delayed nodes that are waiting to be
  dealt with by the work thread.
- Every delayed node has two rb-trees: one is used to manage the directory
  name index items that are going to be inserted into the b+ tree, and the
  other is used to manage the directory name index items that are going to
  be deleted from the b+ tree.
- introduce a worker to deal with the delayed operations. This worker
  handles the delayed directory name index item insertions and deletions
  and the delayed inode updates. When the number of delayed items exceeds
  the lower limit, we create work items for some delayed nodes, insert them
  into the work queue of the worker, and then go back. When the number of
  delayed items exceeds the upper bound, we create work items for all the
  delayed nodes that haven't been dealt with, insert them into the work
  queue of the worker, and then wait until the number of untreated items
  drops below some threshold value.
- When we want to insert a directory name index into the b+ tree, we just
  add the information to the delayed insertion rb-tree. Then we check the
  number of delayed items and do the delayed item balance. (The balance
  policy is described above.)
- When we want to delete a directory name index from the b+ tree, we first
  search for it in the insertion rb-tree. If we find it, we just drop it.
  If not, we add its key to the delayed deletion rb-tree. As with the
  insertion rb-tree, we also check the number of delayed items and do the
  delayed item balance. (The same as the insertion case.)
- When we want to update the metadata of some inode, we cache the inode's
  data in its delayed node. The worker will flush it into the b+ tree after
  dealing with the delayed insertions and deletions.
- We move the delayed node to the tail of the list after we access it.
  This way, we can cache more delayed items and merge more inode updates.
- When we commit the transaction, we deal with all the delayed nodes.
- The delayed node is freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index
  items and the delayed inode update.

I did a quick test with the benchmark tool [1] and found that we can
improve the performance of file creation by ~15%, and file deletion
by ~20%.

Before applying this patch:
  Create files:
    Total files: 50000
    Total time: 1.096108
    Average time: 0.000022
  Delete files:
    Total files: 50000
    Total time: 1.510403
    Average time: 0.000030

After applying this patch:
  Create files:
    Total files: 50000
    Total time: 0.932899
    Average time: 0.000019
  Delete files:
    Total files: 50000
    Total time: 1.215732
    Average time: 0.000024

[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3

Many thanks to Kitayama-san for his help!
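The deletion shortcut is the interesting part: a delete first looks in the
pending-insertion set, and a not-yet-flushed insertion simply cancels out.
A toy userspace model of that bookkeeping follows (plain arrays stand in
for the two per-node rb-trees, purely for brevity; this is not how the
kernel stores them):

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_ITEMS 16

    struct delayed_node {
        uint64_t ins[MAX_ITEMS];  /* dir index items waiting for insertion */
        int nr_ins;
        uint64_t del[MAX_ITEMS];  /* dir index keys waiting for deletion */
        int nr_del;
    };

    static void delay_insert(struct delayed_node *n, uint64_t index)
    {
        n->ins[n->nr_ins++] = index;
    }

    static void delay_delete(struct delayed_node *n, uint64_t index)
    {
        /* search the insertion set first; if found, the two ops cancel */
        for (int i = 0; i < n->nr_ins; i++) {
            if (n->ins[i] == index) {
                n->ins[i] = n->ins[--n->nr_ins];
                return;
            }
        }
        /* otherwise record the key for the worker to delete from the tree */
        n->del[n->nr_del++] = index;
    }

    int main(void)
    {
        struct delayed_node node = { .nr_ins = 0, .nr_del = 0 };

        delay_insert(&node, 2);
        delay_insert(&node, 3);
        delay_delete(&node, 3);  /* cancels the pending insertion */
        delay_delete(&node, 7);  /* goes to the deletion set */
        printf("pending insertions: %d, pending deletions: %d\n",
               node.nr_ins, node.nr_del);  /* prints 1 and 1 */
        return 0;
    }

A create-then-unlink pair that never hit the commit therefore costs no
b+ tree operation at all, which is where much of the speedup comes from.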
Signed-off-by: Miao Xie
Reviewed-by: David Sterba
Tested-by: Tsutomu Itoh
Tested-by: Itaru Kitayama
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ee6bd55e16c..7b0433866f36 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3973,12 +3973,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 }

-static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
-{
-	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
-		3 * num_items;
-}
-
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 int num_items)
@@ -3989,7 +3983,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 	if (num_items == 0 || root->fs_info->chunk_root == root)
 		return 0;

-	num_bytes = calc_trans_metadata_size(root, num_items);
+	num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
 	ret = btrfs_block_rsv_add(trans, root,
 				  &root->fs_info->trans_block_rsv, num_bytes);
 	if (!ret) {
@@ -4028,14 +4022,14 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 	 * If all of the metadata space is used, we can commit
 	 * transaction and use space it freed.
 	 */
-	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }

 void btrfs_orphan_release_metadata(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
@@ -4049,7 +4043,7 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	 * two for root back/forward refs, two for directory entries
 	 * and one for root of the snapshot.
 	 */
-	u64 num_bytes = calc_trans_metadata_size(root, 5);
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
 	dst_rsv->space_info = src_rsv->space_info;
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
@@ -4078,7 +4072,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)

 	if (nr_extents > reserved_extents) {
 		nr_extents -= reserved_extents;
-		to_reserve = calc_trans_metadata_size(root, nr_extents);
+		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	} else {
 		nr_extents = 0;
 		to_reserve = 0;
@@ -4132,7 +4126,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	to_free = calc_csum_metadata_size(inode, num_bytes);

 	if (nr_extents > 0)
-		to_free += calc_trans_metadata_size(root, nr_extents);
+		to_free += btrfs_calc_trans_metadata_size(root, nr_extents);

 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
--
cgit v1.2.3

From 7a7eaa40a39bde4eefc91aadeb1ce3dc4e6a1252 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 13 Apr 2011 12:54:33 -0400
Subject: Btrfs: take away the num_items argument from btrfs_join_transaction

I keep forgetting that btrfs_join_transaction() just ignores the num_items
argument, which leads me to send pointless patches and look stupid :). So
just kill the num_items argument from btrfs_join_transaction and
btrfs_start_ioctl_transaction, since neither of them uses it.
Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ee6bd55e16c..941b28e78931 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3174,7 +3174,7 @@ again:
 		spin_unlock(&data_sinfo->lock);
alloc:
 		alloc_target = btrfs_get_alloc_profile(root, 1);
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
@@ -3202,7 +3202,7 @@ alloc:
commit_trans:
 		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
-			trans = btrfs_join_transaction(root, 1);
+			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
 			ret = btrfs_commit_transaction(trans, root);
@@ -3589,7 +3589,7 @@ again:
 		goto out;

 	ret = -ENOSPC;
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		goto out;
 	ret = btrfs_commit_transaction(trans, root);
@@ -3816,7 +3816,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	if (trans)
 		return -EAGAIN;

-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));
 	ret = btrfs_commit_transaction(trans, root);
 	return 0;
@@ -7649,7 +7649,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
 	BUG_ON(reloc_root->commit_root != NULL);
 	while (1) {
-		trans = btrfs_join_transaction(root, 1);
+		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));

 		mutex_lock(&root->fs_info->drop_mutex);
@@ -8176,7 +8176,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,

 	BUG_ON(cache->ro);

-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_join_transaction(root);
 	BUG_ON(IS_ERR(trans));

 	alloc_flags = update_block_group_flags(root, cache->flags);
--
cgit v1.2.3

From a4abeea41adfa3c143c289045f4625dfaeba2212 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 11 Apr 2011 17:25:13 -0400
Subject: Btrfs: kill trans_mutex

We use trans_mutex for lots of things; here's a basic list:

1) To serialize trans_handles joining the currently running transaction
2) To make sure that no new trans handles are started while we are
   committing
3) To protect the dead_roots list and the transaction lists

Really, serializing the joining of trans_handles is not too hard, and it
mostly comes down to acquiring a reference to the transaction. So replace
the trans_mutex with a trans_lock spinlock and use it to do the following:

1) Protect fs_info->running_transaction. All trans handles have to do is
   check this, and then take a reference to the transaction and keep on
   going.
2) Protect the fs_info->trans_list. This doesn't get used too much;
   basically it just holds the current transactions, which will usually
   just be the currently committing transaction and the currently running
   transaction at most.
3) Protect the dead roots list. This is only ever processed by splicing
   the list, so this is relatively simple.
4) Protect the fs_info->reloc_ctl stuff. This is very lightweight and was
   using the trans_mutex before, so this is a pretty straightforward
   change.
5) Protect fs_info->no_trans_join. Because we don't hold the trans_lock
   over the entirety of the commit, we need to have a way to block new
   people from creating a new transaction while we're doing our work. So
   we set no_trans_join and in join_transaction we test to see if that is
   set, and if it is we do a wait_on_commit.
6) Make the transaction use count atomic so we don't need to take locks to
   modify it when we're dropping references.
7) Add a commit_lock to the transaction to make sure multiple people
   trying to commit the same transaction don't race and commit at the same
   time.
8) Make open_ioctl_trans an atomic so we don't have to take any locks for
   ioctl trans.

I have tested this with xfstests, but obviously it is a pretty hairy
change, so lots of testing is greatly appreciated. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 941b28e78931..ca599654ce19 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3200,7 +3200,8 @@ alloc:

 		/* commit the current transaction and try again */
commit_trans:
-		if (!committed && !root->fs_info->open_ioctl_trans) {
+		if (!committed &&
+		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
 			committed = 1;
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
--
cgit v1.2.3

From fcb80c2affd63237cff5b34cba5756be7c976a5a Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 3 May 2011 10:40:22 -0400
Subject: Btrfs: fix how we do space reservation for truncate

The ceph guys keep running into problems where we have space reserved in
our orphan block rsv when freeing it up. This is because they tend to do
snapshots a lot, so their truncates tend to use a bunch of space, and when
we go to do things like update the inode we have to steal reservation
space in order to make the reservation happen. This happens because
truncate can use as much space as it freaking feels like, but we still
have to hold space for removing the orphan item and updating the inode,
which will definitely always happen. So in order to fix this we need to
split all of the reservation stuff up. So with this patch we have:

1) The orphan block reserve, which only holds the space for deleting our
   orphan item when everything is over.
2) The truncate block reserve, which gets allocated and used specifically
   for the space that the truncate will use on a per-truncate basis.
3) The transaction will always have 1 item's worth of data reserved so we
   can update the inode normally.

Hopefully this will make the ceph problem go away. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ca599654ce19..a2ca561c70f0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3980,6 +3980,37 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
 		3 * num_items;
 }

+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_block_rsv *rsv)
+{
+	struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
+	u64 num_bytes;
+	int ret;
+
+	/*
+	 * Truncate should be freeing data, but give us 2 items just in case it
+	 * needs to use some space. We may want to be smarter about this in the
+	 * future.
+	 */
+	num_bytes = calc_trans_metadata_size(root, 2);
+
+	/* We already have enough bytes, just return */
+	if (rsv->reserved >= num_bytes)
+		return 0;
+
+	num_bytes -= rsv->reserved;
+
+	/*
+	 * You should have reserved enough space before hand to do this, so this
+	 * should not fail.
+	 */
+	ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
+	BUG_ON(ret);
+
+	return 0;
+}
+
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 int num_items)
@@ -4020,23 +4051,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;

 	/*
-	 * one for deleting orphan item, one for updating inode and
-	 * two for calling btrfs_truncate_inode_items.
-	 *
-	 * btrfs_truncate_inode_items is a delete operation, it frees
-	 * more space than it uses in most cases. So two units of
-	 * metadata space should be enough for calling it many times.
-	 * If all of the metadata space is used, we can commit
-	 * transaction and use space it freed.
+	 * We need to hold space in order to delete our orphan item once we've
+	 * added it, so this takes the reservation so we can release it later
+	 * when we are truly done with the orphan item.
 	 */
-	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	u64 num_bytes = calc_trans_metadata_size(root, 1);
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }

 void btrfs_orphan_release_metadata(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	u64 num_bytes = calc_trans_metadata_size(root, 1);
 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
--
cgit v1.2.3

From d82a6f1d7e8b61ed5996334d0db66651bb43641d Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 11 May 2011 15:26:06 -0400
Subject: Btrfs: kill BTRFS_I(inode)->block_group

Originally this was going to be used as a way to give hints to the
allocator, but frankly we can get much better hints elsewhere, and it's
not even used at all for anything useful. In addition to being completely
useless, when we initialize an inode we try to find a freeish block group
to set as the inode's block group, and with a completely full 40GB fs this
takes _forever_, so I imagine with, say, a 1TB fs this is just unbearable.
So just axe the thing altogether; we don't need it, and it saves us 8
bytes in the inode and saves us 500 microseconds per inode lookup in my
testcase. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a2ca561c70f0..9f0a4e3bd8a9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5319,6 +5319,7 @@ checks:
 			btrfs_add_free_space(block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
+		btrfs_put_block_group(block_group);
 		break;
loop:
 		failed_cluster_refill = false;
@@ -5411,14 +5412,7 @@ loop:
 		ret = -ENOSPC;
 	} else if (!ins->objectid) {
 		ret = -ENOSPC;
-	}
-
-	/* we found what we needed */
-	if (ins->objectid) {
-		if (!(data & BTRFS_BLOCK_GROUP_DATA))
-			trans->block_group = block_group->key.objectid;
-
-		btrfs_put_block_group(block_group);
+	} else if (ins->objectid) {
 		ret = 0;
 	}
--
cgit v1.2.3

From 589d8ade83f07c0f11c8191c0ca309f34d7a2c14 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 11 May 2011 17:30:53 -0400
Subject: Btrfs: try not to sleep as much when doing slow caching

When the fs is super full and we unmount the fs, we could get stuck in
this thing where unmount is waiting for the caching kthread to make
progress and the caching kthread keeps scheduling because we're in the
middle of a commit. So instead just let the caching kthread keep going and
only yield if need_resched().
This makes my horrible umount case go from taking up to 10 minutes to
taking less than 20 seconds. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f0a4e3bd8a9..96be62450318 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -378,15 +378,18 @@ again:
 		if (ret)
 			break;

-		caching_ctl->progress = last;
-		btrfs_release_path(extent_root, path);
-		up_read(&fs_info->extent_commit_sem);
-		mutex_unlock(&caching_ctl->mutex);
-		if (btrfs_transaction_in_commit(fs_info))
-			schedule_timeout(1);
-		else
+		if (need_resched() ||
+		    btrfs_next_leaf(extent_root, path)) {
+			caching_ctl->progress = last;
+			btrfs_release_path(extent_root, path);
+			up_read(&fs_info->extent_commit_sem);
+			mutex_unlock(&caching_ctl->mutex);
 			cond_resched();
-		goto again;
+			goto again;
+		}
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		continue;
 	}

 	if (key.objectid < block_group->key.objectid) {
--
cgit v1.2.3

From 026fd317828500524cdc7e5ff9e8e7923abb2868 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 13 May 2011 10:32:11 -0400
Subject: Btrfs: don't always do readahead

Our readahead is sort of sloppy, and really isn't always needed. For
example, if ls is doing a stat-ing ls (which is the default), it's going
to stat in non-disk order, so if, say, you have a directory with a stupid
number of files, readahead is going to do nothing but waste time in the
case of doing the stat. Taking the unconditional readahead out made my
test go from 57 minutes to 36 minutes. This means that everywhere we do
loop through the tree, we want to make sure we do set path->reada
properly, so I went through and found all of the places where we loop
through the path and set reada to 1. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 96be62450318..1ba2cc58eab5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -347,7 +347,7 @@ static int caching_kthread(void *data)
 	 */
 	path->skip_locking = 1;
 	path->search_commit_root = 1;
-	path->reada = 2;
+	path->reada = 1;

 	key.objectid = last;
 	key.offset = 0;
@@ -8556,6 +8556,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = 1;

 	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
 	if (cache_gen != 0 &&
--
cgit v1.2.3

From cca1c81f43e26ab60c0d1090fb90992358d69bdf Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 13 May 2011 11:07:12 -0400
Subject: Btrfs: don't try to allocate from a block group that doesn't have
 enough space

If we have a very large filesystem, we can spend a lot of time in
find_free_extent just trying to allocate from empty block groups. So
instead check to see if the block group even has enough space for the
allocation, and if not, go on to the next block group.
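A rough userspace sketch of that early rejection check (not the kernel
code; the struct and the locking are simplified stand-ins):

    #include <stdio.h>
    #include <stdint.h>

    struct block_group {
        uint64_t free_space;
        int cached;   /* free-space accounting is trustworthy */
    };

    static int worth_searching(const struct block_group *bg,
                               uint64_t num_bytes, uint64_t empty_size)
    {
        /* in the kernel this test runs under the group's tree_lock */
        if (bg->cached && bg->free_space < num_bytes + empty_size)
            return 0;   /* can't possibly fit, skip to the next group */
        return 1;
    }

    int main(void)
    {
        struct block_group groups[] = {
            { .free_space = 4096,      .cached = 1 },
            { .free_space = 1 << 20,   .cached = 1 },
            { .free_space = 0,         .cached = 0 },  /* unknown: search */
        };

        for (int i = 0; i < 3; i++)
            printf("group %d: %s\n", i,
                   worth_searching(&groups[i], 64 * 1024, 0) ?
                   "search" : "skip");
        return 0;
    }

Note the uncached case: if the group's caching hasn't finished, the free
space figure can't be trusted, so the group still has to be searched.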
Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1ba2cc58eab5..c8c318494dee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5159,6 +5159,14 @@ have_block_group:
 		if (unlikely(block_group->ro))
 			goto loop;

+		spin_lock(&block_group->tree_lock);
+		if (cached &&
+		    block_group->free_space < num_bytes + empty_size) {
+			spin_unlock(&block_group->tree_lock);
+			goto loop;
+		}
+		spin_unlock(&block_group->tree_lock);
+
 		/*
 		 * Ok we want to try and use the cluster allocator, so lets look
 		 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
--
cgit v1.2.3

From c4f675cd40d955d539180506c09515c90169b15b Mon Sep 17 00:00:00 2001
From: Sergei Trofimovich
Date: Fri, 20 May 2011 20:20:30 +0000
Subject: btrfs: don't spin in shrink_delalloc if there is nothing to free

Observed as a large delay when a --mixed filesystem is filled up.
Test example:

1. create a tiny --mixed FS:

   $ dd if=/dev/zero of=2G.img seek=$((2048 * 1024 * 1024 - 1)) count=1 bs=1
   $ mkfs.btrfs --mixed 2G.img
   $ mount -oloop 2G.img /mnt/ut/

2. Try to fill it up:

   $ dd if=/dev/urandom of=10M.file bs=10240 count=1024
   $ seq 1 256 | while read file_no; do echo $file_no; time cp 10M.file ${file_no}.copy; done

Up to '200.copy' it goes fast, but when the disk fills up, every -ENOSPC
takes 3 seconds to pop up (and in usermode linux it's even more: 30-60
seconds!). (The time may depend on the kernel's timer resolution.)

No IO, no CPU load, just rescheduling. Some debugging revealed busy
spinning in shrink_delalloc.

Signed-off-by: Sergei Trofimovich
Reviewed-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ee6bd55e16c..9f5fdd37451d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3425,6 +3425,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	if (reserved == 0)
 		return 0;

+	/* nothing to shrink - nothing to reclaim */
+	if (root->fs_info->delalloc_bytes == 0)
+		return 0;
+
 	max_reclaim = min(reserved, to_reclaim);

 	while (loops < 1024) {
--
cgit v1.2.3

From 1cd307990d6e2b4965620e339a92e0d7ae853e13 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh
Date: Thu, 19 May 2011 05:19:08 +0000
Subject: Btrfs: BUG_ON is deleted from the caller of btrfs_truncate_item &
 btrfs_extend_item

Currently, btrfs_truncate_item and btrfs_extend_item return only 0, so
the BUG_ON checks in the callers are unnecessary.
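A toy illustration of the cleanup pattern (not the btrfs code itself):
once a helper can only return 0, the caller's error check is dead code
and can simply go.

    static int extend_item(void)
    {
        /* work that, after earlier refactoring, can no longer fail */
        return 0;
    }

    int main(void)
    {
        /* before: int ret = extend_item(); BUG_ON(ret); */
        extend_item();  /* after: the return value carries no information */
        return 0;
    }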
Signed-off-by: Tsutomu Itoh
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9f5fdd37451d..103e141afeb3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -947,7 +947,6 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);

 		ret = btrfs_extend_item(trans, root, path, new_size);
-		BUG_ON(ret);

 		leaf = path->nodes[0];
 		item = btrfs_item_ptr(leaf, path->slots[0],
 				      struct btrfs_extent_item);
@@ -1555,7 +1554,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	size = btrfs_extent_inline_ref_size(type);

 	ret = btrfs_extend_item(trans, root, path, size);
-	BUG_ON(ret);

 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
@@ -1684,7 +1682,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
 			      end - ptr - size);
 		item_size -= size;
 		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
-		BUG_ON(ret);
 	}
 	btrfs_mark_buffer_dirty(leaf);
 	return 0;
--
cgit v1.2.3

From 7841cb2898f66a73062c64d0ef5733dde7279e46 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 31 May 2011 18:07:27 +0200
Subject: btrfs: add helper for fs_info->closing

Wrap the checking of the filesystem 'closing' flag and fix a few missing
memory barriers.

Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c9173a7827b0..5b9b6b6df242 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -366,8 +366,7 @@ again:
 	nritems = btrfs_header_nritems(leaf);

 	while (1) {
-		smp_mb();
-		if (fs_info->closing > 1) {
+		if (btrfs_fs_closing(fs_info) > 1) {
 			last = (u64)-1;
 			break;
 		}
--
cgit v1.2.3

From f2bb8f5cfb3bce595b2de251ed7638047fc4e530 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 25 May 2011 13:10:16 -0400
Subject: Btrfs: don't commit the transaction if we dont have enough pinned
 bytes

I noticed when running an enospc test that we would get stuck committing
the transaction in check_data_space even though we truly didn't have
enough space. So check to see if bytes_pinned is bigger than num_bytes;
if it's not, don't commit the transaction. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5b9b6b6df242..0d0a3fe77bb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3089,6 +3089,13 @@ alloc:
 			}
 			goto again;
 		}
+
+		/*
+		 * If we have less pinned bytes than we want to allocate then
+		 * don't bother committing the transaction, it won't help us.
+		 */
+		if (data_sinfo->bytes_pinned < bytes)
+			committed = 1;
 		spin_unlock(&data_sinfo->lock);

 		/* commit the current transaction and try again */
--
cgit v1.2.3

From 723bda2083d44edbd6be0f0b09f902120dc07442 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 27 May 2011 16:11:38 -0400
Subject: Btrfs: fix the allocator loop logic

I was testing with empty_cluster = 0 to try and reproduce a problem and
kept hitting early enospc panics. This was because our loop logic was a
little confused. So this is what I did:

1) Make the loop variable the ultimate decider on whether we should loop
   again, instead of checking to see if we had an uncached bg, empty size
   or empty cluster.
2) Increment loop before checking to see what we are on, to make the loop
   definitions make more sense.
3) If we are on the chunk alloc loop, don't set empty_size/empty_cluster
   to 0 unless we didn't actually allocate a chunk. If we did allocate a
   chunk, we should be able to easily set up a new cluster, so clearing
   empty_size/empty_cluster makes us less efficient.

This kept me from hitting panics while trying to reproduce the other
problem. Thanks,

Signed-off-by: Josef Bacik
---
 fs/btrfs/extent-tree.c | 48 +++++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d0a3fe77bb7..b42efc2ded51 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5218,9 +5218,7 @@ loop:
 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
 	 * again
 	 */
-	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
-	    (found_uncached_bg || empty_size || empty_cluster ||
-	     allowed_chunk_alloc)) {
+	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
 		index = 0;
 		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
 			found_uncached_bg = false;
@@ -5260,32 +5258,36 @@ loop:
 			goto search;
 		}

-		if (loop < LOOP_CACHING_WAIT) {
-			loop++;
-			goto search;
-		}
+		loop++;

 		if (loop == LOOP_ALLOC_CHUNK) {
-			empty_size = 0;
-			empty_cluster = 0;
-		}
+			if (allowed_chunk_alloc) {
+				ret = do_chunk_alloc(trans, root, num_bytes +
+						     2 * 1024 * 1024, data,
+						     CHUNK_ALLOC_LIMITED);
+				allowed_chunk_alloc = 0;
+				if (ret == 1)
+					done_chunk_alloc = 1;
+			} else if (!done_chunk_alloc &&
+				   space_info->force_alloc ==
+				   CHUNK_ALLOC_NO_FORCE) {
+				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+			}

-		if (allowed_chunk_alloc) {
-			ret = do_chunk_alloc(trans, root, num_bytes +
-					     2 * 1024 * 1024, data,
-					     CHUNK_ALLOC_LIMITED);
-			allowed_chunk_alloc = 0;
-			done_chunk_alloc = 1;
-		} else if (!done_chunk_alloc &&
-			   space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
-			space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+			/*
+			 * We didn't allocate a chunk, go ahead and drop the
+			 * empty size and loop again.
+			 */
+			if (!done_chunk_alloc)
+				loop = LOOP_NO_EMPTY_SIZE;
 		}

-		if (loop < LOOP_NO_EMPTY_SIZE) {
-			loop++;
-			goto search;
+		if (loop == LOOP_NO_EMPTY_SIZE) {
+			empty_size = 0;
+			empty_cluster = 0;
 		}
-		ret = -ENOSPC;
+
+		goto search;
 	} else if (!ins->objectid) {
 		ret = -ENOSPC;
 	} else if (ins->objectid) {
--
cgit v1.2.3

From f4c44016218a6fce357715b9bbabbbbe1f69853c Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Mon, 13 Jun 2011 11:30:47 -0400
Subject: Btrfs: drop the delalloc_bytes check in shrink_delalloc

Even when delalloc_bytes is zero, we may need to sleep while waiting for
delalloc space.

Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs/btrfs/extent-tree.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b42efc2ded51..1f61bf5b4960 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3314,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	if (reserved == 0)
 		return 0;

-	/* nothing to shrink - nothing to reclaim */
-	if (root->fs_info->delalloc_bytes == 0)
-		return 0;
-
 	max_reclaim = min(reserved, to_reclaim);

 	while (loops < 1024) {
--
cgit v1.2.3
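Taken together, the last two shrink_delalloc patches leave the reclaim
path with roughly this shape: bounded retries with a short sleep, and no
early bail-out on a zero delalloc counter, since outstanding delalloc
work may still free space. A standalone userspace sketch of that loop
shape (a simplified stand-in, not the kernel code):

    #include <stdio.h>
    #include <time.h>
    #include <stdint.h>

    /* stand-in for the space actually reclaimed by flushing so far */
    static uint64_t reclaimed_so_far(int loops)
    {
        return (uint64_t)loops * 4096;
    }

    int main(void)
    {
        uint64_t to_reclaim = 64 * 1024;
        struct timespec delay = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

        for (int loops = 0; loops < 1024; loops++) {
            if (reclaimed_so_far(loops) >= to_reclaim) {
                printf("reclaimed enough after %d loops\n", loops);
                return 0;
            }
            /* sleep and retry even if the counter currently reads zero:
             * in-flight delalloc may still complete and free space */
            nanosleep(&delay, NULL);
        }
        printf("gave up\n");
        return 1;
    }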