From 546888da82082555a56528730a83f0afd12f33bf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 21 Apr 2009 11:53:38 -0400 Subject: Btrfs: fix btrfs fallocate oops and deadlock Btrfs fallocate was incorrectly starting a transaction with a lock held on the extent_io tree for the file, which could deadlock. Strictly speaking it was using join_transaction which would be safe, but it is better to move the transaction outside of the lock. When preallocated extents are overwritten, btrfs_mark_buffer_dirty was being called on an unlocked buffer. This was triggering an assertion and oops because the lock is supposed to be held. The bug was calling btrfs_mark_buffer_dirty on a leaf after btrfs_del_item had been run. btrfs_del_item takes care of dirtying things, so the solution is a to skip the btrfs_mark_buffer_dirty call in this case. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0d1dd492a58..65219f6a16a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4970,10 +4970,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -5015,7 +5011,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_hint = 0; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode, while (1) { struct btrfs_ordered_extent *ordered; + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out; + } + + /* the extent lock is ordered inside the running + * transaction + */ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, @@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, + ret = prealloc_file_range(trans, inode, cur_offset, last_byte, alloc_hint, mode); if (ret < 0) { free_extent_map(em); @@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, } unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); out: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.2.3 From 9601e3f6336f6ca66929f451b1f66085e68e36e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:33:09 +0200 Subject: Btrfs: kill btrfs_cache_create Just use kmem_cache_create directly. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 65219f6a16a1..176b6cc28b1e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4640,39 +4640,35 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); + + btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_DESTROY_BY_RCU, NULL); if (!btrfs_bit_radix_cachep) goto fail; return 0; -- cgit v1.2.3 From e980b50cda1610f1c17978d9b7fd311a9dd93877 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Apr 2009 14:39:24 -0400 Subject: Btrfs: fix fallocate deadlock on inode extent lock The btrfs fallocate call takes an extent lock on the entire range being fallocated, and then runs through insert_reserved_extent on each extent as they are allocated. The problem with this is that btrfs_drop_extents may decide to try and take the same extent lock fallocate was already holding. The solution used here is to push down knowledge of the range that is already locked going into btrfs_drop_extents. It turns out that at least one other caller had the same bug. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 176b6cc28b1e..2fdb2995be64 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -234,7 +234,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -1439,6 +1439,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1456,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1590,6 +1592,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -2877,6 +2881,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -4968,7 +4973,7 @@ out_fail: static int prealloc_file_range(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) + u64 locked_end, u64 alloc_hint, int mode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -4989,7 +4994,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5018,6 +5024,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; struct btrfs_trans_handle *trans; @@ -5039,6 +5046,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -5051,8 +5059,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, /* the extent lock is ordered inside the running * transaction */ - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5060,7 +5068,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); /* @@ -5085,7 +5093,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { ret = prealloc_file_range(trans, inode, cur_offset, - last_byte, alloc_hint, mode); + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5101,7 +5110,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); -- cgit v1.2.3 From 193f284d4985db0370a8a1bbdfb20df548cf9ffb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:29:05 -0400 Subject: Btrfs: ratelimit IO error printks Btrfs has printks for various IO errors, including bad checksums and mismatches between what we expect the block headers to contain and what we actually find on the disk. Longer term we need a real reporting mechanism for this, but for now printk is going to have to do. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2fdb2995be64..552e08afc7fb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1823,10 +1823,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); -- cgit v1.2.3 From 45c06543afe2772c02f21efee0e2138b4e1c911e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:49:10 -0400 Subject: Btrfs: remove unused btrfs_bit_radix slab Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 552e08afc7fb..98bd5069d54a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -4641,8 +4640,6 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } @@ -4673,11 +4670,6 @@ int btrfs_init_cachep(void) if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; return 0; fail: btrfs_destroy_cachep(); -- cgit v1.2.3 From 7b1a14bbb0e547aaa4d30cc376e6c8c12539ab0f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 10:49:53 -0400 Subject: Btrfs: fix acl caching Linus noticed the btrfs code to cache acls wasn't properly caching a NULL acl when the inode didn't have any acls. This meant the common case of no acls resulted in expensive btree searches every time the kernel checked permissions (which is quite often). This is a modified version of Linus' original patch: Properly set initial acl fields to BTRFS_ACL_NOT_CACHED in the inode. This forces an acl lookup when permission checks are done. Fix btrfs_get_acl to avoid lookups and locking when the inode acls fields are set to null. Fix btrfs_get_acl to use the right return value from __btrfs_getxattr when deciding to cache a NULL acl. It was storing a NULL acl when __btrfs_getxattr return -ENOENT, but __btrfs_getxattr was actually returning -ENODATA for this case. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98bd5069d54a..dc812ec551f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3047,8 +3047,8 @@ static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; -- cgit v1.2.3 From 46a53cca826e71effe59e3cb4f383622c33ebdcb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 11:47:50 -0400 Subject: Btrfs: look for acls during btrfs_read_locked_inode This changes btrfs_read_locked_inode() to peek ahead in the btree for acl items. If it is certain a given inode has no acls, it will set the in memory acl fields to null to avoid acl lookups completely. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc812ec551f6..90c23eb28829 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2015,6 +2015,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_free_path(path); } +/* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + /* * read an inode from the btree into the in-memory inode */ @@ -2026,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2072,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); -- cgit v1.2.3