From 3c64a1aba7cfcb04f79e76f859b3d66660275d59 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 13 May 2013 13:53:35 +0000 Subject: Btrfs: cleanup: don't check the same thing twice btrfs_read_fs_root_no_name() already checks if btrfs_root_refs() is zero and returns ENOENT in this case. There is no need to do it again in six places. Signed-off-by: Stefan Behrens Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c931a4dbd031..3817c1e49035 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2262,11 +2262,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return 0; return PTR_ERR(root); } - if (btrfs_root_refs(&root->root_item) == 0) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - /* parse ENOENT to 0 */ - return 0; - } /* step 2: get inode */ key.objectid = backref->inum; @@ -4821,11 +4816,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - err = -ENOENT; - goto out; - } - *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; -- cgit v1.2.3 From eb73c1b7cea7d533288ef5297a0ea0e159db85b0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:22 +0000 Subject: Btrfs: introduce per-subvolume delalloc inode list When we create a snapshot, we need flush all delalloc inodes in the fs, just flushing the inodes in the source tree is OK. So we introduce per-subvolume delalloc inode list. 
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 167 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 123 insertions(+), 44 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3817c1e49035..18191f193b47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1528,6 +1528,46 @@ static void btrfs_merge_extent_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &root->fs_info->delalloc_roots); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1560,16 +1600,8 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_lock(&BTRFS_I(inode)->lock); 
BTRFS_I(inode)->delalloc_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1612,15 +1644,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, BTRFS_I(inode)->delalloc_bytes -= len; if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -8338,7 +8363,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct btrfs_inode *binode; struct inode *inode; @@ -8347,30 +8372,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head splice; int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_del_init(&binode->delalloc_inodes); - + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); inode = igrab(&binode->vfs_inode); if (!inode) { - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &binode->runtime_flags); + cond_resched_lock(&root->delalloc_lock); continue; } - - list_add_tail(&binode->delalloc_inodes, - &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); if (unlikely(!work)) { @@ -8382,16 +8400,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) &work->work); cond_resched(); - spin_lock(&root->fs_info->delalloc_lock); + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + 
spin_unlock(&root->delalloc_lock); + } + return ret; +} - /* the filemap_flush will queue IO into the worker threads, but +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + int ret; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + ret = __start_delalloc_inodes(root, delay_iput); + /* + * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return */ @@ -8403,17 +8444,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); + return ret; +} + +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput) +{ + struct btrfs_root *root; + struct list_head splice; + int ret; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = __start_delalloc_inodes(root, delay_iput); + btrfs_put_fs_root(root); + if (ret) + goto out; + + spin_lock(&fs_info->delalloc_root_lock); } + spin_unlock(&fs_info->delalloc_root_lock); + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + 
atomic_dec(&fs_info->async_submit_draining); + return 0; +out: if (!list_empty_careful(&splice)) { - spin_lock(&root->fs_info->delalloc_lock); - list_splice_tail(&splice, &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } return ret; } -- cgit v1.2.3 From 199c2a9c3d1389db7f7a211e64f6809d352ce5f6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 May 2013 07:48:23 +0000 Subject: Btrfs: introduce per-subvolume ordered extent list The reason we introduce per-subvolume ordered extent list is the same as the per-subvolume delalloc inode list. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 18191f193b47..51520755f4dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7991,9 +7991,9 @@ void btrfs_destroy_inode(struct inode *inode) */ smp_mb(); if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, -- cgit v1.2.3 From d52be818e618bd252601b340ca6df760d77410e8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 May 2013 14:54:47 -0400 Subject: Btrfs: simplify unlink reservations Dave pointed out a problem where if you filled up a file system as much as possible you couldn't remove any files. The whole unlink reservation thing is convoluted because it tries to guess if it's going to add space to unlink something or not, and has all these odd uncommented cases where it simply does not try. 
So to fix this I've added a way to conditionally steal from the global reserve if we can't make our normal reservation. If we have more than half the space in the global reserve free we will go ahead and steal from the global reserve. With this patch Dave's reproducer now works and I can rm all the files on the file system. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 212 ++++++------------------------------------------------- 1 file changed, 22 insertions(+), 190 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 51520755f4dc..c0e95b1554a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3679,53 +3679,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, } return ret; } - - -/* helper to check if there is any shared block in the path */ -static int check_path_shared(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct extent_buffer *eb; - int level; - u64 refs = 1; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - int ret; - - if (!path->nodes[level]) - break; - eb = path->nodes[level]; - if (!btrfs_block_can_be_shared(root, eb)) - continue; - ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1, - &refs, NULL); - if (refs > 1) - return 1; - } - return 0; -} /* * helper to start transaction for unlink and rmdir. * - * unlink and rmdir are special in btrfs, they do not always free space. - * so in enospc case, we should make sure they will free space before - * allowing them to use the global metadata reservation. + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. 
*/ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, - struct dentry *dentry) +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_dir_item *di; - struct inode *inode = dentry->d_inode; - u64 index; - int check_link = 1; - int err = -ENOSPC; int ret; - u64 ino = btrfs_ino(inode); - u64 dir_ino = btrfs_ino(dir); /* * 1 for the possible orphan item @@ -3738,158 +3705,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; - if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return ERR_PTR(-ENOSPC); - - /* check if there is someone else holds reference */ - if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) - return ERR_PTR(-ENOSPC); - - if (atomic_read(&inode->i_count) > 2) - return ERR_PTR(-ENOSPC); - - if (xchg(&root->fs_info->enospc_unlink, 1)) - return ERR_PTR(-ENOSPC); - - path = btrfs_alloc_path(); - if (!path) { - root->fs_info->enospc_unlink = 0; - return ERR_PTR(-ENOMEM); - } + if (PTR_ERR(trans) == -ENOSPC) { + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - /* 1 for the orphan item */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_free_path(path); - root->fs_info->enospc_unlink = 0; - return trans; - } - - path->skip_locking = 1; - path->search_commit_root = 1; - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(dir)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - 
btrfs_release_path(path); - - if (ret == 0 && S_ISREG(inode->i_mode)) { - ret = btrfs_lookup_file_extent(trans, root, path, - ino, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, 5); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); } - BUG_ON(ret == 0); /* Corruption */ - if (check_path_shared(root, path)) - goto out; - btrfs_release_path(path); - } - - if (!check_link) { - err = 0; - goto out; - } - - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - if (di) { - if (check_path_shared(root, path)) - goto out; - } else { - err = 0; - goto out; - } - btrfs_release_path(path); - - ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, - dentry->d_name.len, ino, dir_ino, 0, - &index); - if (ret) { - err = ret; - goto out; - } - - if (check_path_shared(root, path)) - goto out; - - btrfs_release_path(path); - - /* - * This is a commit root search, if we can lookup inode item and other - * relative items in the commit root, it means the transaction of - * dir/file creation has been committed, and the dir index item that we - * delay to insert has also been inserted into the commit root. So - * we needn't worry about the delayed insertion of the dir index item - * here. 
- */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - BUG_ON(ret == -ENOENT); - if (check_path_shared(root, path)) - goto out; - - err = 0; -out: - btrfs_free_path(path); - /* Migrate the orphan reservation over */ - if (!err) - err = btrfs_block_rsv_migrate(trans->block_rsv, - &root->fs_info->global_block_rsv, - trans->bytes_reserved); - - if (err) { - btrfs_end_transaction(trans, root); - root->fs_info->enospc_unlink = 0; - return ERR_PTR(err); - } - - trans->block_rsv = &root->fs_info->global_block_rsv; - return trans; -} - -static void __unlink_end_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); trans->block_rsv = &root->fs_info->trans_block_rsv; - BUG_ON(!root->fs_info->enospc_unlink); - root->fs_info->enospc_unlink = 0; + trans->bytes_reserved = num_bytes; } - btrfs_end_transaction(trans, root); + return trans; } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3899,7 +3731,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3917,7 +3749,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return ret; } @@ -4014,7 +3846,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) return -EPERM; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4036,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, 
struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return err; -- cgit v1.2.3 From c69b26b0116dc2c064457991bec91b6d94e96d65 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Jun 2013 16:51:23 -0400 Subject: Btrfs: add some missing iput()'s in btrfs_orphan_cleanup There are some error cases that we don't do an iput() on our inode, fix this. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0e95b1554a0..c52ceb8c24e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3234,13 +3234,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* 1 for the orphan item deletion. */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { + iput(inode); ret = PTR_ERR(trans); goto out; } ret = btrfs_orphan_add(trans, inode); btrfs_end_transaction(trans, root); - if (ret) + if (ret) { + iput(inode); goto out; + } ret = btrfs_truncate(inode); if (ret) -- cgit v1.2.3 From 01cd33674e95296e1647da3534b9aef1e98556b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Jun 2013 21:39:49 -0400 Subject: Btrfs: put our inode if orphan cleanup fails When we cross into a different subvol when doing a lookup we will run the orphan cleanup. If this fails however we do not drop the ref to the inode we were looking up before we return an error, which leads to busy inodes on umount. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c52ceb8c24e0..a2df4690b000 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4941,8 +4941,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!(inode->i_sb->s_flags & MS_RDONLY)) ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); - if (ret) + if (ret) { + iput(inode); inode = ERR_PTR(ret); + } } return inode; -- cgit v1.2.3 From fdf8e2ea3cba9ef03087482b11258d844d6cbea3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Jun 2013 16:58:23 -0400 Subject: Btrfs: unlock extent range on enospc in compressed submit A user reported a deadlock where the async submit thread was blocked on the lock_extent() lock, and then everybody behind him was locked on the page lock for the page he was holding. Looking at the code I noticed we do not unlock the extent range when we get ENOSPC and goto retry. This is bad because we immediately try to lock that range again to do the cow, which will cause a deadlock. Fix this by unlocking the range. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2df4690b000..509112da6118 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -700,8 +700,12 @@ retry: async_extent->nr_pages = 0; async_extent->pages = NULL; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); goto retry; + } goto out_free; } -- cgit v1.2.3 From a71754fc68f740b7ed46bb83123c63fbbc130611 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 17 Jun 2013 17:14:39 -0400 Subject: Btrfs: move btrfs_truncate_page to btrfs_cont_expand instead of btrfs_truncate This has plagued us forever and I'm so over working around it. When we truncate down to a non-page aligned offset we will call btrfs_truncate_page to zero out the end of the page and write it back to disk, this will keep us from exposing stale data if we truncate back up from that point. The problem with this is it requires data space to do this, and people don't really expect to get ENOSPC from truncate() for these sort of things. This also tends to bite the orphan cleanup stuff too which keeps people from mounting. To get around this we can just move this into btrfs_cont_expand() to make sure if we are truncating up from a non-page size aligned i_size we will zero out the rest of this page so that we don't expose stale data. This will give ENOSPC if you try to truncate() up or if you try to write past the end of isize, which is much more reasonable. This fixes xfstests generic/083 failing to mount because of the orphan cleanup failing. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 509112da6118..b7fa96f72ecd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4253,6 +4253,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) u64 hole_size; int err = 0; + /* + * If our size started in the middle of a page we need to zero out the + * rest of the page before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_page(inode, oldsize, 0, 0); + if (err) + return err; + if (size <= hole_start) return 0; @@ -7565,16 +7574,12 @@ static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - int ret; + int ret = 0; int err = 0; struct btrfs_trans_handle *trans; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); - if (ret) - return ret; - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); -- cgit v1.2.3 From f23b5a59955c0ea13c6da211fb06f39348e3c794 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2013 10:16:26 -0400 Subject: Btrfs: check for actual acls rather than just xattrs when caching no acl We have an optimization that will go ahead and cache no acls on an inode if there are no xattrs on the inode. This saves us a lookup later to check the acls for writes or any other access. The problem is I use selinux so I always have an xattr on inodes, so make this test a little smarter and check for the actual acl hash on the key and if it isn't there then we still get to cache no acl which makes everybody who uses selinux a little happier. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b7fa96f72ecd..8edcdf6910f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -56,6 +57,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "backref.h" +#include "hash.h" struct btrfs_iget_args { u64 ino; @@ -3300,8 +3302,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; int scanned = 0; + if (!xattr_access) { + xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)); + } + slot++; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3311,8 +3322,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 0; /* we found an xattr, assume we've got an acl */ - if (found_key.type == BTRFS_XATTR_ITEM_KEY) - return 1; + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } /* * we found a key greater than an xattr key, there can't -- cgit v1.2.3 From 7ee9e4405f264e9eda808aa5ca4522746a1af9c1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jun 2013 16:37:03 -0400 Subject: Btrfs: check if we can nocow if we don't have data space We always just try and reserve data space when we write, but if we are out of space but have prealloc'ed extents we should still successfully write. This patch will try and see if we can write to prealloc'ed space and if we can go ahead and allow the write to continue. 
With this patch we now pass xfstests generic/274. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8edcdf6910f7..4d7c02258390 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1641,7 +1641,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list) + && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, @@ -6396,10 +6396,10 @@ out: * returns 1 when the nocow is safe, < 1 on error, 0 if the * block must be cow'd */ -static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) +noinline int can_nocow_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) { struct btrfs_path *path; int ret; @@ -6413,7 +6413,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, u64 num_bytes; int slot; int found_type; - + bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6453,18 +6453,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, /* not a regular extent, must cow */ goto out; } + + if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + goto out; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + backref_offset = btrfs_file_extent_offset(leaf, fi); - *orig_start = 
key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (orig_start) { + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end < offset + *len) { - /* extent doesn't include our full range, must cow */ - goto out; - } if (btrfs_extent_readonly(root, disk_bytenr)) goto out; @@ -6708,8 +6718,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (IS_ERR(trans)) goto must_cow; - if (can_nocow_odirect(trans, inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + if (can_nocow_extent(trans, inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, -- cgit v1.2.3 From e6da5d2ec9870ddadf4dbc6a1835470636df25bb Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Jun 2013 18:19:17 +0800 Subject: Btrfs: cleanup redundant code in btrfs_submit_direct() Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4d7c02258390..0a43d42268f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7148,7 +7148,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; - struct bio_vec *bvec = dio_bio->bi_io_vec; struct bio *io_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -7170,16 +7169,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, } dip->private = dio_bio->bi_private; - io_bio->bi_private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - - 
dip->bytes = 0; - do { - dip->bytes += bvec->bv_len; - bvec++; - } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1)); - + dip->bytes = dio_bio->bi_size; dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; io_bio->bi_private = dip; dip->errors = 0; -- cgit v1.2.3 From 0e267c44c3a402d35111d1935be1167240b5b79f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 2 Jul 2013 10:38:02 -0400 Subject: Btrfs: wait ordered range before doing direct io My recent truncate patch uncovered this bug, but I can reproduce it without the truncate patch. If you mount with -o compress-force, do a direct write to some area, do a buffered write to some other area, and then do a direct read you will get the wrong data for where you did the buffered write. This is because the generic direct io helpers only call filemap_write_and_wait once, and for compression we need it twice. So to be safe add the btrfs_wait_ordered_range to the start of the direct io function to make sure any compressed writes have truly been written. This patch makes xfstests 130 pass when you mount with -o compress-force=lzo. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0a43d42268f7..55dda871437f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7270,8 +7270,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, atomic_inc(&inode->i_dio_count); smp_mb__after_atomic_inc(); + /* + * The generic stuff only does filemap_write_and_wait_range, which isn't + * enough if we've written compressed pages to this area, so we need to + * call btrfs_wait_ordered_range to make absolutely sure that any + * outstanding dirty pages are on disk. 
+ */ + count = iov_length(iov, nr_segs); + btrfs_wait_ordered_range(inode, offset, count); + if (rw & WRITE) { - count = iov_length(iov, nr_segs); /* * If the write DIO is beyond the EOF, we need update * the isize, but it is protected by i_mutex. So we can -- cgit v1.2.3