summary refs log tree commit diff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/acl.c 3
-rw-r--r-- fs/btrfs/compression.c 14
-rw-r--r-- fs/btrfs/ctree.c 52
-rw-r--r-- fs/btrfs/ctree.h 13
-rw-r--r-- fs/btrfs/disk-io.c 4
-rw-r--r-- fs/btrfs/inode.c 8
-rw-r--r-- fs/btrfs/ioctl.c 27
-rw-r--r-- fs/btrfs/ordered-data.c 6
-rw-r--r-- fs/btrfs/qgroup.c 9
-rw-r--r-- fs/btrfs/scrub.c 9
-rw-r--r-- fs/btrfs/send.c 24
-rw-r--r-- fs/btrfs/sysfs.c 7
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c 2
-rw-r--r-- fs/btrfs/tree-log.c 59
-rw-r--r-- fs/btrfs/volumes.c 39
-rw-r--r-- fs/btrfs/volumes.h 2
-rw-r--r-- fs/btrfs/zoned.c 49
-rw-r--r-- fs/btrfs/zoned.h 11
18 files changed, 261 insertions, 77 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 548d6a5477b4..1e47b3ec3989 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -110,10 +110,11 @@ out:
return ret;
}
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret;
+ struct inode *inode = d_inode(dentry);
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f1f051ad3147..e6635fe70067 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -512,7 +512,7 @@ static u64 bio_end_offset(struct bio *bio)
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb,
- unsigned long *pflags)
+ int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
@@ -581,8 +581,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
- if (PageWorkingset(page))
+ if (!*memstall && PageWorkingset(page)) {
psi_memstall_enter(pflags);
+ *memstall = 1;
+ }
ret = set_page_extent_mapped(page);
if (ret < 0) {
@@ -670,8 +672,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
- /* Initialize to 1 to make skip psi_memstall_leave unless needed */
- unsigned long pflags = 1;
+ unsigned long pflags;
+ int memstall = 0;
blk_status_t ret;
int ret2;
int i;
@@ -727,7 +729,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto fail;
}
- add_ra_bio_pages(inode, em_start + em_len, cb, &pflags);
+ add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
@@ -807,7 +809,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
}
- if (!pflags)
+ if (memstall)
psi_memstall_leave(&pflags);
if (refcount_dec_and_test(&cb->pending_ios))
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b39b339fbf96..dcb510f38dda 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -114,6 +114,22 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
+/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -4647,7 +4663,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
- ASSERT(!path->nowait);
+ /*
+ * The nowait semantics are used only for write paths, where we don't
+ * use the tree mod log and sequence numbers.
+ */
+ if (time_seq)
+ ASSERT(!path->nowait);
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
@@ -4667,7 +4688,14 @@ again:
if (path->need_commit_sem) {
path->need_commit_sem = 0;
need_commit_sem = true;
- down_read(&fs_info->commit_root_sem);
+ if (path->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
@@ -4743,7 +4771,7 @@ again:
next = c;
ret = read_block_for_search(root, path, &next, level,
slot, &key);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN && !path->nowait)
goto again;
if (ret < 0) {
@@ -4753,6 +4781,10 @@ again:
if (!path->skip_locking) {
ret = btrfs_try_tree_read_lock(next);
+ if (!ret && path->nowait) {
+ ret = -EAGAIN;
+ goto done;
+ }
if (!ret && time_seq) {
/*
* If we don't get the lock, we may be racing
@@ -4783,7 +4815,7 @@ again:
ret = read_block_for_search(root, path, &next, level,
0, &key);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN && !path->nowait)
goto again;
if (ret < 0) {
@@ -4791,8 +4823,16 @@ again:
goto done;
}
- if (!path->skip_locking)
- btrfs_tree_read_lock(next);
+ if (!path->skip_locking) {
+ if (path->nowait) {
+ if (!btrfs_try_tree_read_lock(next)) {
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(next);
+ }
+ }
}
ret = 0;
done:
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f677b49df8ae..919670d35919 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3796,9 +3796,11 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
unsigned int line, int errno, bool first_hit);
+bool __cold abort_should_print_stack(int errno);
+
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
+ * detected, that way the exact stack trace is reported for some errors.
*/
#define btrfs_abort_transaction(trans, errno) \
do { \
@@ -3807,10 +3809,11 @@ do { \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
first = true; \
- if ((errno) != -EIO && (errno) != -EROFS) { \
- WARN(1, KERN_DEBUG \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
+ (errno))) { \
+ /* Stack trace printed. */ \
} else { \
btrfs_debug((trans)->fs_info, \
"Transaction aborted (error %d)", \
@@ -3990,7 +3993,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b28263c3d32..d99bf7c64611 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2551,7 +2551,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
- btrfs_init_devices_late(fs_info);
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
/*
* This tree can share blocks with some other fs tree during relocation
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0e516aefbf51..5a54bb93c413 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5256,7 +5256,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
return err;
@@ -11296,7 +11296,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.mknod = btrfs_mknod,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
@@ -11349,7 +11349,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.fileattr_get = btrfs_fileattr_get,
@@ -11360,7 +11360,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d5dd8bed1488..f897be9ec1e9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -3194,6 +3196,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -3205,7 +3209,6 @@ out:
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -4231,6 +4234,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
@@ -4281,21 +4286,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
size = min_t(u32, loi->size, SZ_16M);
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
- inodes = NULL;
- goto out;
+ goto out_loi;
}
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
inodes, ignore_offset);
+ btrfs_free_path(path);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4307,7 +4311,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
ret = -EFAULT;
out:
- btrfs_free_path(path);
kvfree(inodes);
out_loi:
kfree(loi);
@@ -5283,7 +5286,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
- ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
@@ -5382,7 +5385,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (args.len > args.unencoded_len - args.unencoded_offset)
goto out_acct;
- ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e54f8280031f..100d9f4836b1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -761,11 +761,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *ordered;
if (start + len < start) {
- orig_end = INT_LIMIT(loff_t);
+ orig_end = OFFSET_MAX;
} else {
orig_end = start + len - 1;
- if (orig_end > INT_LIMIT(loff_t))
- orig_end = INT_LIMIT(loff_t);
+ if (orig_end > OFFSET_MAX)
+ orig_end = OFFSET_MAX;
}
/* start IO across the range first to instantiate any delalloc
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9334c3157c22..b74105a10f16 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2951,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
dstgroup->rsv_excl = inherit->lim.rsv_excl;
- ret = update_qgroup_limit_item(trans, dstgroup);
- if (ret) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_info(fs_info,
- "unable to update quota limit for %llu",
- dstgroup->qgroupid);
- goto unlock;
- }
+ qgroup_dirty(fs_info, dstgroup);
}
if (srcid) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f260c53829e5..196c4c6ed1ed 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2672,17 +2672,11 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
- /*
- * Block size determines how many scrub_block will be allocated. Here
- * we use BTRFS_STRIPE_LEN (64KiB) as default limit, so we won't
- * allocate too many scrub_block, while still won't cause too large
- * bios for large extents.
- */
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
blocksize = map->stripe_len;
else
- blocksize = BTRFS_STRIPE_LEN;
+ blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
@@ -3917,7 +3911,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
- spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 145c84b44fd0..1c4b693ee4a3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5702,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
u64 ext_len;
u64 clone_len;
u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5759,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
if (key.offset >= clone_src_i_size)
break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5821,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fall back to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due to decompression
+ * and possible re-compression at the receiver.
+ */
+ break;
} else {
ret = send_extent_data(sctx, dst_path, offset,
clone_len);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 699b54b3acaa..74fef1f49c35 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -2321,8 +2321,11 @@ int __init btrfs_init_sysfs(void)
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9c478fa256f6..d43cb5242fec 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -200,7 +200,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 813986e38258..c3cf3dabe0b1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3694,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
u64 *last_old_dentry_offset)
{
struct btrfs_root *log = inode->root->log_root;
- struct extent_buffer *src = path->nodes[0];
- const int nritems = btrfs_header_nritems(src);
+ struct extent_buffer *src;
+ const int nritems = btrfs_header_nritems(path->nodes[0]);
const u64 ino = btrfs_ino(inode);
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
int i;
- for (i = path->slots[0]; i < nritems; i++) {
+ /*
+ * We need to clone the leaf, release the read lock on it, and use the
+ * clone before modifying the log tree. See the comment at copy_items()
+ * about why we need to do this.
+ */
+ src = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = src;
+ path->slots[0] = i;
+
+ for (; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -4303,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
{
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct extent_buffer *src = src_path->nodes[0];
+ struct extent_buffer *src;
int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -4314,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
const u64 i_size = i_size_read(&inode->vfs_inode);
+ /*
+ * To keep lockdep happy and avoid deadlocks, clone the source leaf and
+ * use the clone. This is because otherwise we would be changing the log
+ * tree, to insert items from the subvolume tree or insert csum items,
+ * while holding a read lock on a leaf from the subvolume tree, which
+ * creates a nasty lock dependency when COWing log tree nodes/leaves:
+ *
+ * 1) Modifying the log tree triggers an extent buffer allocation while
+ * holding a write lock on a parent extent buffer from the log tree.
+ * Allocating the pages for an extent buffer, or the extent buffer
+ * struct, can trigger inode eviction and finally the inode eviction
+ * will trigger a release/remove of a delayed node, which requires
+ * taking the delayed node's mutex;
+ *
+ * 2) Allocating a metadata extent for a log tree can trigger the async
+ * reclaim thread and make us wait for it to release enough space and
+ * unblock our reservation ticket. The reclaim thread can start
+ * flushing delayed items, and that in turn results in the need to
+ * lock delayed node mutexes and in the need to write lock extent
+ * buffers of a subvolume tree - all this while holding a write lock
+ * on the parent extent buffer in the log tree.
+ *
+ * So one task in scenario 1) running in parallel with another task in
+ * scenario 2) could lead to a deadlock, one wanting to lock a delayed
+ * node mutex while having a read lock on a leaf from the subvolume,
+ * while the other is holding the delayed node's mutex and wants to
+ * write lock the same subvolume leaf for flushing delayed items.
+ */
+ src = btrfs_clone_extent_buffer(src_path->nodes[0]);
+ if (!src)
+ return -ENOMEM;
+
+ i = src_path->slots[0];
+ btrfs_release_path(src_path);
+ src_path->nodes[0] = src;
+ src_path->slots[0] = i;
+
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
if (!ins_data)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a8d4bc6a1937..635f45f1a2ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1011,6 +1011,18 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
rcu_assign_pointer(device->name, name);
}
+ if (orig_dev->zone_info) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = btrfs_clone_dev_zone_info(orig_dev);
+ if (!zone_info) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ device->zone_info = zone_info;
+ }
+
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
@@ -6918,18 +6930,18 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
const struct btrfs_device *device)
{
- ASSERT((args->devid != (u64)-1) || args->missing);
+ if (args->missing) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+ }
- if ((args->devid != (u64)-1) && device->devid != args->devid)
+ if (device->devid != args->devid)
return false;
if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
return false;
- if (!args->missing)
- return true;
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
- !device->bdev)
- return true;
- return false;
+ return true;
}
/*
@@ -7744,10 +7756,11 @@ error:
return ret;
}
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
+ int ret = 0;
fs_devices->fs_info = fs_info;
@@ -7756,12 +7769,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
device->fs_info = fs_info;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- list_for_each_entry(device, &seed_devs->devices, dev_list)
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
device->fs_info = fs_info;
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ break;
+ }
seed_devs->fs_info = fs_info;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f8b668dc8bf8..099def5613b8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -671,7 +671,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index e2d073b08a7d..c9e2b0c85309 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
super[i] = page_address(page[i]);
}
- if (super[0]->generation > super[1]->generation)
+ if (btrfs_super_generation(super[0]) >
+ btrfs_super_generation(super[1]))
sector = zones[1].start;
else
sector = zones[0].start;
@@ -466,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
goto out;
}
- zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
goto out;
@@ -585,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
- kfree(zones);
+ kvfree(zones);
switch (bdev_zoned_model(bdev)) {
case BLK_ZONED_HM:
@@ -617,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
return 0;
out:
- kfree(zones);
+ kvfree(zones);
out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
@@ -639,6 +640,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
device->zone_info = NULL;
}
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
+{
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return NULL;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones)
+ goto out;
+
+ bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
+ zone_info->nr_zones);
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones)
+ goto out;
+
+ bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
+ zone_info->nr_zones);
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones)
+ goto out;
+
+ bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
+ zone_info->nr_zones);
+ zone_info->zone_cache = NULL;
+
+ return zone_info;
+
+out:
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->active_zones);
+ kfree(zone_info);
+ return NULL;
+}
+
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
{
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e17462db3a84..8bd16d40b7c6 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -36,6 +36,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
@@ -103,6 +104,16 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+/*
+ * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
+ * into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
+ */
+static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
+ struct btrfs_device *orig_dev)
+{
+ return NULL;
+}
+
static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
{
if (!btrfs_is_zoned(fs_info))