Diffstat (limited to 'fs')
352 files changed, 10209 insertions, 7163 deletions
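A recurring conversion in the hunks below replaces open-coded reads of inode->i_state with the inode_state_read_once() accessor. A minimal sketch of the pattern as it appears in the per-filesystem iget helpers, assuming inode_state_read_once() is a READ_ONCE()-style wrapper as these hunks suggest (example_iget() is a hypothetical name, not part of this diff):

	/* Sketch only: mirrors the iget helpers converted below. */
	static struct inode *example_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = iget_locked(sb, ino);

		if (!inode)
			return ERR_PTR(-ENOMEM);
		/* Was: if (!(inode->i_state & I_NEW)) */
		if (!(inode_state_read_once(inode) & I_NEW))
			return inode;	/* already in cache and fully set up */

		/* ... read on-disk metadata and initialise the new inode ... */

		unlock_new_inode(inode);
		return inode;
	}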
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eed551d8555f..633da5e37299 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <linux/slab.h> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index eb0b083da269..612a230bc012 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { - struct inode *inode; - - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_ALL, - .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, - /* absolute end, byte at end included */ - .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + - (vma->vm_end - vma->vm_start - 1), - }; - if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); - inode = file_inode(vma->vm_file); - filemap_fdatawrite_wbc(inode->i_mapping, &wbc); + filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping, + (loff_t)vma->vm_pgoff * PAGE_SIZE, + (loff_t)vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1)); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d0c77ec31b1d..8666c9c62258 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -422,7 +422,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; /* * initialize the inode with the stat info diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index be297e335468..1661a25f2772 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -112,7 +112,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; /* * initialize the inode with the stat info diff --git a/fs/Makefile b/fs/Makefile index e3523ab2e587..a04274a3c854 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ + fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ file_attr.o diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0210df8d3500..0bfc7d151dcd 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; pr_debug("affs_iget(%lu)\n", inode->i_ino); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f31359922e98..71c10a05cebe 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -140,7 +140,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } - cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL); + /* Allocate the cell name and the key name in one go. 
*/ + cell->name = kmalloc(1 + namelen + 1 + + 4 + namelen + 1, GFP_KERNEL); if (!cell->name) { kfree(cell); return ERR_PTR(-ENOMEM); @@ -151,7 +153,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); - cell->name[i] = 0; + cell->name[i++] = 0; + + cell->key_desc = cell->name + i; + memcpy(cell->key_desc, "afs@", 4); + memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1); cell->net = net; refcount_set(&cell->ref, 1); @@ -229,7 +235,7 @@ error: * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. - * @excl: T if an error should be given if the cell name already exists. + * @reason: The reason we're doing the lookup * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if @@ -239,7 +245,8 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; @@ -247,12 +254,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, enum afs_cell_state state; int ret, n; - _enter("%s,%s", name, vllist); + _enter("%s,%s,%u", name, vllist, reason); - if (!excl) { + if (reason != AFS_LOOKUP_CELL_PRELOAD) { cell = afs_find_cell(net, name, namesz, trace); - if (!IS_ERR(cell)) + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; goto wait_for_cell; + } } /* Assume we're probably going to create a cell and preallocate and @@ -298,26 +311,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_queue_new); +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } wait_for_cell: - _debug("wait_for_cell"); state = smp_load_acquire(&cell->state); /* vs error */ - if (state != AFS_CELL_ACTIVE && - state != AFS_CELL_DEAD) { + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); afs_see_cell(cell, afs_cell_trace_wait); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; })); + _debug("waited_for_cell %d %d", cell->state, cell->error); } +no_wait: /* Check the state obtained from the wait check. 
*/ + state = smp_load_acquire(&cell->state); /* vs error */ if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } _leave(" = %p [cell]", cell); return cell; @@ -325,7 +381,7 @@ wait_for_cell: cell_already_exists: _debug("cell exists"); cell = cursor; - if (excl) { + if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { afs_use_cell(cursor, trace); @@ -384,7 +440,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return -EINVAL; /* allocate a cell record for the root/workstation cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); @@ -660,33 +717,6 @@ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) } /* - * Allocate a key to use as a placeholder for anonymous user security. - */ -static int afs_alloc_anon_key(struct afs_cell *cell) -{ - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp; - - /* Create a key to represent an anonymous user. */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = tolower(*cp); - } while (*cp++); - - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) - return PTR_ERR(key); - - cell->anonymous_key = key; - - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); - return 0; -} - -/* * Activate a cell. 
*/ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) @@ -695,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) struct afs_cell *pcell; int ret; - if (!cell->anonymous_key) { - ret = afs_alloc_anon_key(cell); - if (ret < 0) - return ret; - } - ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -777,6 +801,7 @@ static bool afs_manage_cell(struct afs_cell *cell) switch (cell->state) { case AFS_CELL_SETTING_UP: goto set_up_cell; + case AFS_CELL_UNLOOKED: case AFS_CELL_ACTIVE: goto cell_is_active; case AFS_CELL_REMOVING: @@ -797,7 +822,7 @@ set_up_cell: goto remove_cell; } - afs_set_cell_state(cell, AFS_CELL_ACTIVE); + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); cell_is_active: if (afs_has_cell_expired(cell, &next_manage)) @@ -807,6 +832,8 @@ cell_is_active: ret = afs_update_cell(cell); if (ret < 0) cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); } if (next_manage < TIME64_MAX && cell->net->live) { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 89d36e3e5c79..f4e9e12373ac 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); - bool supports_ibulk; + bool supports_ibulk, isnew; long ret; int i; @@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) * callback counters. */ ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode, - afs_ilookup5_test_by_fid, &vp->fid); + afs_ilookup5_test_by_fid, &vp->fid, &isnew); if (!IS_ERR_OR_NULL(ti)) { vnode = AFS_FS_I(ti); vp->dv_before = vnode->status.data_version; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 8c6130789fde..aa56e8951e03 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -64,7 +64,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) vnode = AFS_FS_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); @@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry * dotted = true; } - cell = afs_lookup_cell(net, name, len, NULL, false, + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, afs_cell_trace_use_lookup_dynroot); if (IS_ERR(cell)) { ret = PTR_ERR(cell); @@ -258,7 +259,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry vnode = AFS_FS_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 1); @@ -383,7 +384,7 @@ struct inode *afs_dynroot_iget_root(struct super_block *sb) vnode = AFS_FS_I(inode); /* there shouldn't be an existing inode */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e1cb17b85791..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->netfs.inode.i_state & I_NEW) { + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { 
ret = afs_inode_init_from_status(op, vp, vnode); afs_op_set_error(op, ret); if (ret == 0) @@ -579,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* deal with an existing inode */ - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { _leave(" = %p", inode); return inode; } @@ -639,7 +639,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) _debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid); - BUG_ON(!(inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(inode) & I_NEW)); vnode = AFS_FS_I(inode); vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); @@ -748,7 +748,7 @@ void afs_evict_inode(struct inode *inode) if ((S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - (inode->i_state & I_DIRTY) && + (inode_state_read_once(inode) & I_DIRTY) && !sbi->dyn_root) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a45ae5c2ef8a..009064b8d661 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -343,6 +343,7 @@ extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, AFS_CELL_ACTIVE, AFS_CELL_REMOVING, AFS_CELL_DEAD, @@ -412,6 +413,7 @@ struct afs_cell { u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ + char *key_desc; /* Authentication key description */ }; /* @@ -1049,9 +1051,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 1ad048e6e164..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 40e879c8ca77..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true, + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index c1cadf8fb346..bf0e4ea0aafd 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -82,16 +82,16 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) pr_err("Couldn't create 
RxGK CM key: %d\n", ret); - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); } if (ret < 0) goto error_2; srx.srx_service = YFS_CM_SERVICE; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); if (ret < 0) goto error_2; diff --git a/fs/afs/security.c b/fs/afs/security.c index 6a7744c9e2a2..55ddce94af03 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -16,6 +16,31 @@ static DEFINE_HASHTABLE(afs_permits_cache, 10); static DEFINE_SPINLOCK(afs_permits_lock); +static DEFINE_MUTEX(afs_key_lock); + +/* + * Allocate a key to use as a placeholder for anonymous user security. + */ +static int afs_alloc_anon_key(struct afs_cell *cell) +{ + struct key *key; + + mutex_lock(&afs_key_lock); + key = cell->anonymous_key; + if (!key) { + key = rxrpc_get_null_key(cell->key_desc); + if (!IS_ERR(key)) + cell->anonymous_key = key; + } + mutex_unlock(&afs_key_lock); + + if (IS_ERR(key)) + return PTR_ERR(key); + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} /* * get a key @@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock); struct key *afs_request_key(struct afs_cell *cell) { struct key *key; + int ret; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net(&key_type_rxrpc, cell->key_desc, cell->net->net, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell) return key; } + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ERR_PTR(ret); + } + /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); @@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) { struct key *key; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net_rcu(&key_type_rxrpc, - cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc, cell->net->net); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) } /* act as anonymous user */ + if (!cell->anonymous_key) + return NULL; /* Need to allocate */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); } else { @@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode, if (mask & MAY_NOT_BLOCK) { key = afs_request_key_rcu(vnode->volume->cell); - if (IS_ERR(key)) + if (IS_ERR_OR_NULL(key)) return -ECHILD; ret = -ECHILD; diff --git a/fs/afs/super.c b/fs/afs/super.c index da407f2d6f0d..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, 
cellname, cellnamesz, - NULL, false, + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 709b4cdb723e..fc9676abd252 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) @@ -1640,10 +1640,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - const struct cred *old_cred = override_creds(iocb->fsync.creds); - iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); - revert_creds(old_cred); + scoped_with_creds(iocb->fsync.creds) + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + put_cred(iocb->fsync.creds); iocb_put(iocb); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 180a458fc4f7..b8381c7fb636 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -280,27 +280,8 @@ static int __anon_inode_getfd(const char *name, const struct inode *context_inode, bool make_inode) { - int error, fd; - struct file *file; - - error = get_unused_fd_flags(flags); - if (error < 0) - return error; - fd = error; - - file = __anon_inode_getfile(name, fops, priv, flags, context_inode, - make_inode); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - return error; + return FD_ADD(flags, __anon_inode_getfile(name, fops, priv, flags, + context_inode, make_inode)); } /** diff --git a/fs/attr.c b/fs/attr.c index 795f231d00e8..b9ec6b47bab2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -415,7 +415,7 @@ EXPORT_SYMBOL(may_setattr); * performed on the raw inode simply pass @nop_mnt_idmap. 
*/ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr, struct inode **delegated_inode) + struct iattr *attr, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 23cea74f9933..4fd555528c5d 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -16,6 +16,7 @@ #include <linux/wait.h> #include <linux/sched.h> #include <linux/sched/signal.h> +#include <uapi/linux/mount.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/uaccess.h> @@ -27,6 +28,9 @@ #include <linux/magic.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include "../mount.h" +#include <linux/ns_common.h> + /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -114,6 +118,7 @@ struct autofs_sb_info { int pipefd; struct file *pipe; struct pid *oz_pgrp; + u64 mnt_ns_id; int version; int sub_version; int min_proto; diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index d8dd150cbd74..a58f9248b0f5 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -231,32 +231,14 @@ static int test_by_type(const struct path *path, void *p) */ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { - int err, fd; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (likely(fd >= 0)) { - struct file *filp; - struct path path; - - err = find_autofs_mount(name, &path, test_by_dev, &devid); - if (err) - goto out; - - filp = dentry_open(&path, O_RDONLY, current_cred()); - path_put(&path); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - goto out; - } - - fd_install(fd, filp); - } + struct path path __free(path_put) = {}; + int err; - return fd; + err = find_autofs_mount(name, &path, test_by_dev, &devid); + if (err) + return err; -out: - put_unused_fd(fd); - return err; + return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); } /* Open a file descriptor on an autofs mount point */ @@ -381,6 +363,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; + sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index f5c16ffba013..732aee76a24c 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -251,6 +251,7 @@ static struct autofs_sb_info *autofs_alloc_sbi(void) sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; sbi->pipefd = -1; + sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; set_autofs_type_indirect(&sbi->type); mutex_init(&sbi->wq_mutex); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 174c7205fee4..d10df9d89d1c 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -341,6 +341,14 @@ static struct vfsmount *autofs_d_automount(struct path *path) if (autofs_oz_mode(sbi)) return NULL; + /* Refuse to trigger mount if current namespace is not the owner + * and the mount is propagation private. + */ + if (sbi->mnt_ns_id != to_ns_common(current->nsproxy->mnt_ns)->ns_id) { + if (vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE) + return ERR_PTR(-EPERM); + } + /* * If an expire request is pending everyone must wait. 
* If the expire fails we're still mounted so continue diff --git a/fs/backing-file.c b/fs/backing-file.c index 15a7f8031084..45da8600d564 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -157,13 +157,37 @@ static int backing_aio_init_wq(struct kiocb *iocb) return sb_init_dio_done_wq(sb); } +static int do_backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags) +{ + struct backing_aio *aio = NULL; + int ret; + + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + return vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } + + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + return -ENOMEM; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + return ret; +} ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { - struct backing_aio *aio = NULL; - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) @@ -176,41 +200,57 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; - old_cred = override_creds(ctx->cred); + scoped_with_creds(ctx->cred) + ret = do_backing_file_read_iter(file, iter, iocb, flags); + + if (ctx->accessed) + ctx->accessed(iocb->ki_filp); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +static int do_backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + void (*end_write)(struct kiocb *, ssize_t)) +{ + struct backing_aio *aio; + int ret; + if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); - ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); - } else { - ret = -ENOMEM; - aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); - if (!aio) - goto out; - - aio->orig_iocb = iocb; - kiocb_clone(&aio->iocb, iocb, get_file(file)); - aio->iocb.ki_complete = backing_aio_rw_complete; - refcount_set(&aio->ref, 2); - ret = vfs_iocb_iter_read(file, &aio->iocb, iter); - backing_aio_put(aio); - if (ret != -EIOCBQUEUED) - backing_aio_cleanup(aio, ret); + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (end_write) + end_write(iocb, ret); + return ret; } -out: - revert_creds(old_cred); - if (ctx->accessed) - ctx->accessed(iocb->ki_filp); + ret = backing_aio_init_wq(iocb); + if (ret) + return ret; + + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + return -ENOMEM; + aio->orig_iocb = iocb; + aio->end_write = end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); return ret; } -EXPORT_SYMBOL_GPL(backing_file_read_iter); ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) @@ -227,46 +267,8 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; - /* - * Stacked 
filesystems don't support deferred completions, don't copy - * this property in case it is set by the issuer. - */ - flags &= ~IOCB_DIO_CALLER_COMP; - - old_cred = override_creds(ctx->cred); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(flags); - - ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); - if (ctx->end_write) - ctx->end_write(iocb, ret); - } else { - struct backing_aio *aio; - - ret = backing_aio_init_wq(iocb); - if (ret) - goto out; - - ret = -ENOMEM; - aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); - if (!aio) - goto out; - - aio->orig_iocb = iocb; - aio->end_write = ctx->end_write; - kiocb_clone(&aio->iocb, iocb, get_file(file)); - aio->iocb.ki_flags = flags; - aio->iocb.ki_complete = backing_aio_queue_completion; - refcount_set(&aio->ref, 2); - ret = vfs_iocb_iter_write(file, &aio->iocb, iter); - backing_aio_put(aio); - if (ret != -EIOCBQUEUED) - backing_aio_cleanup(aio, ret); - } -out: - revert_creds(old_cred); - - return ret; + scoped_with_creds(ctx->cred) + return do_backing_file_write_iter(file, iter, iocb, flags, ctx->end_write); } EXPORT_SYMBOL_GPL(backing_file_write_iter); @@ -275,15 +277,13 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, unsigned int flags, struct backing_file_ctx *ctx) { - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) return -EIO; - old_cred = override_creds(ctx->cred); - ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); - revert_creds(old_cred); + scoped_with_creds(ctx->cred) + ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); if (ctx->accessed) ctx->accessed(iocb->ki_filp); @@ -297,7 +297,6 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, size_t len, unsigned int flags, struct backing_file_ctx *ctx) { - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) @@ -310,11 +309,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (ret) return ret; - old_cred = override_creds(ctx->cred); - file_start_write(out); - ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); - file_end_write(out); - revert_creds(old_cred); + scoped_with_creds(ctx->cred) { + file_start_write(out); + ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); + file_end_write(out); + } if (ctx->end_write) ctx->end_write(iocb, ret); @@ -326,7 +325,6 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write); int backing_file_mmap(struct file *file, struct vm_area_struct *vma, struct backing_file_ctx *ctx) { - const struct cred *old_cred; struct file *user_file = vma->vm_file; int ret; @@ -338,9 +336,8 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, vma_set_file(vma, file); - old_cred = override_creds(ctx->cred); - ret = vfs_mmap(vma->vm_file, vma); - revert_creds(old_cred); + scoped_with_creds(ctx->cred) + ret = vfs_mmap(vma->vm_file, vma); if (ctx->accessed) ctx->accessed(user_file); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8f430ff8e445..9fcfdd6b8189 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -307,7 +307,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; befs_ino = BEFS_I(inode); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1d41ce477df5..ce6f83234b67 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -42,7 +42,7 @@ struct inode 
*bfs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { @@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; - inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + /* + * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that + * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode + * value. This means that, although bfs_write_inode() saves whole + * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX} + * bits), middle 7 bits of di->i_mode value can be garbage when these + * bits were not saved by bfs_write_inode(). + * Since we can't tell whether middle 7 bits are garbage, use only + * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being + * garbage) and reconstruct S_IFMT bits for Linux environment from + * di->i_vtype value. + */ + inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; @@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; + } else { + brelse(bh); + printf("Unknown vtype=%u %s:%08lx\n", + le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino); + goto error; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e4653bb99946..3eb734c192e9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,7 +46,7 @@ #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> -#include <linux/rseq.h> +#include <uapi/linux/rseq.h> #include <asm/param.h> #include <asm/page.h> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a839f960cd4a..d7aec5b87c2b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -782,8 +782,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { - const struct cred *old_cred; - /* * Now that we support unprivileged binfmt_misc mounts make * sure we use the credentials that the register @file was @@ -791,9 +789,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, * didn't matter much as only a privileged process could open * the register file. 
*/ - old_cred = override_creds(file->f_cred); - f = open_exec(e->interpreter); - revert_creds(old_cred); + scoped_with_creds(file->f_cred) + f = open_exec(e->interpreter); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); @@ -837,8 +834,10 @@ out: inode_unlock(d_inode(root)); if (err) { - if (f) + if (f) { + exe_file_allow_write_access(f); filp_close(f, NULL); + } kfree(e); return err; } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 99b3ced12805..78721412951c 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" #include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index e0ba00d64ea0..c336e2ab7f8a 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -14,12 +14,13 @@ #include "ctree.h" #include "xattr.h" #include "acl.h" +#include "misc.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { int size; const char *name; - char *value = NULL; + char AUTO_KFREE(value); struct posix_acl *acl; if (rcu) @@ -49,7 +50,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) acl = NULL; else acl = ERR_PTR(size); - kfree(value); return acl; } @@ -59,7 +59,7 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, { int ret, size = 0; const char *name; - char *value = NULL; + char AUTO_KFREE(value); switch (type) { case ACL_TYPE_ACCESS: @@ -85,28 +85,23 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); - if (!value) { - ret = -ENOMEM; - goto out; - } + if (!value) + return -ENOMEM; ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) - goto out; + return ret; } if (trans) ret = btrfs_setxattr(trans, inode, name, value, size, 0); else ret = btrfs_setxattr_trans(inode, name, value, size, 0); + if (ret < 0) + return ret; -out: - kfree(value); - - if (!ret) - set_cached_acl(inode, type, acl); - - return ret; + set_cached_acl(inode, type, acl); + return 0; } int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2ab550a1e715..78da47a3d00e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -666,10 +666,9 @@ static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); btrfs_debug(ctx->fs_info, - "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", - ref->root_id, level, ref->count, ret, - ref->key_for_search.objectid, ref->key_for_search.type, - ref->key_for_search.offset); +"search slot in root %llu (level %d, ref count %d) returned %d for key " BTRFS_KEY_FMT, + ref->root_id, level, ref->count, ret, + BTRFS_KEY_FMT_VALUE(&ref->key_for_search)); if (ret < 0) goto out; @@ -1409,12 +1408,12 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, if (!path) return -ENOMEM; if (!ctx->trans) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } if (ctx->time_seq == BTRFS_SEQ_LAST) - path->skip_locking = 1; + path->skip_locking = true; again: head = NULL; @@ -1561,7 +1560,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); + ret = 
add_missing_keys(ctx->fs_info, &preftrees, !path->skip_locking); if (ret) goto out; @@ -2786,7 +2785,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) * allocates space to return multiple file system paths for an inode. * total_bytes to allocate are passed, note that space usable for actual path * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. + * the returned pointer must be freed with __free_inode_fs_paths() in the end. */ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path) @@ -2811,14 +2810,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, return ifp; } -void free_ipath(struct inode_fs_paths *ipath) -{ - if (!ipath) - return; - kvfree(ipath->fspath); - kfree(ipath); -} - struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; @@ -2834,8 +2825,8 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf } /* Current backref iterator only supports iteration in commit root */ - ret->path->search_commit_root = 1; - ret->path->skip_locking = 1; + ret->path->search_commit_root = true; + ret->path->skip_locking = true; ret->fs_info = fs_info; return ret; @@ -3308,8 +3299,8 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, level = cur->level + 1; /* Search the tree to find parent blocks referring to the block */ - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; path->lowest_level = level; ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); path->lowest_level = 0; @@ -3323,9 +3314,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, eb = path->nodes[level]; if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, -"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", +"couldn't find block (%llu) (level %d) in tree (%llu) with key " BTRFS_KEY_FMT, cur->bytenr, level - 1, btrfs_root_id(root), - tree_key->objectid, tree_key->type, tree_key->offset); + BTRFS_KEY_FMT_VALUE(tree_key)); btrfs_put_root(root); ret = -ENOENT; goto out; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 25d51c246070..1d009b0f4c69 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -241,7 +241,12 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); -void free_ipath(struct inode_fs_paths *ipath); + +DEFINE_FREE(inode_fs_paths, struct inode_fs_paths *, + if (_T) { + kvfree(_T->fspath); + kfree(_T); + }) int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 21df48e6c4fa..fa1d321a2fb8 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -41,13 +41,17 @@ static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio) * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. 
*/ -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private) { + /* @inode parameter is mandatory. */ + ASSERT(inode); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->fs_info = fs_info; + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + bbio->file_offset = file_offset; atomic_set(&bbio->pending_ios, 1); WRITE_ONCE(bbio->status, BLK_STS_OK); } @@ -60,7 +64,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, * a mempool. */ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private) { struct btrfs_bio *bbio; @@ -68,7 +72,7 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, fs_info, end_io, private); + btrfs_bio_init(bbio, inode, file_offset, end_io, private); return bbio; } @@ -85,13 +89,13 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return ERR_CAST(bio); bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); - bbio->inode = orig_bbio->inode; - bbio->file_offset = orig_bbio->file_offset; + btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio); orig_bbio->file_offset += map_length; if (bbio_has_ordered_extent(bbio)) { refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; + bbio->orig_logical = orig_bbio->orig_logical; + orig_bbio->orig_logical += map_length; } bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); @@ -100,6 +104,12 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + /* Make sure we're already in task context. */ + ASSERT(in_task()); + + if (bbio->async_csum) + wait_for_completion(&bbio->csum_done); + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -163,11 +173,30 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct btrfs_failed_bio *fbio = repair_bbio->private; struct btrfs_inode *inode = repair_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + /* + * We cannot move the saved_iter forward, as it will later be + * used again by repair_bbio. + */ + struct bvec_iter saved_iter = repair_bbio->saved_iter; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT; + const u32 nr_steps = repair_bbio->saved_iter.bi_size / step; int mirror = repair_bbio->mirror_num; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + phys_addr_t paddr; + unsigned int slot = 0; + + /* Repair bbio should be exactly one block sized. 
*/ + ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize); + + btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { + ASSERT(slot < nr_steps); + paddrs[slot] = paddr; + slot++; + } if (repair_bbio->bio.bi_status || - !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { + !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); repair_bbio->bio.bi_iter = repair_bbio->saved_iter; @@ -186,8 +215,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, mirror = prev_repair_mirror(fbio, mirror); btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, - repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - bvec_phys(bv), mirror); + logical, paddrs, step, mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -204,21 +232,25 @@ done: */ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, u32 bio_offset, - phys_addr_t paddr, + phys_addr_t paddrs[], struct btrfs_failed_bio *fbio) { struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct folio *folio = page_folio(phys_to_page(paddr)); const u32 sectorsize = fs_info->sectorsize; - const u32 foff = offset_in_folio(folio, paddr); - const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = sectorsize / step; + /* + * For bs > ps cases, the saved_iter can be partially moved forward. + * In that case we should round it down to the block boundary. + */ + const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + sectorsize); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror; - ASSERT(foff + sectorsize <= folio_size(folio)); btrfs_debug(fs_info, "repair read error: read error at %llu", failed_bbio->file_offset + bio_offset); @@ -238,15 +270,22 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, atomic_inc(&fbio->repair_count); - repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); - repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); + repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT; + for (int i = 0; i < nr_steps; i++) { + int ret; + + ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE); + + ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + ASSERT(ret == step); + } repair_bbio = btrfs_bio(repair_bio); - btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); - repair_bbio->inode = failed_bbio->inode; - repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset, + NULL, fbio); mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); @@ -258,10 +297,13 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + const u32 nr_steps = sectorsize / step; struct bvec_iter *iter = &bbio->saved_iter; blk_status_t status = 
bbio->bio.bi_status; struct btrfs_failed_bio *fbio = NULL; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; phys_addr_t paddr; u32 offset = 0; @@ -280,13 +322,19 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de /* Clear the I/O error. A failed repair will reset it. */ bbio->bio.bi_status = BLK_STS_OK; - btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { - if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) - fbio = repair_one_sector(bbio, offset, paddr, fbio); - offset += sectorsize; + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) { + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + if (IS_ALIGNED(offset, sectorsize)) { + if (status || + !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs)) + fbio = repair_one_sector(bbio, offset - sectorsize, + paddrs, fbio); + } } if (bbio->csum != bbio->csum_inline) - kfree(bbio->csum); + kvfree(bbio->csum); if (fbio) btrfs_repair_done(fbio); @@ -317,36 +365,35 @@ static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_i return fs_info->endio_workers; } -static void btrfs_end_bio_work(struct work_struct *work) +static void simple_end_io_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; - /* Metadata reads are checked and repaired by the submitter. */ - if (is_data_bbio(bbio)) - btrfs_check_read_bio(bbio, bbio->bio.bi_private); - else - btrfs_bio_end_io(bbio, bbio->bio.bi_status); + if (bio_op(bio) == REQ_OP_READ) { + /* Metadata reads are checked and repaired by the submitter. */ + if (is_data_bbio(bbio)) + return btrfs_check_read_bio(bbio, bbio->bio.bi_private); + return btrfs_bio_end_io(bbio, bbio->bio.bi_status); + } + if (bio_is_zone_append(bio) && !bio->bi_status) + btrfs_record_physical_zoned(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_device *dev = bio->bi_private; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); if (bio->bi_status) btrfs_log_dev_io_error(bio, dev); - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - if (bio_is_zone_append(bio) && !bio->bi_status) - btrfs_record_physical_zoned(bbio); - btrfs_bio_end_io(bbio, bbio->bio.bi_status); - } + INIT_WORK(&bbio->end_io_work, simple_end_io_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } static void btrfs_raid56_end_io(struct bio *bio) @@ -354,6 +401,9 @@ static void btrfs_raid56_end_io(struct bio *bio) struct btrfs_io_context *bioc = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + /* RAID56 endio is always handled in workqueue. 
*/ + ASSERT(in_task()); + btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) @@ -364,11 +414,12 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_put_bioc(bioc); } -static void btrfs_orig_write_end_io(struct bio *bio) +static void orig_write_end_io_work(struct work_struct *work) { + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; struct btrfs_io_stripe *stripe = bio->bi_private; struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); btrfs_bio_counter_dec(bioc->fs_info); @@ -393,8 +444,18 @@ static void btrfs_orig_write_end_io(struct bio *bio) btrfs_put_bioc(bioc); } -static void btrfs_clone_write_end_io(struct bio *bio) +static void btrfs_orig_write_end_io(struct bio *bio) { + struct btrfs_bio *bbio = btrfs_bio(bio); + + INIT_WORK(&bbio->end_io_work, orig_write_end_io_work); + queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); +} + +static void clone_write_end_io_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; struct btrfs_io_stripe *stripe = bio->bi_private; if (bio->bi_status) { @@ -409,6 +470,14 @@ static void btrfs_clone_write_end_io(struct bio *bio) bio_put(bio); } +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + + INIT_WORK(&bbio->end_io_work, clone_write_end_io_work); + queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); +} + static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) { if (!dev || !dev->bdev || @@ -455,6 +524,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) { struct bio *orig_bio = bioc->orig_bio, *bio; + struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio); ASSERT(bio_op(orig_bio) != REQ_OP_READ); @@ -463,8 +533,11 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio = orig_bio; bio->bi_end_io = btrfs_orig_write_end_io; } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + /* We need to use endio_work to run end_io in task context. 
*/ + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset); bio_inc_remaining(orig_bio); + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, + orig_bbio->file_offset, NULL, NULL); bio->bi_end_io = btrfs_clone_write_end_io; } @@ -509,7 +582,11 @@ static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); - return btrfs_csum_one_bio(bbio); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + return btrfs_csum_one_bio(bbio, true); +#else + return btrfs_csum_one_bio(bbio, false); +#endif } /* @@ -581,20 +658,25 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) static bool should_async_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; bool auto_csum_mode = true; #ifdef CONFIG_BTRFS_EXPERIMENTAL - struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); - if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) - return false; - - auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) + return true; + /* + * Write bios will calculate checksum and submit bio at the same time. + * Unless explicitly required don't offload serial csum calculate and bio + * submit into a workqueue. + */ + return false; #endif /* Submit synchronously if the checksum implementation is fast. */ - if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return false; /* @@ -605,7 +687,7 @@ static bool should_async_write(struct btrfs_bio *bbio) return false; /* Zoned devices require I/O to be submitted in order. */ - if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info)) return false; return true; @@ -620,7 +702,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -639,11 +721,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; unsigned int nr_segs; int sector_offset; - map_length = min(map_length, bbio->fs_info->max_zone_append_size); - sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + map_length = min(map_length, fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits, &nr_segs, map_length); if (sector_offset) { /* @@ -651,7 +734,7 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) * sectorsize and thus cause unaligned I/Os. Fix that by * always rounding down to the nearest boundary. 
*/ - return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize); } return map_length; } @@ -659,7 +742,7 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -670,7 +753,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t status; int ret; - if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; else smap.rst_search_commit_root = false; @@ -684,6 +767,14 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto end_bbio; } + /* + * For fscrypt writes we will get the encrypted bio after we've remapped + * our bio to the physical disk location, so we need to save the + * original bytenr so we know what we're checksumming. + */ + if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) + bbio->orig_logical = logical; + map_length = min(map_length, length); if (use_append) map_length = btrfs_append_map_length(bbio, map_length); @@ -734,7 +825,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. */ - if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && @@ -782,25 +873,27 @@ end_bbio: static void assert_bbio_alignment(struct btrfs_bio *bbio) { #ifdef CONFIG_BTRFS_ASSERT - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio_vec bvec; struct bvec_iter iter; const u32 blocksize = fs_info->sectorsize; + const u32 alignment = min(blocksize, PAGE_SIZE); + const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + const u32 length = bbio->bio.bi_iter.bi_size; - /* Metadata has no extra bs > ps alignment requirement. */ - if (!is_data_bbio(bbio)) - return; + /* The logical and length should still be aligned to blocksize. */ + ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) && + length != 0, "root=%llu inode=%llu logical=%llu length=%u", + btrfs_root_id(bbio->inode->root), + btrfs_ino(bbio->inode), logical, length); bio_for_each_bvec(bvec, &bbio->bio, iter) - ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) && - IS_ALIGNED(bvec.bv_len, blocksize), + ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) && + IS_ALIGNED(bvec.bv_len, alignment), "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", btrfs_root_id(bbio->inode->root), - btrfs_ino(bbio->inode), - bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT, - bbio->bio.bi_iter.bi_size, iter.bi_idx, - bvec.bv_offset, - bvec.bv_len); + btrfs_ino(bbio->inode), logical, length, iter.bi_idx, + bvec.bv_offset, bvec.bv_len); #endif } @@ -824,18 +917,36 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * * The I/O is issued synchronously to block the repair read completion from * freeing the bio. 
+ * + * @ino: Offending inode number + * @fileoff: File offset inside the inode + * @length: Length of the repair write + * @logical: Logical address of the range + * @paddrs: Physical address array of the content + * @step: Length covered by each @paddrs entry + * @mirror_num: Mirror number to write to. Must not be zero */ -int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, phys_addr_t paddr, int mirror_num) +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, + u32 length, u64 logical, const phys_addr_t paddrs[], + unsigned int step, int mirror_num) { + const u32 nr_steps = DIV_ROUND_UP_POW2(length, step); struct btrfs_io_stripe smap = { 0 }; - struct bio_vec bvec; - struct bio bio; + struct bio *bio = NULL; int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + /* Basic alignment checks. */ + ASSERT(IS_ALIGNED(logical, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(length, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize)); + /* Either it's a single data or metadata block. */ + ASSERT(length <= BTRFS_MAX_BLOCKSIZE); + ASSERT(step <= length); + ASSERT(is_power_of_2(step)); + if (btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -855,24 +966,27 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, goto out_counter_dec; } - bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); - ret = submit_bio_wait(&bio); + bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i])); + /* We should have allocated enough slots to contain all the different pages. */ + ASSERT(ret == step); + } + ret = submit_bio_wait(bio); + bio_put(bio); if (ret) { /* try to remap that extent elsewhere? */ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; + goto out_counter_dec; } btrfs_info_rl(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(smap.dev), + ino, fileoff, btrfs_dev_name(smap.dev), smap.physical >> SECTOR_SHIFT); ret = 0; -out_bio_uninit: - bio_uninit(&bio); out_counter_dec: btrfs_bio_counter_dec(fs_info); return ret; @@ -885,16 +999,16 @@ out_counter_dec: */ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bbio->bio.bi_iter.bi_size; struct btrfs_io_stripe smap = { 0 }; int ret; - ASSERT(fs_info); ASSERT(mirror_num > 0); ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); - ASSERT(!bbio->inode); + ASSERT(!is_data_inode(bbio->inode)); + ASSERT(bbio->is_scrub); btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 00883aea55d7..1be74209f0b8 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -18,13 +18,6 @@ struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array.
This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* @@ -34,7 +27,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); struct btrfs_bio { /* * Inode and offset into it that this I/O operates on. - * Only set for data I/O. + * + * If the inode is a data one, csum verification and read-repair + * will be done automatically. + * If the inode is a metadata one, everything is handled by the caller. */ struct btrfs_inode *inode; u64 file_offset; @@ -56,11 +52,16 @@ struct btrfs_bio { * - pointer to the checksums for this bio * - original physical address from the allocator * (for zone append only) + * - original logical address, used for checksumming fscrypt bios */ struct { struct btrfs_ordered_extent *ordered; struct btrfs_ordered_sum *sums; + struct work_struct csum_work; + struct completion csum_done; + struct bvec_iter csum_saved_iter; u64 orig_physical; + u64 orig_logical; }; /* For metadata reads: parentness verification. */ @@ -76,14 +77,21 @@ struct btrfs_bio { atomic_t pending_ios; struct work_struct end_io_work; - /* File system that this I/O operates on. */ - struct btrfs_fs_info *fs_info; - /* Save the first error status of split bio. */ blk_status_t status; /* Use the commit root to look up csums (data read bio only). */ bool csum_search_commit_root; + + /* + * Since scrub will reuse btree inode, we need this flag to distinguish + * scrub bios. + */ + bool is_scrub; + + /* Whether the csum generation for data write is async. */ + bool async_csum; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -99,10 +107,10 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private); struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private); void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); @@ -111,7 +119,8 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); -int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, phys_addr_t paddr, int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, + u32 length, u64 logical, const phys_addr_t paddrs[], + unsigned int step, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5322ef2ae015..08b14449fabe 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -613,8 +613,8 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET)); - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; search_offset = index * div_u64(block_group->length, max_index); @@ -744,8 +744,8 @@ 
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) * root to add free space. So we skip locking and search the commit * root, since its read-only */ - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; key.objectid = last; @@ -1065,7 +1065,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; struct inode *inode; @@ -1305,7 +1305,6 @@ out: btrfs_put_block_group(block_group); if (remove_rsv) btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); - btrfs_free_path(path); return ret; } @@ -1403,8 +1402,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of * leeway to allow us to mark this block group as read only. */ - if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, - BTRFS_RESERVE_NO_FLUSH)) + if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH)) ret = 0; } @@ -1425,7 +1423,7 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false); + btrfs_dump_space_info(cache->space_info, 0, false); } return ret; } @@ -1850,12 +1848,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!btrfs_should_reclaim(fs_info)) return; - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - sb_end_write(fs_info->sb); + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) return; - } /* * Long running balances can keep us blocked here for eternity, so @@ -1863,7 +1859,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); return; } @@ -1947,7 +1942,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) /* * Get out fast, in case we're read-only or unmounting the * filesystem. It is OK to drop block groups from the list even - * for the read-only case. As we did sb_start_write(), + * for the read-only case. As we did take the super write lock, * "mount -o remount,ro" won't happen and read-only filesystem * means it is forced read-only due to a fatal error. So, it * never gets back to read-write to let us reclaim again. @@ -2030,7 +2025,6 @@ end: list_splice_tail(&retry_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -3072,7 +3066,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + ret = btrfs_zoned_activate_one_bg(space_info, true); if (ret < 0) goto out; @@ -3803,7 +3797,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
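 *
 * A minimal usage sketch (hypothetical caller and values, for illustration
 * only):
 *
 *	ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes,
 *				       false, false);
 *	if (ret == -EAGAIN)
 *		retry with another block group, this one went read-only;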
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc, + u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; @@ -3814,30 +3808,38 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - goto out; + goto out_error; } if (btrfs_block_group_should_use_size_class(cache)) { size_class = btrfs_calc_block_group_size_class(num_bytes); ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); if (ret) - goto out; + goto out_error; } + cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; + if (delalloc) + cache->delalloc_bytes += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); + spin_unlock(&cache->lock); + + space_info->bytes_reserved += num_bytes; btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); - if (delalloc) - cache->delalloc_bytes += num_bytes; /* * Compression can use less space than we reserved, so wake tickets if * that happens. */ if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); -out: + btrfs_try_granting_tickets(space_info); + spin_unlock(&space_info->lock); + + return 0; + +out_error: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; @@ -3859,22 +3861,25 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; + bool bg_ro; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (cache->ro) + bg_ro = cache->ro; + cache->reserved -= num_bytes; + if (is_delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + + if (bg_ro) space_info->bytes_readonly += num_bytes; else if (btrfs_is_zoned(cache->fs_info)) space_info->bytes_zone_unusable += num_bytes; - cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; - if (is_delalloc) - cache->delalloc_bytes -= num_bytes; - spin_unlock(&cache->lock); - - btrfs_try_granting_tickets(cache->fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -4192,11 +4197,11 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, should_alloc = should_alloc_chunk(fs_info, space_info, force); if (space_info->full) { /* No more free physical space */ + spin_unlock(&space_info->lock); if (should_alloc) ret = -ENOSPC; else ret = 0; - spin_unlock(&space_info->lock); return ret; } else if (!should_alloc) { spin_unlock(&space_info->lock); @@ -4208,16 +4213,16 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, * recheck if we should continue with our allocation * attempt. 
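 *
 * Note the lock/unlock pair on chunk_mutex below: it exists only to wait
 * for a concurrent allocator to finish, we take the mutex and drop it
 * immediately without doing any chunk work ourselves.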
*/ + spin_unlock(&space_info->lock); wait_for_alloc = true; force = CHUNK_ALLOC_NO_FORCE; - spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { /* Proceed with allocation */ - space_info->chunk_alloc = 1; - wait_for_alloc = false; + space_info->chunk_alloc = true; spin_unlock(&space_info->lock); + wait_for_alloc = false; } cond_resched(); @@ -4264,7 +4269,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) - space_info->full = 1; + space_info->full = true; else goto out; } else { @@ -4274,7 +4279,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; out: - space_info->chunk_alloc = 0; + space_info->chunk_alloc = false; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); @@ -4315,7 +4320,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); - btrfs_dump_space_info(fs_info, info, 0, false); + btrfs_dump_space_info(info, 0, false); } if (left < bytes) { @@ -4340,7 +4345,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * We have a new chunk. We also need to activate it for * zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + ret = btrfs_zoned_activate_one_bg(info, true); if (ret < 0) return; @@ -4460,7 +4465,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) * indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, false); + btrfs_dump_space_info(space_info, 0, false); /* * If there was a failure to cleanup a log tree, very likely due to an @@ -4471,7 +4476,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, false); + btrfs_dump_space_info(space_info, 0, false); } WARN_ON(space_info->reclaim_size > 0); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9172104a5889..5f933455118c 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -345,7 +345,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc, + u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 5ad6de738aee..96cf7a162987 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -218,8 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -259,8 +258,7 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, 
block_rsv->space_info, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -387,7 +385,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; - btrfs_try_granting_tickets(fs_info, sinfo); + btrfs_try_granting_tickets(sinfo); } block_rsv->full = (block_rsv->reserved == block_rsv->size); @@ -530,8 +528,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - blocksize, BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -552,7 +550,7 @@ try_reserve: * one last time to force a reservation if there's enough actual space * on disk to make the reservation. */ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index af373d50a901..73602ee8de3f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -18,20 +18,20 @@ #include <linux/lockdep.h> #include <uapi/linux/btrfs_tree.h> #include <trace/events/btrfs.h> +#include "ctree.h" #include "block-rsv.h" #include "extent_map.h" -#include "extent_io.h" #include "extent-io-tree.h" -#include "ordered-data.h" -#include "delayed-inode.h" -struct extent_state; struct posix_acl; struct iov_iter; struct writeback_control; struct btrfs_root; struct btrfs_fs_info; struct btrfs_trans_handle; +struct btrfs_bio; +struct btrfs_file_extent; +struct btrfs_delayed_node; /* * Since we search a directory based on f_pos (struct dir_context::pos) we have @@ -543,16 +543,14 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #endif } -/* Array of bytes with variable length, hexadecimal format 0x1234 */ -#define CSUM_FMT "0x%*phN" -#define CSUM_FMT_VALUE(size, bytes) size, bytes - -void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, - u8 *dest); +void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, + const phys_addr_t paddr, u8 *dest); +void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, + const phys_addr_t paddrs[], u8 *dest); int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, phys_addr_t paddr); + u32 bio_offset, const phys_addr_t paddrs[]); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..7dda6cc68379 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -67,9 +67,7 @@ static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, GFP_NOFS, &btrfs_compressed_bioset)); - btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); - bbio->inode = inode; - bbio->file_offset = start; + btrfs_bio_init(bbio, inode, start, end_io, NULL); return to_compressed_bio(bbio); } @@ -194,15 +192,13 @@ 
static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_c static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) { - struct list_head remove; + LIST_HEAD(remove); struct list_head *tmp, *next; int freed; if (compr_pool.count == 0) return SHRINK_STOP; - INIT_LIST_HEAD(&remove); - /* For now, just simply drain the whole list. */ spin_lock(&compr_pool.lock); list_splice_init(&compr_pool.list, &remove); @@ -321,22 +317,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) /* the inode may be gone now */ } -static void btrfs_finish_compressed_write_work(struct work_struct *work) -{ - struct compressed_bio *cb = - container_of(work, struct compressed_bio, write_end_work); - - btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, - cb->bbio.bio.bi_status == BLK_STS_OK); - - if (cb->writeback) - end_compressed_writeback(cb); - /* Note, our inode could be gone now */ - - btrfs_free_compressed_folios(cb); - bio_put(&cb->bbio.bio); -} - /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -347,28 +327,33 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); + + if (cb->writeback) + end_compressed_writeback(cb); + /* Note, our inode could be gone now. */ + btrfs_free_compressed_folios(cb); + bio_put(&cb->bbio.bio); } static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { - struct btrfs_fs_info *fs_info = cb->bbio.fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; + unsigned int findex = 0; while (offset < cb->compressed_len) { - struct folio *folio; + struct folio *folio = cb->compressed_folios[findex]; + u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); int ret; - u32 len = min_t(u32, cb->compressed_len - offset, - btrfs_min_folio_size(fs_info)); - folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)]; /* Maximum compressed extent is smaller than bio size limit. */ ret = bio_add_folio(bio, folio, len, 0); ASSERT(ret); offset += len; + findex++; } } @@ -402,7 +387,6 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; - INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; @@ -1100,7 +1084,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) /* * a less complex decompression routine. Our compressed data fits in a * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in + * dest_pgoff tells us the offset into the destination folio where we write the + * decompressed data. 
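+ *
+ * A hypothetical call decompressing one block into the start of a folio
+ * (illustration only):
+ *
+ *	ret = btrfs_decompress(BTRFS_COMPRESS_ZLIB, data_in, folio, 0,
+ *			       srclen, PAGE_SIZE);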
*/ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index eba188a9e3bb..e0228017e861 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -14,14 +14,12 @@ #include <linux/pagemap.h> #include "bio.h" #include "fs.h" -#include "messages.h" +#include "btrfs_inode.h" struct address_space; -struct page; struct inode; struct btrfs_inode; struct btrfs_ordered_extent; -struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -65,11 +63,8 @@ struct compressed_bio { /* Whether this is a write for writeback. */ bool writeback; - union { - /* For reads, this is the bio we are copying the data into */ - struct btrfs_bio *orig_bbio; - struct work_struct write_end_work; - }; + /* For reads, this is the bio we are copying the data into. */ + struct btrfs_bio *orig_bbio; /* Must be last. */ struct btrfs_bio bbio; @@ -77,7 +72,7 @@ struct compressed_bio { static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb) { - return cb->bbio.fs_info; + return cb->bbio.inode->root->fs_info; } /* @range_end must be exclusive. */ @@ -85,8 +80,8 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6 { /* @cur must be inside the folio. */ ASSERT(folio_pos(folio) <= cur); - ASSERT(cur < folio_end(folio)); - return min(range_end, folio_end(folio)) - cur; + ASSERT(cur < folio_next_pos(folio)); + return umin(range_end, folio_next_pos(folio)) - cur; } int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info); @@ -100,7 +95,7 @@ int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inod u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, - unsigned long start_byte, size_t srclen, size_t destlen); + unsigned long dest_pgoff, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 561658aca018..a48b4befbee7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -862,6 +862,75 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, } /* + * Promote a child node to become the new tree root. + * + * @trans: Transaction handle + * @root: Tree root structure to update + * @path: Path holding nodes and locks + * @level: Level of the parent (old root) + * @parent: The parent (old root) with exactly one item + * + * This helper is called during rebalancing when the root node contains only + * a single item (nritems == 1). We can reduce the tree height by promoting + * that child to become the new root and freeing the old root node. The path + * locks and references are updated accordingly. + * + * Return: 0 on success, negative errno on failure. The transaction is aborted + * on critical errors. 
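+ *
+ * Illustration (the old root holds a single pointer, so the tree shrinks
+ * by one level):
+ *
+ *	old root (level N, 1 item)
+ *	        |                     ==>   child (level N - 1) is new root
+ *	child (level N - 1)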
+ */ +static int promote_child_to_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int level, struct extent_buffer *parent) +{ + struct extent_buffer *child; + int ret; + + ASSERT(btrfs_header_nritems(parent) == 1); + + child = btrfs_read_node_slot(parent, 0); + if (IS_ERR(child)) + return PTR_ERR(child); + + btrfs_tree_lock(child); + ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + return ret; + } + + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); + if (unlikely(ret < 0)) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + return ret; + } + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + btrfs_clear_buffer_dirty(trans, parent); + btrfs_tree_unlock(parent); + /* Once for the path. */ + free_extent_buffer(parent); + + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1); + /* Once for the root ptr. */ + free_extent_buffer_stale(parent); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + +/* * node level balancing, used to make sure nodes are in proper order for * item deletion. We balance from the top down, so we have to make sure * that a deletion won't leave an node completely empty later on. @@ -900,55 +969,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * by promoting the node below to a root */ if (!parent) { - struct extent_buffer *child; - if (btrfs_header_nritems(mid) != 1) return 0; - /* promote the child to a root */ - child = btrfs_read_node_slot(mid, 0); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - btrfs_tree_lock(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, - BTRFS_NESTING_COW); - if (ret) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - goto out; - } - - ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - if (unlikely(ret < 0)) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - btrfs_abort_transaction(trans, ret); - goto out; - } - rcu_assign_pointer(root->node, child); - - add_root_to_dirty_list(root); - btrfs_tree_unlock(child); - - path->locks[level] = 0; - path->nodes[level] = NULL; - btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); - - root_sub_used_bytes(root); - ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); - /* once for the root ptr */ - free_extent_buffer_stale(mid); - if (unlikely(ret < 0)) { - btrfs_abort_transaction(trans, ret); - goto out; - } - return 0; + return promote_child_to_root(trans, root, path, level, mid); } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) @@ -1101,11 +1125,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - refcount_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; + /* Left is now owned by path. 
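+	 * Clearing the local pointer ensures the cleanup at 'out' below
+	 * does not unlock and free the buffer a second time.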
*/ + left = NULL; if (mid) { btrfs_tree_unlock(mid); free_extent_buffer(mid); @@ -1125,8 +1150,7 @@ out: free_extent_buffer(right); } if (left) { - if (path->nodes[level] != left) - btrfs_tree_unlock(left); + btrfs_tree_unlock(left); free_extent_buffer(left); } return ret; @@ -1435,8 +1459,8 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } if (i >= lowest_unlock && i > skip_level) { - check_skip = false; btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + check_skip = false; path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && @@ -1709,9 +1733,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when - * p->search_commit_root = 1. + * p->search_commit_root is true. */ - ASSERT(p->skip_locking == 1); + ASSERT(p->skip_locking); goto out; } @@ -2599,12 +2623,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, - "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, slot, btrfs_disk_key_objectid(&disk_key), btrfs_disk_key_type(&disk_key), btrfs_disk_key_offset(&disk_key), - new_key->objectid, new_key->type, - new_key->offset); + BTRFS_KEY_FMT_VALUE(new_key)); BUG(); } } @@ -2613,12 +2636,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, - "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, slot, btrfs_disk_key_objectid(&disk_key), btrfs_disk_key_type(&disk_key), btrfs_disk_key_offset(&disk_key), - new_key->objectid, new_key->type, - new_key->offset); + BTRFS_KEY_FMT_VALUE(new_key)); BUG(); } } @@ -2677,10 +2699,9 @@ static bool check_sibling_keys(const struct extent_buffer *left, btrfs_crit(left->fs_info, "right extent buffer:"); btrfs_print_tree(right, false); btrfs_crit(left->fs_info, -"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", - left_last.objectid, left_last.type, - left_last.offset, right_first.objectid, - right_first.type, right_first.offset); +"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&left_last), + BTRFS_KEY_FMT_VALUE(&right_first)); return true; } return false; @@ -3217,10 +3238,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clear_buffer_dirty(trans, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(left); + free_extent_buffer(left); path->nodes[0] = right; path->slots[1] += 1; } else { @@ -3398,9 +3417,13 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) - WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, - right_nritems); + if (unlikely(push_items > right_nritems)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", + push_items, right_nritems); + goto out; + } if (push_items < right_nritems) { 
push_space = btrfs_item_offset(right, push_items - 1) - @@ -3433,8 +3456,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(right); + free_extent_buffer(right); path->nodes[0] = left; path->slots[1] -= 1; } else { @@ -3861,10 +3884,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - path->keep_locks = 1; - path->search_for_split = 1; + path->keep_locks = true; + path->search_for_split = true; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - path->search_for_split = 0; + path->search_for_split = false; if (ret > 0) ret = -EAGAIN; if (ret < 0) @@ -3891,11 +3914,11 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, if (ret) goto err; - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); return 0; err: - path->keep_locks = 0; + path->keep_locks = false; return ret; } @@ -4109,7 +4132,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(leaf) < data_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { btrfs_print_leaf(leaf); BUG(); } @@ -4139,7 +4162,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, memmove_leaf_data(leaf, data_end - data_size, data_end, old_data - data_end); - data_end = old_data; old_size = btrfs_item_size(leaf, slot); btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); @@ -4498,9 +4520,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* delete the leaf if we've emptied it */ if (nritems == 0) { - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { + if (leaf != root->node) { btrfs_clear_buffer_dirty(trans, leaf); ret = btrfs_del_leaf(trans, root, path, leaf); if (ret < 0) @@ -4566,10 +4586,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; ret = btrfs_del_leaf(trans, root, path, leaf); + free_extent_buffer(leaf); if (ret < 0) return ret; - free_extent_buffer(leaf); - ret = 0; } else { /* if we're still in the path, make sure * we're dirty. Otherwise, one of the @@ -4613,11 +4632,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u32 nritems; int level; int ret = 1; - int keep_locks = path->keep_locks; + const bool keep_locks = path->keep_locks; ASSERT(!path->nowait); ASSERT(path->lowest_level == 0); - path->keep_locks = 1; + path->keep_locks = true; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -4707,7 +4726,7 @@ out: * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree * - * path->keep_locks should be set to 1 on the search made before + * path->keep_locks should be set to true on the search made before * calling this function. 
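 *
 * Typical usage (a sketch, assuming the usual prototype taking a level and
 * a minimum transid after root, path and key):
 *
 *	path->keep_locks = true;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...
 *	ret = btrfs_find_next_key(root, path, &next_key, level, min_trans);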
*/ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -4806,13 +4825,13 @@ again: next = NULL; btrfs_release_path(path); - path->keep_locks = 1; + path->keep_locks = true; if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); } else { if (path->need_commit_sem) { - path->need_commit_sem = 0; + path->need_commit_sem = false; need_commit_sem = true; if (path->nowait) { if (!down_read_trylock(&fs_info->commit_root_sem)) { @@ -4825,41 +4844,30 @@ again: } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } - path->keep_locks = 0; + path->keep_locks = false; if (ret < 0) goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* - * by releasing the path above we dropped all our locks. A balance - * could have added more items next to the key that used to be - * at the very end of the block. So, check again here and - * advance the path if there are now more items available. - */ - if (nritems > 0 && path->slots[0] < nritems - 1) { - if (ret == 0) - path->slots[0]++; - ret = 0; - goto done; - } - /* - * So the above check misses one case: - * - after releasing the path above, someone has removed the item that - * used to be at the very end of the block, and balance between leafs - * gets another one with bigger key.offset to replace it. + * By releasing the path above we dropped all our locks. A balance + * could have happened and * - * This one should be returned as well, or we can get leaf corruption - * later(esp. in __btrfs_drop_extents()). + * 1. added more items after the previous last item + * 2. deleted the previous last item * - * And a bit more explanation about this check, - * with ret > 0, the key isn't found, the path points to the slot - * where it should be inserted, so the path->slots[0] item must be the - * bigger one. + * So, check again here and advance the path if there are now more + * items available. */ - if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { - ret = 0; - goto done; + if (nritems > 0 && path->slots[0] <= nritems - 1) { + if (ret == 0 && path->slots[0] != nritems - 1) { + path->slots[0]++; + goto done; + } else if (ret > 0) { + ret = 0; + goto done; + } } while (level < BTRFS_MAX_LEVEL) { @@ -4964,7 +4972,7 @@ done: if (need_commit_sem) { int ret2; - path->need_commit_sem = 1; + path->need_commit_sem = true; ret2 = finish_need_commit_sem_search(path); up_read(&fs_info->commit_root_sem); if (ret2) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe70b593c7cd..692370fc07b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -17,9 +17,7 @@ #include <linux/refcount.h> #include <uapi/linux/btrfs_tree.h> #include "locking.h" -#include "fs.h" #include "accessors.h" -#include "extent-io-tree.h" struct extent_buffer; struct btrfs_block_rsv; @@ -67,21 +65,21 @@ struct btrfs_path { * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - unsigned int search_for_split:1; + bool search_for_split:1; /* Keep some upper locks as we walk down. */ - unsigned int keep_locks:1; - unsigned int skip_locking:1; - unsigned int search_commit_root:1; - unsigned int need_commit_sem:1; - unsigned int skip_release_on_error:1; + bool keep_locks:1; + bool skip_locking:1; + bool search_commit_root:1; + bool need_commit_sem:1; + bool skip_release_on_error:1; /* * Indicate that new item (btrfs_search_slot) is extending already * existing item and ins_len contains only the data size and not item * header (ie. sizeof(struct btrfs_item) is not included). 
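 *
 * For example, the csum code sets this when extending an existing checksum
 * item in place, so ins_len only needs to cover the extra checksum bytes
 * rather than a whole new item.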
*/ - unsigned int search_for_extension:1; + bool search_for_extension:1; /* Stop search if any locks need to be taken (for read) */ - unsigned int nowait:1; + bool nowait:1; }; #define BTRFS_PATH_AUTO_FREE(path_name) \ diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 7b277934f66f..b81e224d4a27 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -15,6 +15,7 @@ #include "defrag.h" #include "file-item.h" #include "super.h" +#include "compression.h" static struct kmem_cache *btrfs_inode_defrag_cachep; @@ -254,10 +255,9 @@ again: range.extent_thresh = defrag->extent_thresh; file_ra_state_init(ra, inode->vfs_inode.i_mapping); - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); + scoped_guard(super_write, fs_info->sb) + ret = btrfs_defrag_file(inode, ra, &range, + defrag->transid, BTRFS_DEFRAG_BATCH); iput(&inode->vfs_inode); if (ret < 0) @@ -471,7 +471,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, memcpy(&key, &root->defrag_progress, sizeof(key)); } - path->keep_locks = 1; + path->keep_locks = true; ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) @@ -514,7 +514,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* * Now that we reallocated the node we can find the next key. Note that * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, + * without COWing, this is because even with path->keep_locks == true, * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a * node when path->slots[node_level - 1] does not point to the last * item or a slot beyond the last item (ctree.c:unlock_up()).
Therefore @@ -886,7 +886,7 @@ again: } lock_start = folio_pos(folio); - lock_end = folio_end(folio) - 1; + lock_end = folio_next_pos(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; @@ -1178,7 +1178,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, if (!folio) break; - if (start >= folio_end(folio) || start + len <= folio_pos(folio)) + if (start >= folio_next_pos(folio) || + start + len <= folio_pos(folio)) continue; btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); @@ -1219,7 +1220,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, folios[i] = NULL; goto free_folios; } - cur = folio_end(folios[i]); + cur = folio_next_pos(folios[i]); } for (int i = 0; i < nr_pages; i++) { if (!folios[i]) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 288e1776c02d..0970799d0aa4 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, meta_reserve, + flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3df7b9d7fbe8..ce6e9f8812e0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -668,7 +668,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_key first_key; const u32 first_data_size = first_item->data_len; int total_size; - char *ins_data = NULL; + char AUTO_KFREE(ins_data); int ret; bool continuous_keys_only = false; @@ -740,10 +740,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ins_data = kmalloc_array(batch.nr, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); - if (!ins_data) { - ret = -ENOMEM; - goto out; - } + if (!ins_data) + return -ENOMEM; ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); batch.keys = ins_keys; @@ -759,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) - goto out; + return ret; list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; @@ -814,9 +812,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - kfree(ins_data); - return ret; + + return 0; } static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, @@ -2011,13 +2008,10 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) * It is very rare. 
*/ mutex_lock(&delayed_node->mutex); - if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) - goto release_node; - - set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); - delayed_node->count++; - atomic_inc(&fs_info->delayed_root->items); -release_node: + if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + delayed_node->count++; + atomic_inc(&fs_info->delayed_root->items); + } mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 481802efaa14..e8bc37453336 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -228,7 +228,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush); if (ret) return ret; @@ -798,9 +798,13 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, } /* - * helper function to actually insert a head node into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count. + * Helper function to actually insert a head node into the xarray. This does all + * the dirty work in terms of maintaining the correct overall modification + * count. + * + * The caller is responsible for calling kfree() on @qrecord. More specifically, + * if this function reports via @qrecord_inserted_ret that it did not insert + * the record, then it's safe to call kfree() on it. * * Returns an error pointer in case of an error. */ @@ -814,7 +818,14 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); - bool qrecord_inserted = false; + + /* + * If 'qrecord_inserted_ret' is provided, then the first thing we need + * to do is to initialize it to false just in case we have an exit + * before trying to insert the record. + */ + if (qrecord_inserted_ret) + *qrecord_inserted_ret = false; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); @@ -833,6 +844,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { + /* + * Setting 'qrecord' but not 'qrecord_inserted_ret' will likely + * result in a memory leak. + */ + ASSERT(qrecord_inserted_ret != NULL); + int ret; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); - /* Caller responsible for freeing qrecord on error.
*/ if (ret < 0) return ERR_PTR(ret); - kfree(qrecord); - } else { - qrecord_inserted = true; + } else if (qrecord_inserted_ret) { + *qrecord_inserted_ret = true; } } @@ -888,8 +903,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, delayed_refs->num_heads++; delayed_refs->num_heads_ready++; } - if (qrecord_inserted_ret) - *qrecord_inserted_ret = qrecord_inserted; return head_ref; } @@ -1049,6 +1062,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); + + /* + * It's only safe to call kfree() on 'qrecord' if + * add_delayed_ref_head() has _not_ inserted it for + * tracing. Otherwise we need to handle this here. + */ + if (!qrecord_reserved || qrecord_inserted) + goto free_head_ref; goto free_record; } head_ref = new_head_ref; @@ -1071,6 +1092,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); + + kfree(record); return 0; free_record: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a4eaef60549e..b6c7da8e1bc8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -489,8 +489,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, } path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = src_dev->devid; key.type = BTRFS_DEV_EXTENT_KEY; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 69863e398e22..085a83ae9e62 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "accessors.h" #include "dir-item.h" +#include "delayed-inode.h" /* * insert a name into a directory, doing overflow properly if there is a hash @@ -111,7 +112,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, int ret = 0; int ret2 = 0; struct btrfs_root *root = dir->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; unsigned long name_ptr; @@ -163,7 +164,6 @@ second_insert: ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: - btrfs_free_path(path); if (ret) return ret; if (ret2) diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 802d4dbe5b38..07e19e88ba4b 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -10,6 +10,8 @@ #include "fs.h" #include "transaction.h" #include "volumes.h" +#include "bio.h" +#include "ordered-data.h" struct btrfs_dio_data { ssize_t submitted; @@ -184,7 +186,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); + 0, alloc_hint, &ins, true, true); if (ret == -EAGAIN) { ASSERT(btrfs_is_zoned(fs_info)); wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, @@ -385,7 +387,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to allocate a contiguous array for the checksums. 
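 *
 * With BIO_MAX_VECS == 256 and a 4 KiB sectorsize this caps a single
 * direct I/O read mapping at 1 MiB (example values).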
*/ if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); + len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS); lockstart = start; lockend = start + len - 1; @@ -713,10 +715,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset, btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0aa7e5d1b05f..89149fac804c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "delayed-inode.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -182,26 +183,33 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 step = min(fs_info->nodesize, PAGE_SIZE); + const u32 nr_steps = eb->len / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_extent_folios(eb); i++) { + for (int i = 0; i < num_extent_pages(eb); i++) { struct folio *folio = eb->folios[i]; - u64 start = max_t(u64, eb->start, folio_pos(folio)); - u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + eb->folio_size); - u32 len = end - start; - phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + - offset_in_folio(folio, start); - - ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, - paddr, mirror_num); - if (ret) - break; + + /* No large folio support yet. */ + ASSERT(folio_order(folio) == 0); + ASSERT(i < nr_steps); + + /* + * For nodesize < page size, there is just one paddr, with some + * offset inside the page. + * + * For nodesize >= page size, it's one or more paddrs, and eb->start + * must be aligned to page boundary. + */ + paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); } + ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, + paddrs, step, mirror_num); return ret; } @@ -398,10 +406,10 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", +"checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s", eb->start, eb->read_mirror, - CSUM_FMT_VALUE(csum_size, header_csum), - CSUM_FMT_VALUE(csum_size, result), + BTRFS_CSUM_FMT_VALUE(csum_size, header_csum), + BTRFS_CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? 
", ignored" : ""); if (unlikely(!ignore_csum)) { @@ -644,20 +652,10 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, if (!root) return NULL; - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; root->root_key.objectid = objectid; - root->node = NULL; - root->commit_root = NULL; - root->state = 0; RB_CLEAR_NODE(&root->rb_node); - btrfs_set_root_last_trans(root, 0); - root->free_objectid = 0; - root->nr_delalloc_inodes = 0; - root->nr_ordered_extents = 0; xa_init(&root->inodes); xa_init(&root->delayed_nodes); @@ -691,10 +689,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - btrfs_set_root_last_log_commit(root, 0); - root->anon_dev = 0; if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); @@ -1773,8 +1768,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) destroy_workqueue(fs_info->endio_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); - if (fs_info->compressed_write_workers) - destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -1986,8 +1979,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); - fs_info->compressed_write_workers = - alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2003,7 +1994,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -3255,12 +3245,6 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } - if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { - btrfs_err(fs_info, - "RAID56 is not supported for page size %lu with sectorsize %u", - PAGE_SIZE, fs_info->sectorsize); - return -EINVAL; - } /* This can be called by remount, we need to protect the super block. */ spin_lock(&fs_info->super_lock); @@ -4290,7 +4274,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* * When finishing a compressed write bio we schedule a work queue item - * to finish an ordered extent - btrfs_finish_compressed_write_work() + * to finish an ordered extent - end_bbio_compressed_write() * calls btrfs_finish_ordered_extent() which in turns does a call to * btrfs_queue_ordered_fn(), and that queues the ordered extent * completion either in the endio_write_workers work queue or in the @@ -4298,7 +4282,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * below, so before we flush them we must flush this queue for the * workers of compressed writes. 
*/ - flush_workqueue(fs_info->compressed_write_workers); + flush_workqueue(fs_info->endio_workers); /* * After we parked the cleaner kthread, ordered extents may have diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 57920f2c6fe4..5320da83d0cf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -9,7 +9,8 @@ #include <linux/sizes.h> #include <linux/compiler_types.h> #include "ctree.h" -#include "fs.h" +#include "bio.h" +#include "ordered-data.h" struct block_device; struct super_block; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc4ca98c3780..e4cae34620d1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -40,6 +40,7 @@ #include "orphan.h" #include "tree-checker.h" #include "raid-stripe-tree.h" +#include "delayed-inode.h" #undef SCRAMBLE_DELAYED_REFS @@ -164,8 +165,8 @@ search_again: if (unlikely(num_refs == 0)) { ret = -EUCLEAN; btrfs_err(fs_info, - "unexpected zero reference count for extent item (%llu %u %llu)", - key.objectid, key.type, key.offset); + "unexpected zero reference count for extent item " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); btrfs_abort_transaction(trans, ret); return ret; } @@ -597,8 +598,8 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { btrfs_err(trans->fs_info, - "unrecognized backref key (%llu %u %llu)", - key.objectid, key.type, key.offset); + "unrecognized backref key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); btrfs_abort_transaction(trans, -EUCLEAN); return -EUCLEAN; } @@ -788,7 +789,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); - path->search_for_extension = 1; + path->search_for_extension = true; } else extra_size = -1; @@ -954,7 +955,7 @@ again: if (!path->keep_locks) { btrfs_release_path(path); - path->keep_locks = 1; + path->keep_locks = true; goto again; } @@ -975,11 +976,11 @@ out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: if (path->keep_locks) { - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); } if (insert) - path->search_for_extension = 0; + path->search_for_extension = false; return ret; } @@ -1764,7 +1765,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans)) { if (insert_reserved) { - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); free_head_ref_squota_rsv(trans->fs_info, href); } return 0; @@ -1783,7 +1784,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); if (ret < 0) btrfs_err(trans->fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", @@ -1890,7 +1891,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes); if (head->is_data) { struct btrfs_root *csum_root; @@ -2591,34 +2592,34 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info) } static int pin_down_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group *cache, - u64 bytenr, u64 num_bytes, int reserved) + struct 
btrfs_block_group *bg, + u64 bytenr, u64 num_bytes, bool reserved) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); - if (reserved) { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + struct btrfs_space_info *space_info = bg->space_info; + const u64 reserved_bytes = (reserved ? num_bytes : 0); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + bg->pinned += num_bytes; + bg->reserved -= reserved_bytes; + spin_unlock(&bg->lock); + space_info->bytes_reserved -= reserved_bytes; + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); + spin_unlock(&space_info->lock); btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } -int btrfs_pin_extent(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int reserved) +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(trans, cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, true); btrfs_put_block_group(cache); return 0; @@ -2642,7 +2643,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, eb->start, eb->len, 0); + pin_down_extent(trans, cache, eb->start, eb->len, false); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, eb->start, eb->len); @@ -2747,13 +2748,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, struct btrfs_free_cluster *cluster = NULL; u64 total_unpinned = 0; u64 empty_cluster = 0; - bool readonly; - int ret = 0; while (start <= end) { u64 len; + bool readonly; - readonly = false; if (!cache || start >= cache->start + cache->length) { if (cache) @@ -2762,8 +2761,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache = btrfs_lookup_block_group(fs_info, start); if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. 
*/ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } cluster = fetch_cluster_info(fs_info, @@ -2797,27 +2795,28 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); + readonly = cache->ro; cache->pinned -= len; + spin_unlock(&cache->lock); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; - if (cache->ro) { + + if (readonly) { space_info->bytes_readonly += len; - readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ btrfs_space_info_update_bytes_zone_unusable(space_info, len); - readonly = true; - } - spin_unlock(&cache->lock); - if (!readonly && return_free_space) + } else if (return_free_space) { btrfs_return_free_space(space_info, len); + } spin_unlock(&space_info->lock); } if (cache) btrfs_put_block_group(cache); -out: - return ret; + + return 0; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) @@ -3086,7 +3085,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -3121,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, node->bytenr, refs_to_drop); ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (is_data) @@ -3166,15 +3165,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3223,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } extent_slot = path->slots[0]; } @@ -3232,10 +3230,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); - goto out; + return ret; } else { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -3246,7 +3244,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); @@ -3260,8 +3258,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.objectid, key.type, key.offset, path->slots[0], owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3272,8 +3269,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } refs 
-= refs_to_drop; @@ -3289,8 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3300,7 +3295,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } } else { @@ -3320,17 +3315,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), refs_to_drop, path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } if (iref) { if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, -"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", - key.objectid, key.type, - key.offset, path->slots[0]); - ret = -EUCLEAN; - goto out; +"invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref", + BTRFS_KEY_FMT_VALUE(&key), + path->slots[0]); + return -EUCLEAN; } } else { /* @@ -3343,8 +3336,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3365,7 +3357,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3373,8 +3365,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); -out: - btrfs_free_path(path); return ret; } @@ -3483,7 +3473,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, bg = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, bg, buf->start, buf->len, 1); + pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); goto out; } @@ -3507,7 +3497,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) || btrfs_is_zoned(fs_info)) { - pin_down_extent(trans, bg, buf->start, buf->len, 1); + pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); goto out; } @@ -3537,7 +3527,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. 
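+ * Pinned extents are only returned to free space once the transaction + * commits (see btrfs_finish_extent_commit()), so the range cannot be + * reallocated while the log tree may still reference it.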
*/ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { - btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3588,15 +3578,14 @@ enum btrfs_loop_type { }; static inline void -btrfs_lock_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_lock_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) down_read(&cache->data_rwsem); } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, - int delalloc) + bool delalloc) { btrfs_get_block_group(cache); if (delalloc) @@ -3606,7 +3595,7 @@ static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, - int delalloc) + bool delalloc) __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -3643,8 +3632,7 @@ static struct btrfs_block_group *btrfs_lock_cluster( } static inline void -btrfs_release_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_release_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) up_read(&cache->data_rwsem); @@ -4034,7 +4022,7 @@ static int do_allocation(struct btrfs_block_group *block_group, static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, - int delalloc) + bool delalloc) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: @@ -4690,7 +4678,7 @@ loop: int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc) + struct btrfs_key *ins, bool is_data, bool delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct find_free_extent_ctl ffe_ctl = {}; @@ -4735,8 +4723,7 @@ again: "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) - btrfs_dump_space_info(fs_info, sinfo, - num_bytes, 1); + btrfs_dump_space_info(sinfo, num_bytes, 1); } } @@ -4776,7 +4763,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, return -ENOSPC; } - ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, true); btrfs_put_block_group(cache); return ret; } @@ -5022,7 +5009,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1, root_objectid); if (ret) - btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset); ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; @@ -5168,7 +5155,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, - empty_size, hint, &ins, 0, 0); + empty_size, hint, &ins, false, false); if (ret) goto out_unuse; @@ -6061,7 +6048,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); struct btrfs_key key; const u64 
rootid = btrfs_root_id(root); int ret = 0; @@ -6079,9 +6066,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { - btrfs_free_path(path); ret = -ENOMEM; - goto out; + goto out_free; } /* @@ -6291,7 +6277,6 @@ out_end_trans: btrfs_end_transaction_throttle(trans); out_free: - kfree(wc); btrfs_free_path(path); out: if (!ret && root_dropped) { @@ -6334,7 +6319,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; BTRFS_PATH_AUTO_FREE(path); - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); int level; int parent_level; int ret = 0; @@ -6373,18 +6358,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) - break; + return ret; ret = walk_up_tree(trans, root, path, wc, parent_level); if (ret) { - if (ret > 0) - ret = 0; + if (ret < 0) + return ret; break; } } - kfree(wc); - return ret; + return 0; } /* diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index e970ac42a871..71bb8109c969 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -30,7 +30,6 @@ struct find_free_extent_ctl { u64 min_alloc_size; u64 empty_size; u64 flags; - int delalloc; /* Where to start the search inside the bg */ u64 search_start; @@ -40,6 +39,7 @@ struct find_free_extent_ctl { struct btrfs_free_cluster *last_ptr; bool use_cluster; + bool delalloc; bool have_caching_bg; bool orig_have_caching_bg; @@ -49,6 +49,16 @@ struct find_free_extent_ctl { /* Allocation is called for data relocation */ bool for_data_reloc; + /* + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress; this is so that we retry only + * once before moving on to another block group. + */ + bool retry_uncached; + + /* Whether or not the allocator is currently following a hint. */ + bool hinted; + /* RAID index, converted from flags */ int index; @@ -57,13 +67,6 @@ struct find_free_extent_ctl { */ int loop; - /* - * Set to true if we're retrying the allocation on this block group - * after waiting for caching progress, this is so that we retry only - * once before moving on to another block group. 
- */ - bool retry_uncached; - /* If current block group is cached */ int cached; @@ -82,9 +85,6 @@ struct find_free_extent_ctl { /* Allocation policy */ enum btrfs_extent_allocation_policy policy; - /* Whether or not the allocator is currently following a hint */ - bool hinted; - /* Size class of block groups to prefer in early loops */ enum btrfs_block_group_size_class size_class; }; @@ -110,8 +110,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags, u64 *owner_root); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); @@ -138,7 +137,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); + struct btrfs_key *ins, bool is_data, bool delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 23273d0e6f22..629fd5af4286 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -333,7 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, goto out; } range_start = max_t(u64, folio_pos(folio), start); - range_len = min_t(u64, folio_end(folio), end + 1) - range_start; + range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start; btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; @@ -374,8 +374,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - /* The sanity tests may not set a valid fs_info. */ - u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; + u64 max_bytes = fs_info->max_extent_size; u64 delalloc_start; u64 delalloc_end; bool found; @@ -387,7 +386,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, ASSERT(orig_end > orig_start); /* The range should at least cover part of the folio */ - ASSERT(!(orig_start >= folio_end(locked_folio) || + ASSERT(!(orig_start >= folio_next_pos(locked_folio) || orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ @@ -493,7 +492,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_end(folio)); + start + len <= folio_next_pos(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); @@ -518,7 +517,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le */ static void end_bbio_data_write(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; @@ -574,7 +573,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; @@ -739,12 +738,10 @@ static void alloc_new_bio(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_bio *bbio; - bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, - bio_ctrl->end_io_func, NULL); + bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode, + file_offset, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; - bbio->inode = inode; - bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->next_file_offset = file_offset; @@ -1201,7 +1198,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * finished our folio read and unlocked the folio. */ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_end(folio), + u64 range_len = umin(folio_next_pos(folio), ordered->file_offset + ordered->num_bytes) - cur; ret = true; @@ -1223,7 +1220,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * So we return true and update @next_ret to the OE/folio boundary. 
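+ * An uptodate block can be served directly from the page cache, so no + * read from disk is needed for it.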
*/ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_end(folio), + u64 range_len = umin(folio_next_pos(folio), ordered->file_offset + ordered->num_bytes) - cur; /* @@ -1691,14 +1688,17 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, unsigned long range_bitmap = 0; bool submitted_io = false; int found_error = 0; + const u64 end = start + len; const u64 folio_start = folio_pos(folio); + const u64 folio_end = folio_start + folio_size(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; - ASSERT(start >= folio_start && - start + len <= folio_start + folio_size(folio)); + ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start); + ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu", + start, len, folio_start, folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); if (ret == -EAGAIN) { @@ -1714,7 +1714,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return ret; } - for (cur = start; cur < start + len; cur += fs_info->sectorsize) + for (cur = start; cur < end; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, blocks_per_folio); @@ -1725,8 +1725,24 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + folio_end - cur); + /* + * We have just run delalloc before getting here, so + * there must be an ordered extent. + */ + ASSERT(ordered != NULL); + spin_lock(&inode->ordered_tree_lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock(&inode->ordered_tree_lock); + btrfs_put_ordered_extent(ordered); + btrfs_mark_ordered_io_finished(inode, folio, cur, - start + len - cur, true); + end - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1735,8 +1751,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. 
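+ * The ordered extent for this range was marked truncated above, so + * only the part below the truncation point gets finished normally.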
*/ - btrfs_folio_clear_dirty(fs_info, folio, cur, - start + len - cur); + btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur); break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); @@ -1856,7 +1871,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; - if (ret < 0) + if (unlikely(ret < 0)) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), @@ -2206,16 +2221,15 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, end_bbio_meta_write, eb); + BTRFS_I(fs_info->btree_inode), eb->start, + end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - bbio->file_offset = eb->start; for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_end(folio), + u32 range_len = min_t(u64, folio_next_pos(folio), eb->start + eb->len) - range_start; folio_lock(folio); @@ -2468,10 +2482,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); @@ -2627,7 +2638,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f continue; } - cur_end = min_t(u64, folio_end(folio) - 1, end); + cur_end = min_t(u64, folio_next_pos(folio) - 1, end); cur_len = cur_end + 1 - cur; ASSERT(folio_test_locked(folio)); @@ -3826,6 +3837,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_bio *bbio; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) @@ -3859,16 +3871,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, refcount_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, - REQ_OP_READ | REQ_META, eb->fs_info, - end_bbio_meta_read, eb); + REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode), + eb->start, end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_end(folio), + u32 range_len = min_t(u64, folio_next_pos(folio), eb->start + eb->len) - range_start; bio_add_folio_nofail(&bbio->bio, folio, range_len, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5fcbfe44218c..02ebb2f238af 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -12,7 +12,6 @@ #include <linux/rwsem.h> #include <linux/list.h> #include <linux/slab.h> -#include 
"compression.h" #include "messages.h" #include "ulist.h" #include "misc.h" diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d4b81ee4d97b..6f685f3c9327 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -8,8 +8,7 @@ #include <linux/rbtree.h> #include <linux/list.h> #include <linux/refcount.h> -#include "misc.h" -#include "compression.h" +#include "fs.h" struct btrfs_inode; struct btrfs_fs_info; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a42e6d54e7cd..14e5257f0f04 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -18,6 +18,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "volumes.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -372,7 +373,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + bbio->csum = kvcalloc(nblocks, csum_size, GFP_NOFS); if (!bbio->csum) return -ENOMEM; } else { @@ -393,8 +394,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) * between reading the free space cache and updating the csum tree. */ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } /* @@ -422,8 +423,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) * from across transactions. */ if (bbio->csum_search_commit_root) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; down_read(&fs_info->commit_root_sem); } @@ -438,7 +439,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (count < 0) { ret = count; if (bbio->csum != bbio->csum_inline) - kfree(bbio->csum); + kvfree(bbio->csum); bbio->csum = NULL; break; } @@ -764,21 +765,55 @@ fail: return ret; } +static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) +{ + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + struct bio *bio = &bbio->bio; + struct btrfs_ordered_sum *sums = bbio->sums; + struct bvec_iter iter = *src; + phys_addr_t paddr; + const u32 blocksize = fs_info->sectorsize; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + u32 offset = 0; + int index = 0; + + shash->tfm = fs_info->csum_shash; + + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + if (IS_ALIGNED(offset, blocksize)) { + btrfs_calculate_block_csum_pages(fs_info, paddrs, sums->sums + index); + index += fs_info->csum_size; + } + } +} + +static void csum_one_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, csum_work); + + ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); + ASSERT(bbio->async_csum == true); + csum_one_bio(bbio, &bbio->csum_saved_iter); + complete(&bbio->csum_done); +} + /* * Calculate checksums of the data contained inside a bio. 
*/ -int btrfs_csum_one_bio(struct btrfs_bio *bbio) +int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; - struct bvec_iter iter = bio->bi_iter; - phys_addr_t paddr; - const u32 blocksize = fs_info->sectorsize; - int index; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -789,21 +824,21 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) if (!sums) return -ENOMEM; + sums->logical = bbio->orig_logical; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - - sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - index = 0; - - shash->tfm = fs_info->csum_shash; - - btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { - btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); - index += fs_info->csum_size; - } - bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); + + if (!async) { + csum_one_bio(bbio, &bbio->bio.bi_iter); + return 0; + } + init_completion(&bbio->csum_done); + bbio->async_csum = true; + bbio->csum_saved_iter = bbio->bio.bi_iter; + INIT_WORK(&bbio->csum_work, csum_one_bio_work); + schedule_work(&bbio->csum_work); return 0; } @@ -1142,10 +1177,10 @@ again: } btrfs_release_path(path); - path->search_for_extension = 1; + path->search_for_extension = true; ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); - path->search_for_extension = 0; + path->search_for_extension = false; if (ret < 0) goto out; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 63216c43676d..5645c5e3abdb 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -7,7 +7,7 @@ #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> #include "ctree.h" -#include "accessors.h" +#include "ordered-data.h" struct extent_map; struct btrfs_file_extent_item; @@ -64,7 +64,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async); int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa82def46e39..7a501e73d880 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -75,7 +75,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos u64 num_bytes; u64 start_pos; u64 end_of_last_block; - u64 end_pos = pos + write_bytes; + const u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; @@ -86,10 +86,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos extra_bits |= EXTENT_NORESERVE; start_pos = round_down(pos, fs_info->sectorsize); - num_bytes = round_up(write_bytes + pos - start_pos, - fs_info->sectorsize); + num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && folio_next_pos(folio) >= end_pos); end_of_last_block = start_pos + num_bytes - 1; @@ -799,7 +798,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio 
*folio, u64 u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); - u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; @@ -1254,8 +1253,8 @@ again: * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. */ - if (reserved_start + reserved_len > folio_end(folio)) { - const u64 last_block = folio_end(folio); + if (reserved_start + reserved_len > folio_next_pos(folio)) { + const u64 last_block = folio_next_pos(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, @@ -1441,6 +1440,8 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; + if (unlikely(btrfs_is_shutdown(inode->root->fs_info))) + return -EIO; /* * If the fs flips readonly due to some impossible error, although we * have opened a file as writable, we have to stop this write operation @@ -2043,6 +2044,8 @@ static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))) + return -EIO; if (!mapping->a_ops->read_folio) return -ENOEXEC; @@ -3112,6 +3115,9 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + return -EIO; + /* Do not allow fallocate in ZONED mode */ if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; @@ -3803,6 +3809,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + return -EIO; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); @@ -3815,6 +3824,9 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))) + return -EIO; + if (iocb->ki_flags & IOCB_DIRECT) { ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || @@ -3825,10 +3837,20 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return filemap_read(iocb, to, ret); } +static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = btrfs_file_read_iter, - .splice_read = filemap_splice_read, + .splice_read = btrfs_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, .mmap_prepare = btrfs_file_mmap_prepare, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ab873bd67192..f0f72850fab2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -968,8 +968,8 @@ int load_free_space_cache(struct btrfs_block_group *block_group) path = btrfs_alloc_path(); if (!path) return 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; /* * We must pass a path with 
search_commit_root set to btrfs_iget in @@ -3656,7 +3656,7 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; - int update = 0; + bool bg_ro; const u64 end = start + bytes; const u64 reserved_end = reserved_start + reserved_bytes; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; @@ -3664,12 +3664,14 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (!block_group->ro) { + bg_ro = block_group->ro; + if (!bg_ro) { block_group->reserved += reserved_bytes; + spin_unlock(&block_group->lock); space_info->bytes_reserved += reserved_bytes; - update = 1; + } else { + spin_unlock(&block_group->lock); } - spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); @@ -3690,14 +3692,16 @@ static int do_trimming(struct btrfs_block_group *block_group, list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); - if (update) { + if (!bg_ro) { spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; + bg_ro = block_group->ro; block_group->reserved -= reserved_bytes; - space_info->bytes_reserved -= reserved_bytes; spin_unlock(&block_group->lock); + + space_info->bytes_reserved -= reserved_bytes; + if (bg_ro) + space_info->bytes_readonly += reserved_bytes; spin_unlock(&space_info->lock); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d86541073d42..1ad2ad384b9e 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -165,11 +165,9 @@ static unsigned long *alloc_bitmap(u32 bitmap_size) /* * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse - * into the filesystem as the free space bitmap can be modified in the - * critical section of a transaction commit. - * - * TODO: push the memalloc_nofs_{save,restore}() to the caller where we - * know that recursion is unsafe. + * into the filesystem here. All callers hold a transaction handle + * open, so if a GFP_KERNEL allocation recurses into the filesystem + * and triggers a transaction commit, we would deadlock. 
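+ * + * The memalloc_nofs_save() scope below makes the GFP_KERNEL allocation + * behave as if GFP_NOFS was passed, which avoids that recursion.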
*/ nofs_flag = memalloc_nofs_save(); ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); @@ -218,11 +216,8 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (unlikely(!bitmap)) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(!bitmap)) + return 0; start = block_group->start; end = block_group->start + block_group->length; @@ -361,11 +356,8 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (unlikely(!bitmap)) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(!bitmap)) + return 0; start = block_group->start; end = block_group->start + block_group->length; @@ -841,7 +833,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) @@ -851,7 +843,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } block_group = btrfs_lookup_block_group(trans->fs_info, start); @@ -859,7 +851,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } mutex_lock(&block_group->free_space_lock); @@ -869,8 +861,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); -out: - btrfs_free_path(path); + return ret; } @@ -1023,7 +1014,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) @@ -1033,7 +1024,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } block_group = btrfs_lookup_block_group(trans->fs_info, start); @@ -1041,7 +1032,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } mutex_lock(&block_group->free_space_lock); @@ -1051,8 +1042,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); -out: - btrfs_free_path(path); + return ret; } @@ -1466,7 +1456,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key, found_key; struct extent_buffer *leaf; u64 start, end; @@ -1485,7 +1475,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } start = block_group->start; @@ -1499,7 +1489,7 @@ int 
btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -1530,14 +1520,13 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], nr); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); } ret = 0; -out: - btrfs_free_path(path); + return ret; } @@ -1702,8 +1691,8 @@ int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) * Just like caching_thread() doesn't want to deadlock on the extent * tree, we don't want to deadlock on the free space tree. */ - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; info = btrfs_search_free_space_info(NULL, block_group, path, 0); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 814bbc9417d2..0f7e1ef27891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -29,6 +29,7 @@ #include "extent-io-tree.h" #include "async-thread.h" #include "block-rsv.h" +#include "messages.h" struct inode; struct super_block; @@ -73,6 +74,13 @@ struct btrfs_space_info; #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define BTRFS_CSUM_FMT "0x%*phN" +#define BTRFS_CSUM_FMT_VALUE(size, bytes) size, bytes + +#define BTRFS_KEY_FMT "(%llu %u %llu)" +#define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset + /* * Number of metadata items necessary for an unlink operation: * @@ -124,6 +132,12 @@ enum { /* No more delayed iput can be queued. */ BTRFS_FS_STATE_NO_DELAYED_IPUT, + /* + * Emergency shutdown, a step further than an aborted transaction: + * all operations are rejected. + */ + BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, + BTRFS_FS_STATE_COUNT }; @@ -644,7 +658,6 @@ struct btrfs_fs_info { struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; @@ -1120,6 +1133,27 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); +} + +static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) +{ + /* + * Here we do not want to use handle_fs_error(), which will mark the fs + * read-only. + * Some call sites, like the shutdown ioctl, will mark the fs shutdown + * while the fs is frozen, but the thaw path handles RO and RW + * filesystems differently. + * + * So here we only mark the fs error without flipping it RO. + */ + WRITE_ONCE(fs_info->fs_error, -EIO); + if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) + btrfs_crit(fs_info, "emergency shutdown"); +} + /* * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. 
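A minimal sketch of how the new helper above is meant to be used; the pattern mirrors the fs/btrfs/file.c hunks later in this diff, and btrfs_example_read() itself is hypothetical, not part of the patch:

static ssize_t btrfs_example_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct btrfs_fs_info *fs_info =
		inode_to_fs_info(file_inode(iocb->ki_filp));

	/* After an emergency shutdown every new operation fails with -EIO. */
	if (unlikely(btrfs_is_shutdown(fs_info)))
		return -EIO;

	return filemap_read(iocb, to, 0);
}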
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 1bd73b80f9fa..b73e1dd97208 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -312,7 +312,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->skip_release_on_error = 1; + path->skip_release_on_error = true; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -444,7 +444,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_truncate_control *control) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -730,6 +730,5 @@ out: if (!ret && control->last_size > new_size) control->last_size = new_size; - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6282911e536f..c4bee47829ed 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9,6 +9,7 @@ #include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> @@ -71,6 +72,7 @@ #include "backref.h" #include "raid-stripe-tree.h" #include "fiemap.h" +#include "delayed-inode.h" #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) #define COW_FILE_RANGE_NO_INLINE (1UL << 1) @@ -130,7 +132,7 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, struct btrfs_fs_info *fs_info = warn->fs_info; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_root *local_root; struct btrfs_key key; unsigned int nofs_flag; @@ -195,7 +197,6 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, } btrfs_put_root(local_root); - free_ipath(ipath); return 0; err: @@ -203,7 +204,6 @@ err: "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", warn->logical, warn->mirror_num, root, inum, offset, ret); - free_ipath(ipath); return ret; } @@ -235,21 +235,21 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off if (logical == U64_MAX) { btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, -"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); return; } logical += file_off; btrfs_warn_rl(fs_info, -"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %llu off %llu logical %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); @@ -320,19 +320,19 @@ static void __cold 
btrfs_print_data_csum_error(struct btrfs_inode *inode, /* Output without objectid, which is more meaningful */ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %lld off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } else { btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %llu ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } } @@ -411,7 +411,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, continue; } - index = folio_end(folio) >> PAGE_SHIFT; + index = folio_next_index(folio); /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle @@ -593,6 +593,10 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (size < i_size_read(&inode->vfs_inode)) return false; + /* Encrypted file cannot be inlined. */ + if (IS_ENCRYPTED(&inode->vfs_inode)) + return false; + return true; } @@ -864,7 +868,7 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct folio **folios; + struct folio **folios = NULL; unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; @@ -873,6 +877,9 @@ static void compress_file_range(struct btrfs_work *work) int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; + if (unlikely(btrfs_is_shutdown(fs_info))) + goto cleanup_and_bail_uncompressed; + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* @@ -1134,7 +1141,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, - 0, *alloc_hint, &ins, 1, 1); + 0, *alloc_hint, &ins, true, true); if (ret) { /* * We can't reserve contiguous space for the compressed size. 
@@ -1288,6 +1295,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode, unsigned long page_ops; int ret = 0; + if (unlikely(btrfs_is_shutdown(fs_info))) { + ret = -EIO; + goto out_unlock; + } + if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; } @@ -1352,7 +1364,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, - &ins, 1, 1); + &ins, true, true); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned @@ -2006,7 +2018,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_path *path; + struct btrfs_path *path = NULL; u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() @@ -2036,6 +2048,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + if (unlikely(btrfs_is_shutdown(fs_info))) { + ret = -EIO; + goto error; + } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } @@ -2338,7 +2354,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_next_pos(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); @@ -2745,7 +2762,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_end(folio) - 1; + u64 page_end = folio_next_pos(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -3332,36 +3349,67 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, - u8 *dest) +/* + * Calculate the checksum of an fs block at physical memory address @paddr, + * and save the result to @dest. + * + * The folio containing @paddr must be large enough to contain a full fs block. + */ +void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, + const phys_addr_t paddr, u8 *dest) { struct folio *folio = page_folio(phys_to_page(paddr)); const u32 blocksize = fs_info->sectorsize; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; - shash->tfm = fs_info->csum_shash; /* The full block must be inside the folio. */ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); - if (folio_test_partial_kmap(folio)) { - size_t cur = paddr; + for (int i = 0; i < nr_steps; i++) { + u32 pindex = offset_in_folio(folio, paddr + i * step) >> PAGE_SHIFT; - crypto_shash_init(shash); - while (cur < paddr + blocksize) { - void *kaddr; - size_t len = min(paddr + blocksize - cur, - PAGE_SIZE - offset_in_page(cur)); + /* + * For bs <= ps cases, we will only run the loop once, so the offset + * inside the page will only be added to paddrs[0]. 
+		 *
+		 * For bs > ps cases, the block must be page aligned, thus the
+		 * offset inside the page will always be 0.
+		 */
+		paddrs[i] = page_to_phys(folio_page(folio, pindex)) + offset_in_page(paddr);
+	}
+	return btrfs_calculate_block_csum_pages(fs_info, paddrs, dest);
+}
-			kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur));
-			crypto_shash_update(shash, kaddr, len);
-			kunmap_local(kaddr);
-			cur += len;
-		}
-		crypto_shash_final(shash, dest);
-	} else {
-		crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest);
+/*
+ * Calculate the checksum of an fs block backed by multiple noncontiguous pages
+ * at @paddrs[] and save the result to @dest.
+ *
+ * Each @paddrs[] entry must have a full page-sized step of the block available
+ * inside its page.
+ */
+void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
+				      const phys_addr_t paddrs[], u8 *dest)
+{
+	const u32 blocksize = fs_info->sectorsize;
+	const u32 step = min(blocksize, PAGE_SIZE);
+	const u32 nr_steps = blocksize / step;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	for (int i = 0; i < nr_steps; i++) {
+		const phys_addr_t paddr = paddrs[i];
+		void *kaddr;
+
+		ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE);
+		kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
+		crypto_shash_update(shash, kaddr, step);
+		kunmap_local(kaddr);
 	}
+	crypto_shash_final(shash, dest);
 }
+
 /*
  * Verify the checksum for a single sector without any extra action that depends
  * on the type of I/O.
@@ -3371,19 +3419,20 @@ void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr
 int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
			   const u8 * const csum_expected)
 {
-	btrfs_calculate_block_csum(fs_info, paddr, csum);
+	btrfs_calculate_block_csum_folio(fs_info, paddr, csum);
 	if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
 		return -EIO;
 	return 0;
 }
 
 /*
- * Verify the checksum of a single data sector.
+ * Verify the checksum of a single data sector, which can be scattered across
+ * noncontiguous pages.
  *
  * @bbio:	btrfs_io_bio which contains the csum
  * @dev:	device the sector is on
  * @bio_offset:	offset to the beginning of the bio (in bytes)
- * @bv:	bio_vec to check
+ * @paddrs:	physical addresses which back the fs block
  *
  * Check if the checksum on a data block is valid. When a checksum mismatch is
 * detected, report the error and fill the corrupted range with zero.
 *
@@ -3391,12 +3440,13 @@ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8
 * Return %true if the sector is ok or had no checksum to start with, else %false.
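 *
 * Example (hypothetical caller, 16K block size on 4K pages, i.e. four
 * page-sized steps per block; pages[] is an assumed array of backing pages):
 *
 *	phys_addr_t paddrs[4];
 *
 *	for (int i = 0; i < 4; i++)
 *		paddrs[i] = page_to_phys(pages[i]);
 *	csum_ok = btrfs_data_csum_ok(bbio, dev, bio_offset, paddrs);
 *
 * On failure the whole block has already been reported and zeroed.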
*/ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, phys_addr_t paddr) + u32 bio_offset, const phys_addr_t paddrs[]) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 blocksize = fs_info->sectorsize; - struct folio *folio; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; u64 file_offset = bbio->file_offset + bio_offset; u64 end = file_offset + blocksize - 1; u8 *csum_expected; @@ -3416,7 +3466,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum); + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) goto zeroit; return true; @@ -3425,9 +3476,8 @@ zeroit: bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - folio = page_folio(phys_to_page(paddr)); - ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); - folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); + for (int i = 0; i < nr_steps; i++) + memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step); return false; } @@ -3886,7 +3936,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) ASSERT(ret != -ENOMEM); return ret; } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING))); } return 0; @@ -4314,8 +4364,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, dir, index); + btrfs_del_inode_ref_in_log(trans, name, inode, dir); + btrfs_del_dir_entries_in_log(trans, name, dir, index); } /* @@ -4414,7 +4464,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; @@ -4507,7 +4557,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return ret; } @@ -4857,7 +4906,7 @@ again: */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = folio_end(folio); + zero_end = folio_next_pos(folio); folio_zero_range(folio, zero_start - folio_pos(folio), zero_end - zero_start); @@ -5040,7 +5089,7 @@ again: * not reach disk, it still affects our page caches. 
*/ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = min_t(u64, folio_end(folio) - 1, end); + zero_end = min_t(u64, folio_next_pos(folio) - 1, end); } else { zero_start = max_t(u64, block_start, start); zero_end = min_t(u64, block_end, end); @@ -5363,7 +5412,7 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node; - ASSERT(inode->i_state & I_FREEING); + ASSERT(inode_state_read_once(inode) & I_FREEING); truncate_inode_pages_final(&inode->i_data); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); @@ -5632,9 +5681,9 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, -"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location " BTRFS_KEY_FMT ")", __func__, fname.disk_name.name, btrfs_ino(dir), - location->objectid, location->type, location->offset); + BTRFS_KEY_FMT_VALUE(location)); } if (!ret) *type = btrfs_dir_ftype(path->nodes[0], di); @@ -5801,7 +5850,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); @@ -5825,7 +5874,7 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5839,6 +5888,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; unlock_new_inode(&inode->vfs_inode); return inode; } @@ -6291,8 +6342,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) } /* - * This is a copy of file_update_time. We need this so we can return error on - * ENOSPC for updating the inode in the case of file write and mmap writes. + * We need our own ->update_time so that we can return error on ENOSPC for + * updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { @@ -6790,8 +6841,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, } ret = btrfs_create_new_inode(trans, &new_inode_args); - if (!ret) + if (!ret) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; d_instantiate_new(dentry, inode); + } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -7067,8 +7121,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, * point the commit_root has everything we need. 
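As an aside on the IOP_FASTPERM_MAY_EXEC bits set in btrfs_iget() and btrfs_create_common() above: per the note added above btrfs_permission() further down, the flag presumably lets the VFS path walk elide the ->permission() callback for the ubiquitous MAY_EXEC directory-traversal check. A hypothetical sketch of such a fast path (not taken from this series):

	if (mask == MAY_EXEC && S_ISDIR(inode->i_mode) &&
	    (inode->i_opflags & IOP_FASTPERM_MAY_EXEC))
		return generic_permission(idmap, inode, mask);
	return inode->i_op->permission(idmap, inode, mask);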
*/ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); @@ -7481,7 +7535,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, u64 page_start = folio_pos(folio); u64 page_end = page_start + folio_size(folio) - 1; u64 cur; - int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING; /* * We have folio locked so no new ordered extent can be created on this @@ -7578,11 +7632,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all @@ -7644,19 +7698,22 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, + .new_size = inode->vfs_inode.i_size, }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; - u64 mask = fs_info->sectorsize - 1; const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + const u64 lock_start = round_down(inode->vfs_inode.i_size, fs_info->sectorsize); + const u64 i_size_up = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); + + /* Our inode is locked and the i_size can't be changed concurrently. */ + btrfs_assert_inode_locked(inode); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, - inode->vfs_inode.i_size & (~mask), - (u64)-1); + ret = btrfs_wait_ordered_range(inode, lock_start, (u64)-1); if (ret) return ret; } @@ -7720,19 +7777,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->vfs_inode.i_size; - const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); - control.new_size = new_size; btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_map_range(inode, - ALIGN(new_size, fs_info->sectorsize), - (u64)-1, false); + btrfs_drop_extent_map_range(inode, i_size_up, (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); @@ -8710,15 +8762,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int start_delalloc_inodes(struct btrfs_root *root, - struct writeback_control *wbc, bool snapshot, - bool in_reclaim_context) +static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, + bool snapshot, bool in_reclaim_context) { struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; - bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); @@ -8744,10 +8794,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); - if (full_flush) { - work = btrfs_alloc_delalloc_work(&inode->vfs_inode); + if (nr_to_write == NULL) { + work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { - iput(&inode->vfs_inode); + iput(tmp_inode); ret = -ENOMEM; goto out; } @@ -8755,9 +8805,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (ret || wbc->nr_to_write <= 0) + + if (ret || *nr_to_write <= 0) goto out; } cond_resched(); @@ -8783,29 +8835,17 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - - return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = nr, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + long *nr_to_write = nr == LONG_MAX ? NULL : &nr; struct btrfs_root *root; LIST_HEAD(splice); int ret; @@ -8817,13 +8857,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { - /* - * Reset nr_to_write here so we know that we're doing a full - * flush. 
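The rework above replaces the on-stack writeback_control with a bare page counter: a NULL @nr_to_write requests a full flush (one async work item per delalloc inode), while a non-NULL pointer is decremented by what appears to be a new filemap_flush_nr() helper until it reaches zero. A sketch of the two calling modes, inferred from the call sites:

	long nr = 256;

	/* Bounded flush: stop once roughly 256 pages have been written. */
	ret = start_delalloc_inodes(root, &nr, false, false);

	/* Full flush: queue async writeback work for every delalloc inode. */
	ret = start_delalloc_inodes(root, NULL, false, false);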
- */ - if (nr == LONG_MAX) - wbc.nr_to_write = LONG_MAX; - root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); @@ -8832,9 +8865,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + ret = start_delalloc_inodes(root, nr_to_write, false, + in_reclaim_context); btrfs_put_root(root); - if (ret < 0 || wbc.nr_to_write <= 0) + if (ret < 0 || nr <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } @@ -9064,7 +9098,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, */ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, - min_size, 0, *alloc_hint, &ins, 1, 0); + min_size, 0, *alloc_hint, &ins, true, false); if (ret) break; @@ -9170,6 +9204,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } +/* + * NOTE: in case you are adding MAY_EXEC check for directories: + * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to + * elide calls here. + */ static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { @@ -9395,7 +9434,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 disk_bytenr, u64 disk_io_size, struct page **pages, void *uring_ctx) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private *priv, sync_priv; struct completion sync_reads; unsigned long i = 0; @@ -9420,10 +9458,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, priv->status = 0; priv->uring_ctx = uring_ctx; - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0, btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bbio->inode = inode; do { size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); @@ -9432,10 +9469,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0, btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bbio->inode = inode; continue; } @@ -9826,8 +9862,6 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } for (;;) { - struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) goto out_folios; @@ -9877,7 +9911,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, - disk_num_bytes, 0, 0, &ins, 1, 1); + disk_num_bytes, 0, 0, &ins, true, true); if (ret) goto out_delalloc_release; extent_reserved = true; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8cb7d5a462ef..acb484546b1d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -503,7 +503,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item *root_item; + struct btrfs_root_item AUTO_KFREE(root_item); struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -527,20 +527,18 @@ static noinline int 
create_subvol(struct mnt_idmap *idmap, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto out_root_item; + return ret; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. */ - if (btrfs_qgroup_level(objectid)) { - ret = -ENOSPC; - goto out_root_item; - } + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; ret = get_anon_bdev(&anon_dev); if (ret < 0) - goto out_root_item; + return ret; new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { @@ -692,8 +690,7 @@ out_inode: out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); -out_root_item: - kfree(root_item); + return ret; } @@ -904,14 +901,9 @@ static noinline int btrfs_mksubvol(struct dentry *parent, struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); int ret; - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - return ret; - - dentry = lookup_one(idmap, qname, parent); - ret = PTR_ERR(dentry); + dentry = start_creating_killable(idmap, parent, qname); if (IS_ERR(dentry)) - goto out_unlock; + return PTR_ERR(dentry); ret = btrfs_may_create(idmap, dir, dentry); if (ret) @@ -940,9 +932,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent, out_up_read: up_read(&fs_info->subvol_sem); out_dput: - dput(dentry); -out_unlock: - btrfs_inode_unlock(BTRFS_I(dir), 0); + end_creating(dentry); return ret; } @@ -1606,7 +1596,7 @@ static noinline int search_ioctl(struct btrfs_root *root, { struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; int num_found = 0; unsigned long sk_offset = 0; @@ -1626,10 +1616,8 @@ static noinline int search_ioctl(struct btrfs_root *root, } else { /* Look up the root from the arguments. 
*/ root = btrfs_get_fs_root(info, sk->tree_id, true); - if (IS_ERR(root)) { - btrfs_free_path(path); + if (IS_ERR(root)) return PTR_ERR(root); - } } key.objectid = sk->min_objectid; @@ -1663,7 +1651,6 @@ static noinline int search_ioctl(struct btrfs_root *root, sk->nr_items = num_found; btrfs_put_root(root); - btrfs_free_path(path); return ret; } @@ -1746,7 +1733,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, int total_len = 0; struct btrfs_inode_ref *iref; struct extent_buffer *l; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (dirid == BTRFS_FIRST_FREE_OBJECTID) { name[0]='\0'; @@ -1807,7 +1794,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, ret = 0; out: btrfs_put_root(root); - btrfs_free_path(path); return ret; } @@ -1824,8 +1810,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct btrfs_root *root = NULL; - struct btrfs_path *path; - struct btrfs_key key, key2; + BTRFS_PATH_AUTO_FREE(path); + struct btrfs_key key; struct extent_buffer *leaf; char *ptr; int slot; @@ -1845,10 +1831,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } + if (IS_ERR(root)) + return PTR_ERR(root); key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; @@ -1880,24 +1864,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, read_extent_buffer(leaf, ptr, (unsigned long)(iref + 1), len); - /* Check the read+exec permission of this directory */ - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_ITEM_KEY); - if (ret < 0) { - goto out_put; - } else if (ret > 0) { - ret = -ENOENT; - goto out_put; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key2, slot); - if (key2.objectid != dirid) { - ret = -ENOENT; - goto out_put; - } - /* * We don't need the path anymore, so release it and * avoid deadlocks and lockdep warnings in case @@ -1905,18 +1871,17 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * btree and lock the same leaf. */ btrfs_release_path(path); - temp_inode = btrfs_iget(key2.objectid, root); + temp_inode = btrfs_iget(key.offset, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; } + /* Check the read+exec permission of this directory. 
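Both btrfs_mksubvol() above and btrfs_ioctl_snap_destroy() below switch to what appear to be new VFS helpers that bundle the killable I_MUTEX_PARENT lock on the parent, the child lookup, and the matching release. A sketch of the pairing as inferred from these call sites (helper internals assumed):

	dentry = start_creating_killable(idmap, parent, qname);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);	/* e.g. -EINTR if the lock wait was killed */
	/* ... create the child (start_removing_killable() mirrors this) ... */
	end_creating(dentry);	/* drops the dentry and unlocks the parent */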
*/ ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); iput(&temp_inode->vfs_inode); - if (ret) { - ret = -EACCES; + if (ret) goto out_put; - } if (key.offset == upper_limit) break; @@ -1942,12 +1907,10 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; slot = path->slots[0]; @@ -1957,10 +1920,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); - if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { - ret = -EINVAL; - goto out; - } + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) + return -EINVAL; /* Copy subvolume's name */ item_off += sizeof(struct btrfs_root_ref); @@ -1970,8 +1931,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, out_put: btrfs_put_root(root); -out: - btrfs_free_path(path); + return ret; } @@ -2417,18 +2377,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto free_subvol_name; } - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - goto free_subvol_name; - dentry = lookup_one(idmap, &QSTR(subvol_name), parent); + dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name)); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); - goto out_unlock_dir; - } - - if (d_really_is_negative(dentry)) { - ret = -ENOENT; - goto out_dput; + goto out_end_removing; } inode = d_inode(dentry); @@ -2449,7 +2401,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) - goto out_dput; + goto out_end_removing; /* * Do not allow deletion if the parent dir is the same @@ -2460,21 +2412,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EINVAL; if (root == dest) - goto out_dput; + goto out_end_removing; ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (ret) - goto out_dput; + goto out_end_removing; } /* check if subvolume may be deleted by a user */ ret = btrfs_may_delete(idmap, dir, dentry, 1); if (ret) - goto out_dput; + goto out_end_removing; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; - goto out_dput; + goto out_end_removing; } btrfs_inode_lock(BTRFS_I(inode), 0); @@ -2483,10 +2435,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (!ret) d_delete_notify(dir, dentry); -out_dput: - dput(dentry); -out_unlock_dir: - btrfs_inode_unlock(BTRFS_I(dir), 0); +out_end_removing: + end_removing(dentry); free_subvol_name: kfree(subvol_name_ptr); free_parent: @@ -2956,7 +2906,7 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; - struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info AUTO_KFREE(dest_orig); struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; static const u64 types[] = { @@ -3077,9 +3027,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, (arg + sizeof(struct btrfs_ioctl_space_args)); if 
(copy_to_user(user_dest, dest_orig, alloc_size)) - ret = -EFAULT; + return -EFAULT; - kfree(dest_orig); out: if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) ret = -EFAULT; @@ -3298,7 +3247,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_path *path; if (!capable(CAP_DAC_READ_SEARCH)) @@ -3346,7 +3295,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) out: btrfs_free_path(path); - free_ipath(ipath); kfree(ipa); return ret; @@ -3611,7 +3559,7 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_balance_args *bargs; + struct btrfs_ioctl_balance_args AUTO_KFREE(bargs); int ret = 0; if (!capable(CAP_SYS_ADMIN)) @@ -3633,8 +3581,6 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; - - kfree(bargs); out: mutex_unlock(&fs_info->balance_mutex); return ret; @@ -4228,7 +4174,7 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, u64 safe_set, u64 safe_clear) { const char *type = btrfs_feature_set_name(set); - char *names; + const char AUTO_KFREE(names); u64 disallowed, unsupported; u64 set_mask = flags & change_mask; u64 clear_mask = ~flags & change_mask; @@ -4236,12 +4182,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, unsupported = set_mask & ~supported_flags; if (unsupported) { names = btrfs_printable_features(set, unsupported); - if (names) { + if (names) btrfs_warn(fs_info, "this kernel does not support the %s feature bit%s", names, strchr(names, ',') ? "s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "this kernel does not support %s bits 0x%llx", type, unsupported); @@ -4251,12 +4196,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, disallowed = set_mask & ~safe_set; if (disallowed) { names = btrfs_printable_features(set, disallowed); - if (names) { + if (names) btrfs_warn(fs_info, "can't set the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "can't set %s bits 0x%llx while mounted", type, disallowed); @@ -4266,12 +4210,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, disallowed = clear_mask & ~safe_clear; if (disallowed) { names = btrfs_printable_features(set, disallowed); - if (names) { + if (names) btrfs_warn(fs_info, "can't clear the %s feature bit%s while mounted", names, strchr(names, ',') ? 
"s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "can't clear %s bits 0x%llx while mounted", type, disallowed); @@ -4418,10 +4361,6 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4513,7 +4452,6 @@ out_acct: static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4527,11 +4465,6 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4649,8 +4582,9 @@ struct io_btrfs_cmd { struct btrfs_uring_priv *priv; }; -static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); @@ -4695,7 +4629,7 @@ out: btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - io_uring_cmd_done(cmd, ret, issue_flags); + io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) @@ -4813,11 +4747,6 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4945,7 +4874,6 @@ out_acct: static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct file *file = cmd->file; - struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; ssize_t ret; @@ -4960,11 +4888,6 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { @@ -5077,6 +5000,9 @@ out_acct: int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))) + return -EIO; + switch (cmd->cmd_op) { case BTRFS_IOC_ENCODED_READ: #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -5220,6 +5146,43 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a return 0; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg) +{ + int ret = 0; + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (u32 __user *)arg)) + return -EFAULT; + + if (flags >= BTRFS_SHUTDOWN_FLAGS_LAST) + return -EINVAL; + + if (btrfs_is_shutdown(fs_info)) + return 0; + + switch (flags) { + case 
BTRFS_SHUTDOWN_FLAGS_LOGFLUSH: + case BTRFS_SHUTDOWN_FLAGS_DEFAULT: + ret = freeze_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL); + if (ret) + return ret; + btrfs_force_shutdown(fs_info); + ret = thaw_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL); + if (ret) + return ret; + break; + case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH: + btrfs_force_shutdown(fs_info); + break; + } + return ret; +} +#endif + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5375,6 +5338,10 @@ long btrfs_ioctl(struct file *file, unsigned int #endif case BTRFS_IOC_SUBVOL_SYNC_WAIT: return btrfs_ioctl_subvol_sync(fs_info, argp); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_IOC_SHUTDOWN: + return btrfs_ioctl_shutdown(fs_info, arg); +#endif } return -ENOTTY; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index a0cf8effe008..2f853de44473 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -24,6 +24,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S', [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', + [BTRFS_FS_STATE_EMERGENCY_SHUTDOWN] = 'E', }; static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 4416c165644f..d8c0bd17dcda 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -168,7 +168,8 @@ do { \ #endif #else -#define ASSERT(cond, args...) (void)(cond) +/* Compile check the @cond expression but don't generate any code. */ +#define ASSERT(cond, args...) BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_BTRFS_DEBUG diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 60f9b000d644..12c5a9d6564f 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -14,6 +14,13 @@ #include <linux/bio.h> /* + * Convenience macros to define a pointer with the __free(kfree) and + * __free(kvfree) cleanup attributes and initialized to NULL. + */ +#define AUTO_KFREE(name) *name __free(kfree) = NULL +#define AUTO_KVFREE(name) *name __free(kvfree) = NULL + +/* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. */ #define ENUM_BIT(name) \ @@ -209,9 +216,4 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr, return (found_set == start + nbits); } -static inline u64 folio_end(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2829f20d7bb5..5df02c707aee 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -237,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. 
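The new AUTO_KFREE()/AUTO_KVFREE() helpers in misc.h above are declarator sugar over the generic __free() cleanup attribute; the macro supplies both the '*' and the NULL initializer, which is why the converted declarations read slightly unusually. A worked expansion of one of the conversions in ioctl.c:

	struct btrfs_ioctl_balance_args AUTO_KFREE(bargs);

	/* ... expands to ... */

	struct btrfs_ioctl_balance_args *bargs __free(kfree) = NULL;

The BTRFS_PATH_AUTO_FREE() conversions throughout qgroup.c below follow the same scope-based cleanup pattern, with btrfs_free_path() as the cleanup callback.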
*/ refcount_inc(&entry->refs); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = tree_insert(&inode->ordered_tree, entry->file_offset, &entry->rb_node); if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -328,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, { struct btrfs_inode *inode = entry->inode; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) @@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (folio) { ASSERT(folio->mapping); ASSERT(folio_pos(folio) <= file_offset); - ASSERT(file_offset + len <= folio_end(folio)); + ASSERT(file_offset + len <= folio_next_pos(folio)); /* * Ordered flag indicates whether we still have @@ -417,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, bool uptodate) { struct btrfs_inode *inode = ordered->inode; - unsigned long flags; bool ret; trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); ret = can_finish_ordered_extent(ordered, folio, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); /* * If this is a COW write it means we created new extent maps for the @@ -481,18 +480,16 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; u64 cur = file_offset; + const u64 end = file_offset + num_bytes; - trace_btrfs_writepage_end_io_hook(inode, file_offset, - file_offset + num_bytes - 1, - uptodate); + trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); - while (cur < file_offset + num_bytes) { + spin_lock(&inode->ordered_tree_lock); + while (cur < end) { u64 entry_end; - u64 end; - u32 len; + u64 this_end; + u64 len; node = ordered_tree_search(inode, cur); /* No ordered extents at all */ @@ -535,19 +532,18 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, * | * cur */ - end = min(entry->file_offset + entry->num_bytes, - file_offset + num_bytes) - 1; - ASSERT(end + 1 - cur < U32_MAX); - len = end + 1 - cur; + this_end = min(entry_end, end); + len = this_end - cur; + ASSERT(len < U32_MAX); if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); } cur += len; } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); } /* @@ -573,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; bool finished = false; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + 
spin_lock(&inode->ordered_tree_lock); if (cached && *cached) { entry = *cached; goto have_entry; @@ -613,7 +608,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return finished; } @@ -678,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - spin_lock_irq(&btrfs_inode->ordered_tree_lock); + spin_lock(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); @@ -686,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&btrfs_inode->ordered_tree_lock); + spin_unlock(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -971,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -986,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -999,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) { node = ordered_tree_search(inode, file_offset + len); @@ -1026,7 +1020,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1041,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, btrfs_assert_inode_locked(inode); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; @@ -1055,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } /* @@ -1068,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -1077,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1099,7 +1093,7 @@ struct btrfs_ordered_extent 
*btrfs_lookup_first_ordered_range( struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last @@ -1154,7 +1148,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1286,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( /* * Take the root's ordered_extent_lock to avoid a race with * btrfs_wait_ordered_extents() when updating the disk_bytenr and - * disk_num_bytes fields of the ordered extent below. And we disable - * IRQs because the inode's ordered_tree_lock is used in IRQ context - * elsewhere. + * disk_num_bytes fields of the ordered extent below. * * There's no concern about a previous caller of * btrfs_wait_ordered_extents() getting the trimmed ordered extent diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 62b993fae54f..f189bf09ce6a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -131,7 +131,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); - pr_info("\t\ttree block key (%llu %u %llu) level %d\n", + pr_info("\t\ttree block key " BTRFS_KEY_FMT " level %d\n", btrfs_disk_key_objectid(&key), key.type, btrfs_disk_key_offset(&key), btrfs_tree_block_level(eb, info)); @@ -277,9 +277,8 @@ static void print_dir_item(const struct extent_buffer *eb, int i) struct btrfs_key location; btrfs_dir_item_key_to_cpu(eb, di, &location); - pr_info("\t\tlocation key (%llu %u %llu) type %d\n", - location.objectid, location.type, location.offset, - btrfs_dir_ftype(eb, di)); + pr_info("\t\tlocation key " BTRFS_KEY_FMT " type %d\n", + BTRFS_KEY_FMT_VALUE(&location), btrfs_dir_ftype(eb, di)); pr_info("\t\ttransid %llu data_len %u name_len %u\n", btrfs_dir_transid(eb, di), data_len, name_len); di = (struct btrfs_dir_item *)((char *)di + len); @@ -421,7 +420,7 @@ static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) scnprintf(buf, buf_size, "UNTYPED"); else if (key_to_str[key->type]) - scnprintf(buf, buf_size, key_to_str[key->type]); + scnprintf(buf, buf_size, "%s", key_to_str[key->type]); else scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); } @@ -598,10 +597,9 @@ void btrfs_print_tree(const struct extent_buffer *c, bool follow) print_eb_refs_lock(c); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); - pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", - i, key.objectid, key.type, key.offset, - btrfs_node_blockptr(c, i), - btrfs_node_ptr_generation(c, i)); + pr_info("\tkey %d " BTRFS_KEY_FMT " block %llu gen %llu\n", + i, BTRFS_KEY_FMT_VALUE(&key), btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); } if (!follow) return; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 31ad8580322a..9e2b53e90dcb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -660,7 +660,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -672,7 +672,6 @@ static int 
add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - btrfs_free_path(path); return ret; } @@ -681,7 +680,7 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -694,24 +693,19 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; - ret = btrfs_del_item(trans, quota_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, quota_root, path); } static int add_qgroup_item(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root, u64 qgroupid) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_qgroup_info_item *qgroup_info; struct btrfs_qgroup_limit_item *qgroup_limit; struct extent_buffer *leaf; @@ -737,7 +731,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); if (ret && ret != -EEXIST) - goto out; + return ret; leaf = path->nodes[0]; qgroup_info = btrfs_item_ptr(leaf, path->slots[0], @@ -754,7 +748,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); if (ret && ret != -EEXIST) - goto out; + return ret; leaf = path->nodes[0]; qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], @@ -765,17 +759,14 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -787,33 +778,27 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) key.offset = qgroupid; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; ret = btrfs_del_item(trans, quota_root, path); if (ret) - goto out; + return ret; btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; ret = btrfs_del_item(trans, quota_root, path); -out: - btrfs_free_path(path); return ret; } @@ -821,7 +806,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_limit_item *qgroup_limit; @@ -841,7 +826,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -851,8 +836,7 @@ 
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); -out: - btrfs_free_path(path); + return ret; } @@ -861,7 +845,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_info_item *qgroup_info; @@ -884,7 +868,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -894,8 +878,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); -out: - btrfs_free_path(path); + return ret; } @@ -903,7 +886,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_status_item *ptr; @@ -923,7 +906,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -933,8 +916,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); -out: - btrfs_free_path(path); + return ret; } @@ -944,7 +926,7 @@ out: static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf = NULL; int ret; @@ -961,7 +943,7 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; nr = btrfs_header_nritems(leaf); if (!nr) @@ -974,14 +956,12 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_quota_enable(struct btrfs_fs_info *fs_info, @@ -1263,7 +1243,14 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - kfree(prealloc); + + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed the ownership to it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. 
+ */ + ASSERT(prealloc == NULL); + return ret; } @@ -1695,7 +1682,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); - kfree(prealloc); + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed the ownership to it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. + */ + ASSERT(prealloc == NULL); return ret; } @@ -1707,8 +1699,7 @@ out: static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct btrfs_key key; - struct btrfs_path *path; - int ret; + BTRFS_PATH_AUTO_FREE(path); /* * Squota would never be inconsistent, but there can still be case @@ -1741,13 +1732,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup if (!path) return -ENOMEM; - ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); - btrfs_free_path(path); /* * The @ret from btrfs_find_root() exactly matches our definition for * the return value, thus can be returned directly. */ - return ret; + return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) @@ -2296,7 +2285,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, bool trace_leaf) { struct btrfs_key key; - struct btrfs_path *src_path; + BTRFS_PATH_AUTO_FREE(src_path); struct btrfs_fs_info *fs_info = trans->fs_info; u32 nodesize = fs_info->nodesize; int cur_level = root_level; @@ -2308,10 +2297,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, return -EINVAL; src_path = btrfs_alloc_path(); - if (!src_path) { - ret = -ENOMEM; - goto out; - } + if (!src_path) + return -ENOMEM; if (dst_level) btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); @@ -2337,10 +2324,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, parent_slot = src_path->slots[cur_level + 1]; eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); src_path->nodes[cur_level] = eb; @@ -2361,10 +2346,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, &src_key, src_path->slots[cur_level]); } /* Content mismatch, something went wrong */ - if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { - ret = -ENOENT; - goto out; - } + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) + return -ENOENT; cur_level--; } @@ -2375,21 +2358,20 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, nodesize); if (ret < 0) - goto out; + return ret; ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, nodesize); if (ret < 0) - goto out; + return ret; /* Record leaf file extents */ if (dst_level == 0 && trace_leaf) { ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) - goto out; + return ret; ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); } -out: - btrfs_free_path(src_path); + return ret; } @@ -2590,7 +2572,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, int level; u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); ASSERT(root_eb != NULL); @@ -2623,12 +2605,12 @@ int 
btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) - goto out; + return ret; } if (root_level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); - goto out; + return ret; } path = btrfs_alloc_path(); @@ -2664,10 +2646,8 @@ walk_down: child_bytenr = btrfs_node_blockptr(eb, parent_slot); eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); path->nodes[level] = eb; path->slots[level] = 0; @@ -2678,14 +2658,14 @@ walk_down: ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize); if (ret) - goto out; + return ret; } if (level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[level]); if (ret) - goto out; + return ret; /* Nonzero return here means we completed our search */ ret = adjust_slots_upwards(path, root_level); @@ -2699,11 +2679,7 @@ walk_down: level--; } - ret = 0; -out: - btrfs_free_path(path); - - return ret; + return 0; } static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) @@ -3303,7 +3279,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; - struct btrfs_qgroup *prealloc; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false; @@ -3544,7 +3520,14 @@ out: } if (free_inherit) kfree(inherit); - kfree(prealloc); + + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed the ownership to it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. + */ + ASSERT(prealloc == NULL); + return ret; } @@ -3712,10 +3695,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, path, 1, 0); btrfs_debug(fs_info, - "current progress key (%llu %u %llu), search_slot ret %d", - fs_info->qgroup_rescan_progress.objectid, - fs_info->qgroup_rescan_progress.type, - fs_info->qgroup_rescan_progress.offset, ret); + "current progress key " BTRFS_KEY_FMT ", search_slot ret %d", + BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), ret); if (ret) { /* @@ -3817,8 +3798,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) * Rescan should only search for commit root, and any later difference * should be recorded by qgroup */ - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); @@ -4796,7 +4777,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; - struct btrfs_qgroup_swapped_block *block; + struct btrfs_qgroup_swapped_block AUTO_KFREE(block); struct extent_buffer *reloc_eb = NULL; struct rb_node *node; bool swapped = false; @@ -4853,7 +4834,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, block->last_snapshot, block->trace_leaf); free_out: - kfree(block); free_extent_buffer(reloc_eb); out: if (ret < 0) { diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cc6f6095cc9f..2987cb7c686e 100644 --- 
a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -19,7 +19,7 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, u64 newlen, u64 frontpad) { struct btrfs_root *stripe_root = trans->fs_info->stripe_root; - struct btrfs_stripe_extent *extent, *newitem; + struct btrfs_stripe_extent *extent, AUTO_KFREE(newitem); struct extent_buffer *leaf; int slot; size_t item_size; @@ -53,14 +53,10 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, stripe_root, path); if (ret) - goto out; + return ret; btrfs_release_path(path); - ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); - -out: - kfree(newitem); - return ret; + return btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) @@ -299,7 +295,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_key stripe_key; struct btrfs_root *stripe_root = fs_info->stripe_root; const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); - struct btrfs_stripe_extent *stripe_extent; + struct btrfs_stripe_extent AUTO_KFREE(stripe_extent); const size_t item_size = struct_size(stripe_extent, strides, num_stripes); int ret; @@ -336,8 +332,6 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); } - kfree(stripe_extent); - return ret; } @@ -394,8 +388,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, return -ENOMEM; if (stripe->rst_search_commit_root) { - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; } ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0135dceb7baa..f38d8305e46d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -66,10 +66,10 @@ static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, dump_bioc(fs_info, rbio->bioc); btrfs_crit(fs_info, -"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx", rbio->flags, rbio->nr_sectors, rbio->nr_data, rbio->real_stripes, rbio->stripe_nsectors, - rbio->scrubp, rbio->dbitmap); + rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap); } #define ASSERT_RBIO(expr, rbio) \ @@ -134,18 +134,10 @@ struct btrfs_stripe_hash_table { }; /* - * A structure to present a sector inside a page, the length is fixed to - * sectorsize; + * The PFN may still be valid, but our paddrs should always be block size + * aligned, thus such -1 paddr is definitely not a valid one. */ -struct sector_ptr { - /* - * Blocks from the bio list can still be highmem. - * So here we use physical address to present a page and the offset inside it. 
- */ - phys_addr_t paddr; - bool has_paddr; - bool uptodate; -}; +#define INVALID_PADDR (~(phys_addr_t)0) static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); @@ -159,8 +151,8 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); - kfree(rbio->bio_sectors); - kfree(rbio->stripe_sectors); + kfree(rbio->bio_paddrs); + kfree(rbio->stripe_paddrs); kfree(rbio->finish_pointers); } @@ -235,12 +227,22 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } -static void memcpy_sectors(const struct sector_ptr *dst, - const struct sector_ptr *src, u32 blocksize) +static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr) { - memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), - phys_to_page(src->paddr), offset_in_page(src->paddr), - blocksize); + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + + ASSERT(sector_nr < rbio->nr_sectors); + for (int i = 0; i < rbio->sector_nsteps; i++) { + unsigned int index = sector_nr * rbio->sector_nsteps + i; + phys_addr_t dst = rbio->stripe_paddrs[index]; + phys_addr_t src = rbio->bio_paddrs[index]; + + ASSERT(dst != INVALID_PADDR); + ASSERT(src != INVALID_PADDR); + + memcpy_page(phys_to_page(dst), offset_in_page(dst), + phys_to_page(src), offset_in_page(src), step); + } } /* @@ -263,20 +265,19 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].has_paddr) { + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is * read from disk. */ if (i < rbio->nr_data * rbio->stripe_nsectors) - ASSERT(rbio->stripe_sectors[i].uptodate); + ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap)); continue; } - memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], - rbio->bioc->fs_info->sectorsize); - rbio->stripe_sectors[i].uptodate = 1; + memcpy_from_bio_to_stripe(rbio, i); + set_bit(i, rbio->stripe_uptodate_bitmap); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -299,19 +300,48 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } -static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, - unsigned int page_nr) +/* Get the sector number of the first sector covered by @page_nr. */ +static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + u32 sector_nr; + + ASSERT(page_nr < rbio->nr_pages); + + sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; + ASSERT(sector_nr < rbio->nr_sectors); + return sector_nr; +} + +/* + * Get the number of sectors covered by @page_nr. + * + * For bs > ps cases, the result will always be 1. + * For bs <= ps cases, the result will be ps / bs. 
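+ *
+ * A worked example of the round_up() math below, assuming 4K pages
+ * (the block sizes are illustrative only):
+ *
+ *   bs=16K (bs > ps):  round_up(4K, 16K) >> 14 = 16K >> 14 = 1
+ *   bs=4K  (bs <= ps): round_up(4K, 4K)  >> 12 =  4K >> 12 = 1
+ *   bs=2K  (bs <= ps): round_up(4K, 2K)  >> 11 =  4K >> 11 = 2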
+ */ +static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 nr_sectors; + + ASSERT(page_nr < rbio->nr_pages); + + nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits; + ASSERT(nr_sectors > 0); + return nr_sectors; +} + +static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr); int i; ASSERT(page_nr < rbio->nr_pages); + ASSERT(sector_nr + nr_bits < rbio->nr_sectors); - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; - i++) { - if (!rbio->stripe_sectors[i].uptodate) + for (i = sector_nr; i < sector_nr + nr_bits; i++) { + if (!test_bit(i, rbio->stripe_uptodate_bitmap)) return false; } return true; @@ -324,46 +354,44 @@ static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, */ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); u32 offset; int i; - for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps; + i++, offset += step) { int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); if (!rbio->stripe_pages[page_index]) continue; - rbio->stripe_sectors[i].has_paddr = true; - rbio->stripe_sectors[i].paddr = - page_to_phys(rbio->stripe_pages[page_index]) + - offset_in_page(offset); + rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } static void steal_rbio_page(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest, int page_nr) { - const u32 sectorsize = src->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; - int i; + const u32 sector_nr = page_nr_to_sector_nr(src, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(src, page_nr); + + ASSERT(page_nr < src->nr_pages); + ASSERT(sector_nr + nr_bits < src->nr_sectors); if (dest->stripe_pages[page_nr]) __free_page(dest->stripe_pages[page_nr]); dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; src->stripe_pages[page_nr] = NULL; - /* Also update the sector->uptodate bits. */ - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; i++) - dest->stripe_sectors[i].uptodate = true; + /* Also update the stripe_uptodate_bitmap bits. */ + bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits); } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) { - const int sector_nr = (page_nr << PAGE_SHIFT) >> - rbio->bioc->fs_info->sectorsize_bits; + const int sector_nr = page_nr_to_sector_nr(rbio, page_nr); /* * We have ensured PAGE_SIZE is aligned with sectorsize, thus @@ -677,39 +705,62 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +/* Return the sector index for @stripe_nr and @sector_nr. 
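+ *
+ * For example, with stripe_nsectors == 16, stripe_nr 1 / sector_nr 3 maps
+ * to index 1 * 16 + 3 == 19.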
 */
+static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
+				      unsigned int stripe_nr,
+				      unsigned int sector_nr)
 {
+	unsigned int ret;
+
 	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
 	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
 
-	return stripe_nr * rbio->stripe_nsectors + sector_nr;
+	ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
+	ASSERT(ret < rbio->nr_sectors);
+	return ret;
+}
+
+/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */
+static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
+				     unsigned int stripe_nr,
+				     unsigned int sector_nr,
+				     unsigned int step_nr)
+{
+	unsigned int ret;
+
+	ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);
+
+	ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
+	ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
+	return ret;
 }
 
-/* Return a sector from rbio->stripe_sectors, not from the bio list */
-static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
-					     unsigned int stripe_nr,
-					     unsigned int sector_nr)
+static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
+				     unsigned int stripe_nr, unsigned int sector_nr,
+				     unsigned int step_nr)
 {
-	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
-							      sector_nr)];
+	return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
 }
 
-/* Grab a sector inside P stripe */
-static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
-					      unsigned int sector_nr)
+static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
+				      unsigned int sector_nr, unsigned int step_nr)
 {
-	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
+	return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
 }
 
-/* Grab a sector inside Q stripe, return NULL if not RAID6 */
-static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
-					      unsigned int sector_nr)
+static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
+				      unsigned int sector_nr, unsigned int step_nr)
 {
 	if (rbio->nr_data + 1 == rbio->real_stripes)
-		return NULL;
-	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
+		return INVALID_PADDR;
+	return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
+}
+
+/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
+static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
+				       unsigned int stripe_nr, unsigned int sector_nr)
+{
+	return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
 }
 
 /*
@@ -944,7 +995,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
 }
 
 /*
- * Get a sector pointer specified by its @stripe_nr and @sector_nr.
+ * Get the paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
 *
 * @rbio: The raid bio
 * @stripe_nr: Stripe number, valid range [0, real_stripe)
 *
@@ -954,34 +1005,52 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_paddrs[] as fallback.
+ *
+ * Return NULL if bio_list_only is set but the specified sector has no
+ * corresponding bio.
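+ *
+ * Note that for bs > ps the returned pointer points at the first step of
+ * the block; the remaining sector_nsteps - 1 entries follow consecutively
+ * in the array.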
*/ -static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, - bool bio_list_only) +static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - struct sector_ptr *sector; - int index; + phys_addr_t *ret = NULL; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0); - ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, - rbio, stripe_nr); - ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, - rbio, sector_nr); + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); - index = stripe_nr * rbio->stripe_nsectors + sector_nr; - ASSERT(index >= 0 && index < rbio->nr_sectors); - - spin_lock(&rbio->bio_list_lock); - sector = &rbio->bio_sectors[index]; - if (sector->has_paddr || bio_list_only) { - /* Don't return sector without a valid page pointer */ - if (!sector->has_paddr) - sector = NULL; - spin_unlock(&rbio->bio_list_lock); - return sector; + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = &rbio->bio_paddrs[index]; + return ret; + } } - spin_unlock(&rbio->bio_list_lock); + return &rbio->stripe_paddrs[index]; +} - return &rbio->stripe_sectors[index]; +/* + * Similar to sector_paddr_in_rbio(), but with extra consideration for + * bs > ps cases, where we can have multiple steps for a fs block. + */ +static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) +{ + phys_addr_t ret = INVALID_PADDR; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr); + + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); + + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = rbio->bio_paddrs[index]; + return ret; + } + } + return rbio->stripe_paddrs[index]; } /* @@ -997,10 +1066,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; + const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE); + const unsigned int sector_nsteps = fs_info->sectorsize / step; struct btrfs_raid_bio *rbio; - /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ - ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * For bs <= ps cases, ps must be aligned to bs. + * For bs > ps cases, bs must be aligned to ps. + */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) || + IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG. 
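A minimal stand-alone sketch of the sizing math that the next hunk applies inside alloc_rbio() (illustrative only: the 16K block size, 4K page size and three-stripe RAID5 geometry are assumed for the example, not taken from the patch):

#include <stdio.h>

#define BTRFS_STRIPE_LEN	(64 * 1024)

int main(void)
{
	const unsigned long page_size = 4096;
	const unsigned long sectorsize = 16384;	/* a bs > ps example */
	const unsigned long real_stripes = 3;	/* RAID5: two data + P */
	const unsigned long stripe_nsectors = BTRFS_STRIPE_LEN / sectorsize;
	const unsigned long num_sectors = stripe_nsectors * real_stripes;
	const unsigned long step = sectorsize < page_size ? sectorsize : page_size;
	const unsigned long sector_nsteps = sectorsize / step;

	/* Each fs block needs sector_nsteps entries in the paddr arrays. */
	printf("num_sectors=%lu sector_nsteps=%lu paddr_entries=%lu\n",
	       num_sectors, sector_nsteps, num_sectors * sector_nsteps);
	return 0;
}

With those numbers each 64K stripe holds 4 blocks, the full stripe holds 12, and each block takes 4 steps, so bio_paddrs[] and stripe_paddrs[] are sized for 48 entries each.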
@@ -1019,19 +1094,22 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); - rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); - rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); + rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); + rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); - if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers || !rbio->error_bitmap) { + if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs || + !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); } + for (int i = 0; i < num_sectors * sector_nsteps; i++) { + rbio->stripe_paddrs[i] = INVALID_PADDR; + rbio->bio_paddrs[i] = INVALID_PADDR; + } bio_list_init(&rbio->bio_list); init_waitqueue_head(&rbio->io_wait); @@ -1046,6 +1124,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; + rbio->sector_nsteps = sector_nsteps; refcount_set(&rbio->refs, 1); atomic_set(&rbio->stripes_pending, 0); @@ -1090,8 +1169,8 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) * @faila and @failb will also be updated to the first and second stripe * number of the errors. */ -static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, - int *faila, int *failb) +static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) { int stripe_nr; int found_errors = 0; @@ -1123,20 +1202,41 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, return found_errors; } +static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps, + unsigned int step) +{ + int added = 0; + int ret; + + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + if (ret != step) + goto revert; + added += ret; + } + return added; +revert: + /* + * We don't need to revert the bvec, as the bio will be submitted immediately, + * as long as the size is reduced the extra bvec will not be accessed. + */ + bio->bi_iter.bi_size -= added; + return 0; +} + /* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. - * Return <0 for error. + * Return <0 for error, and no byte will be added to @rbio. 
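+ *
+ * The revert path in bio_add_paddrs() is what guarantees this: if only
+ * some of the sector_nsteps steps fit, bi_size is shrunk by the bytes
+ * already added (e.g. 2 of 4 steps of a 16K block added on 4K pages
+ * means bi_size -= 8K) and 0 is returned.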
*/ -static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct sector_ptr *sector, - unsigned int stripe_nr, - unsigned int sector_nr, - enum req_op op) +static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, + phys_addr_t *paddrs, unsigned int stripe_nr, + unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); struct bio *last = bio_list->tail; int ret; struct bio *bio; @@ -1152,7 +1252,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->has_paddr); + ASSERT(paddrs != NULL); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1165,8 +1265,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio->error_bitmap); /* Check if we have reached tolerance early. */ - found_errors = get_rbio_veritical_errors(rbio, sector_nr, - NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; @@ -1183,8 +1283,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, phys_to_page(sector->paddr), - sectorsize, offset_in_page(sector->paddr)); + ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step); if (ret == sectorsize) return 0; } @@ -1197,28 +1296,27 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, - offset_in_page(sector->paddr)); + ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step); + ASSERT(ret == sectorsize); bio_list_add(bio_list, bio); return 0; } static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT); struct bvec_iter iter = bio->bi_iter; phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { - unsigned int index = (offset >> sectorsize_bits); - struct sector_ptr *sector = &rbio->bio_sectors[index]; + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + unsigned int index = (offset >> step_bits); - sector->has_paddr = true; - sector->paddr = paddr; - offset += sectorsize; + rbio->bio_paddrs[index] = paddr; + offset += step; } } @@ -1296,56 +1394,64 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } -static inline void *kmap_local_sector(const struct sector_ptr *sector) +static inline void *kmap_local_paddr(phys_addr_t paddr) { /* The sector pointer must have a page mapped to it. */ - ASSERT(sector->has_paddr); + ASSERT(paddr != INVALID_PADDR); - return kmap_local_page(phys_to_page(sector->paddr)) + - offset_in_page(sector->paddr); + return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); } -/* Generate PQ for one vertical stripe. 
*/ -static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr, + unsigned int step_nr) { void **pointers = rbio->finish_pointers; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct sector_ptr *sector; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_sector(sector); - } + for (stripe = 0; stripe < rbio->nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_sector(sector); + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr)); if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_sector(sector); + pointers[stripe++] = kmap_local_paddr( + rbio_qstripe_paddr(rbio, sector_nr, step_nr)); assert_rbio(rbio); - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); } else { /* raid5 */ - memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + memcpy(pointers[rbio->nr_data], pointers[0], step); + run_xor(pointers + 1, rbio->nr_data - 1, step); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } +/* Generate PQ for one vertical stripe. 
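+ *
+ * This runs generate_pq_vertical_step() once per step (e.g. four 4K
+ * gen_syndrome()/run_xor() passes for a 16K block on 4K pages), then
+ * marks the whole P (and Q) sector uptodate in one go.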
*/ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6); + + for (int i = 0; i < rbio->sector_nsteps; i++) + generate_pq_vertical_step(rbio, sectornr, i); + + set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), + rbio->stripe_uptodate_bitmap); + if (has_qstripe) + set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), + rbio->stripe_uptodate_bitmap); +} + static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1372,7 +1478,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1382,14 +1488,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe, sectornr, REQ_OP_WRITE); if (ret) goto error; @@ -1407,7 +1513,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1432,14 +1538,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) @@ -1487,21 +1593,17 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Up-to-date directly for - * stripe_pages[], thus we need to locate the sector. + * Return the index inside the rbio->stripe_sectors[] array. + * + * Return -1 if not found. 
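+ *
+ * Only the first step of each block is compared; every block starts at a
+ * distinct physical address, so that is enough to identify the sector.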
*/ -static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - phys_addr_t paddr) +static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { - int i; - - for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; - - if (sector->has_paddr && sector->paddr == paddr) - return sector; + for (int i = 0; i < rbio->nr_sectors; i++) { + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr) + return i; } - return NULL; + return -1; } /* @@ -1510,17 +1612,23 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 blocksize = rbio->bioc->fs_info->sectorsize; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + u32 offset = 0; phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - btrfs_bio_for_each_block_all(paddr, bio, blocksize) { - struct sector_ptr *sector = find_stripe_sector(rbio, paddr); + btrfs_bio_for_each_block_all(paddr, bio, step) { + /* Hitting the first step of a sector. */ + if (IS_ALIGNED(offset, sectorsize)) { + int sector_nr = find_stripe_sector_nr(rbio, paddr); - ASSERT(sector); - if (sector) - sector->uptodate = 1; + ASSERT(sector_nr >= 0); + if (sector_nr >= 0) + set_bit(sector_nr, rbio->stripe_uptodate_bitmap); + } + offset += step; } } @@ -1530,10 +1638,9 @@ static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) int i; for (i = 0; i < rbio->nr_sectors; i++) { - if (rbio->stripe_sectors[i].paddr == bvec_paddr) + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; - if (rbio->bio_sectors[i].has_paddr && - rbio->bio_sectors[i].paddr == bvec_paddr) + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1566,7 +1673,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = rbio->sector_nsteps; int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 offset = 0; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ @@ -1577,18 +1688,24 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + btrfs_bio_for_each_block_all(paddr, bio, step) { u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; - int ret; + u8 *expected_csum; + + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + /* Not yet covering the full fs block, continue to the next step. */ + if (!IS_ALIGNED(offset, fs_info->sectorsize)) + continue; /* No csum for this sector, skip to the next sector. 
*/ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_block_csum(fs_info, paddr, - csum_buf, expected_csum); - if (ret < 0) + expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0)) set_bit(total_sector_nr, rbio->error_bitmap); total_sector_nr++; } @@ -1785,10 +1902,9 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; + phys_addr_t *paddrs; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) return 0; @@ -1801,54 +1917,32 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); } csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); - return ret; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) + return -EIO; + return 0; } -/* - * Recover a vertical stripe specified by @sector_nr. - * @*pointers are the pre-allocated pointers by the caller, so we don't - * need to allocate/free the pointers again and again. - */ -static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static void recover_vertical_step(struct btrfs_raid_bio *rbio, + unsigned int sector_nr, + unsigned int step_nr, + int faila, int failb, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; - const u32 sectorsize = fs_info->sectorsize; - int found_errors; - int faila; - int failb; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); int stripe_nr; - int ret = 0; - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. - */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sector_nr, &rbio->dbitmap)) - return 0; - - found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, - &failb); - /* - * No errors in the vertical stripe, skip it. Can happen for recovery - * which only part of a stripe failed csum check. - */ - if (!found_errors) - return 0; - - if (unlikely(found_errors > rbio->bioc->max_errors)) - return -EIO; + ASSERT(step_nr < rbio->sector_nsteps); + ASSERT(sector_nr < rbio->stripe_nsectors); /* * Setup our array of pointers with sectors from each stripe @@ -1857,16 +1951,18 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * pointer order. */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + phys_addr_t paddr; + /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. 
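		 *
		 * For bs > ps this resolves one step at a time, since the
		 * steps of a single fs block may live in discontiguous pages.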
*/ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); } - pointers[stripe_nr] = kmap_local_sector(sector); + pointers[stripe_nr] = kmap_local_paddr(paddr); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -1912,10 +2008,10 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } if (failb == rbio->real_stripes - 2) { - raid6_datap_recov(rbio->real_stripes, sectorsize, + raid6_datap_recov(rbio->real_stripes, step, faila, pointers); } else { - raid6_2data_recov(rbio->real_stripes, sectorsize, + raid6_2data_recov(rbio->real_stripes, step, faila, failb, pointers); } } else { @@ -1925,7 +2021,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ASSERT(failb == -1); pstripe: /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + memcpy(pointers[faila], pointers[rbio->nr_data], step); /* Rearrange the pointer array */ p = pointers[faila]; @@ -1935,40 +2031,66 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - + run_xor(pointers, rbio->nr_data - 1, step); } +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + int found_errors; + int faila; + int failb; + int ret = 0; + /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired in the vertical stripe, thus they are now - * uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. - * - * If possible, also check if the repaired sector matches its data - * checksum. + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. 
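+ * An example would be a rebuild read covering four sectors where only
+ * sector 2 failed its csum check: sectors 0, 1 and 3 take this early
+ * return.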
*/ + if (!found_errors) + return 0; + + if (unlikely(found_errors > rbio->bioc->max_errors)) + return -EIO; + + for (int i = 0; i < rbio->sector_nsteps; i++) + recover_vertical_step(rbio, sector_nr, i, faila, failb, + pointers, unmap_array); if (faila >= 0) { ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) - goto cleanup; + return ret; - sector = rbio_stripe_sector(rbio, faila, sector_nr); - sector->uptodate = 1; + set_bit(rbio_sector_index(rbio, faila, sector_nr), + rbio->stripe_uptodate_bitmap); } if (failb >= 0) { ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) - goto cleanup; + return ret; - sector = rbio_stripe_sector(rbio, failb, sector_nr); - sector->uptodate = 1; + set_bit(rbio_sector_index(rbio, failb, sector_nr), + rbio->stripe_uptodate_bitmap); } - -cleanup: - for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) - kunmap_local(unmap_array[stripe_nr]); return ret; } @@ -2043,7 +2165,7 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t *paddrs; /* * Skip the range which has error. It can be a range which is @@ -2060,8 +2182,8 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) continue; } - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, sectornr, REQ_OP_READ); if (ret < 0) { bio_list_put(&bio_list); @@ -2106,7 +2228,7 @@ static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_n int faila; int failb; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); /* This vertical stripe doesn't have errors. */ if (!found_errors) @@ -2250,13 +2372,13 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, REQ_OP_READ); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; @@ -2310,14 +2432,15 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; + phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; /* * We have a sector which doesn't have page nor uptodate, * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
	 */
-		if (!sector->has_paddr || !sector->uptodate)
+		if (paddr == INVALID_PADDR ||
+		    !test_bit(i, rbio->stripe_uptodate_bitmap))
 			return true;
 	}
 	return false;
 }
@@ -2398,7 +2521,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
 		int found_errors;
 
-		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
+		found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
 		if (unlikely(found_errors > rbio->bioc->max_errors)) {
 			ret = -EIO;
 			break;
@@ -2469,47 +2592,121 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
 	return rbio;
 }
 
+static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
+				   int sector_nr)
+{
+	const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
+	const u32 base = sector_nr * rbio->sector_nsteps;
+
+	for (int i = base; i < base + rbio->sector_nsteps; i++) {
+		const unsigned int page_index = (i * step) >> PAGE_SHIFT;
+		struct page *page;
+
+		if (rbio->stripe_pages[page_index])
+			continue;
+		page = alloc_page(GFP_NOFS);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[page_index] = page;
+	}
+	return 0;
+}
+
 /*
  * We just scrub the parity that we have correct data on the same horizontal,
  * so we needn't allocate all pages for all the stripes.
  */
 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 {
-	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
 	int total_sector_nr;
 
 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
 	     total_sector_nr++) {
-		struct page *page;
 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
-		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
+		int ret;
 
 		if (!test_bit(sectornr, &rbio->dbitmap))
 			continue;
-		if (rbio->stripe_pages[index])
-			continue;
-		page = alloc_page(GFP_NOFS);
-		if (!page)
-			return -ENOMEM;
-		rbio->stripe_pages[index] = page;
+		ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
+		if (ret < 0)
+			return ret;
 	}
 	index_stripe_sectors(rbio);
 	return 0;
 }
 
+/* Return true if the content of the step matches the calculated one. */
+static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
+				   void *pointers[], unsigned int sector_nr,
+				   unsigned int step_nr)
+{
+	const unsigned int nr_data = rbio->nr_data;
+	const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
+	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
+	void *parity;
+	bool ret = false;
+
+	ASSERT(step_nr < rbio->sector_nsteps);
+
+	/* First collect one page from each data stripe. */
+	for (int stripe = 0; stripe < nr_data; stripe++)
+		pointers[stripe] = kmap_local_paddr(
+				sector_paddr_in_rbio(rbio, stripe, sector_nr,
						     step_nr, 0));
+
+	if (has_qstripe) {
+		assert_rbio(rbio);
+		/* RAID6, call the library function to fill in our P/Q. */
+		raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
+	} else {
+		/* RAID5. */
+		memcpy(pointers[nr_data], pointers[0], step);
+		run_xor(pointers + 1, nr_data - 1, step);
+	}
+
+	/* Check scrubbing parity and repair it. */
+	parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
+	if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
+		memcpy(parity, pointers[rbio->scrubp], step);
+	else
+		ret = true;
+	kunmap_local(parity);
+
+	for (int stripe = nr_data - 1; stripe >= 0; stripe--)
+		kunmap_local(pointers[stripe]);
+	return ret;
+}
+
+/*
+ * The @pointers array should have the P/Q parity already mapped.
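+ *
+ * A sector is cleared from dbitmap (and thus skipped at writeback) only
+ * if every one of its sector_nsteps steps matched the computed parity.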
+ */ +static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr) +{ + bool found_error = false; + + for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { + bool match; + + match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); + if (!match) + found_error = true; + } + if (!found_error) + bitmap_clear(&rbio->dbitmap, sector_nr, 1); +} + static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; - int stripe; int sectornr; bool has_qstripe; struct page *page; - struct sector_ptr p_sector = { 0 }; - struct sector_ptr q_sector = { 0 }; + phys_addr_t p_paddr = INVALID_PADDR; + phys_addr_t q_paddr = INVALID_PADDR; struct bio_list bio_list; int is_replace = 0; int ret; @@ -2542,72 +2739,36 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; - p_sector.has_paddr = true; - p_sector.paddr = page_to_phys(page); - p_sector.uptodate = 1; + p_paddr = page_to_phys(page); page = NULL; + pointers[nr_data] = kmap_local_paddr(p_paddr); if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ page = alloc_page(GFP_NOFS); if (!page) { - __free_page(phys_to_page(p_sector.paddr)); - p_sector.has_paddr = false; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; return -ENOMEM; } - q_sector.has_paddr = true; - q_sector.paddr = page_to_phys(page); - q_sector.uptodate = 1; + q_paddr = page_to_phys(page); page = NULL; - pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); + pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_sector(&p_sector); - - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - void *parity; - - /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_sector(sector); - } - if (has_qstripe) { - assert_rbio(rbio); - /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - - /* Check scrubbing parity and repair it */ - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_sector(sector); - if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) - memcpy(parity, pointers[rbio->scrubp], sectorsize); - else - /* Parity is right, needn't writeback */ - bitmap_clear(&rbio->dbitmap, sectornr, 1); - kunmap_local(parity); - - for (stripe = nr_data - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) + verify_one_parity_sector(rbio, pointers, sectornr); kunmap_local(pointers[nr_data]); - __free_page(phys_to_page(p_sector.paddr)); - p_sector.has_paddr = false; - if (q_sector.has_paddr) { - __free_page(phys_to_page(q_sector.paddr)); - q_sector.has_paddr = false; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; + if (q_paddr != INVALID_PADDR) { + 
__free_page(phys_to_page(q_paddr)); + q_paddr = INVALID_PADDR; } /* @@ -2616,10 +2777,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) * everything else. */ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; @@ -2634,11 +2795,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t *paddrs; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->real_stripes, + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; @@ -2686,7 +2846,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) int failb; int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; @@ -2755,7 +2915,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t *paddrs; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) @@ -2763,22 +2923,23 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) /* * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a sector + * read them from the disk. If sector_paddr_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. */ - if (sector->uptodate) + if (test_bit(rbio_sector_index(rbio, stripe, sectornr), + rbio->stripe_uptodate_bitmap)) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); @@ -2819,7 +2980,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; @@ -2857,9 +3018,6 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, unsigned int foffset = 0; int ret; - /* We shouldn't hit RAID56 for bs > ps cases for now. 
*/ - ASSERT(fs_info->sectorsize <= PAGE_SIZE); - /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do @@ -2893,8 +3051,7 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, foffset = 0; } } - for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits; - sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits; - sector_nr++) - rbio->stripe_sectors[sector_nr].uptodate = true; + bitmap_set(rbio->stripe_uptodate_bitmap, + offset_in_full_stripe >> fs_info->sectorsize_bits, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 84c4d1d29c7a..1f463ecf7e41 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -16,7 +16,6 @@ #include "volumes.h" struct page; -struct sector_ptr; struct btrfs_fs_info; enum btrfs_rbio_ops { @@ -25,6 +24,84 @@ enum btrfs_rbio_ops { BTRFS_RBIO_PARITY_SCRUB, }; +/* + * Overview of btrfs_raid_bio. + * + * One btrfs_raid_bio represents a full stripe of RAID56, including both data + * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K). + * + * One btrfs_raid_bio can have one or more bios from higher layer, covering + * part or all of the data stripes. + * + * [PAGES FROM HIGHER LAYER BIOS] + * Higher layer bios are in the btrfs_raid_bio::bio_list. + * + * Pages from the bio_list are represented like the following: + * + * bio_list: |<- Bio 1 ->| |<- Bio 2 ->| ... + * bio_paddrs: [0] [1] [2] [3] [4] [5] ... + * + * If there is a bio covering a sector (one btrfs fs block), the corresponding + * pointer in btrfs_raid_bio::bio_paddrs[] will point to the physical address + * (with the offset inside the page) of the corresponding bio. + * + * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will + * be INVALID_PADDR. + * + * The length of each entry in bio_paddrs[] is a step (aka, min(sectorsize, PAGE_SIZE)). + * + * [PAGES FOR INTERNAL USAGES] + * Pages not covered by any bio or belonging to P/Q stripes are stored in + * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following: + * + * stripe_pages: |<- Page 0 ->|<- Page 1 ->| ... + * stripe_paddrs: [0] [1] [2] [3] [4] ... + * + * stripe_pages[] array stores all the pages covering the full stripe, including + * data and P/Q pages. + * stripe_pages[0] is the first page of the first data stripe. + * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second + * data stripe. + * + * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write + * (the bio covers all data stripes) there is no need to allocate pages for + * data stripes (can grab from bio_paddrs[]). + * + * If the corresponding page of stripe_paddrs[i] is not allocated, the value of + * stripe_paddrs[i] will be INVALID_PADDR. + * + * The length of each entry in stripe_paddrs[] is a step. + * + * [LOCATING A SECTOR] + * To locate a sector for IO, we need the following info: + * + * - stripe_nr + * Starts from 0 (representing the first data stripe), ends at + * @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe). + * + * - sector_nr + * Starts from 0 (representing the first sector of the stripe), ends + * at BTRFS_STRIPE_LEN / sectorsize - 1. + * + * - step_nr + * A step is min(sector_size, PAGE_SIZE). + * + * Starts from 0 (representing the first step of the sector), ends + * at @sector_nsteps - 1. + * + * For most call sites they do not need to bother this parameter. 
+ * It is for bs > ps support and only for vertical stripe related works. + * (e.g. RMW/recover) + * + * - from which array + * Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the + * bio_paddrs[] (aka, from the higher layer bios). + * + * For IO, a physical address is returned, so that we can extract the page and + * the offset inside the page for IO. + * A special value INVALID_PADDR represents when the physical address is invalid, + * normally meaning there is no page allocated for the specified sector. + */ struct btrfs_raid_bio { struct btrfs_io_context *bioc; @@ -82,6 +159,14 @@ struct btrfs_raid_bio { /* How many sectors there are for each stripe */ u8 stripe_nsectors; + /* + * How many steps there are for one sector. + * + * For bs > ps cases, it's sectorsize / PAGE_SIZE. + * For bs <= ps cases, it's always 1. + */ + u8 sector_nsteps; + /* Stripe number that we're scrubbing */ u8 scrubp; @@ -116,13 +201,13 @@ struct btrfs_raid_bio { struct page **stripe_pages; /* Pointers to the sectors in the bio_list, for faster lookup */ - struct sector_ptr *bio_sectors; + phys_addr_t *bio_paddrs; - /* - * For subpage support, we need to map each sector to above - * stripe_pages. - */ - struct sector_ptr *stripe_sectors; + /* Pointers to the sectors in the stripe_pages[]. */ + phys_addr_t *stripe_paddrs; + + /* Each set bit means the corresponding sector in stripe_sectors[] is uptodate. */ + unsigned long *stripe_uptodate_bitmap; /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; @@ -131,10 +216,6 @@ struct btrfs_raid_bio { * The bitmap recording where IO errors happened. * Each bit is corresponding to one sector in either bio_sectors[] or * stripe_sectors[] array. - * - * The reason we don't use another bit in sector_ptr is, we have two - * arrays of sectors, and a lot of IO can use sectors in both arrays. - * Thus making it much harder to iterate. */ unsigned long *error_bitmap; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5465a5eae9b2..b5fe95baf92e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/blkdev.h> +#include <linux/fscrypt.h> #include <linux/iversion.h> #include "ctree.h" #include "fs.h" @@ -343,7 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; - char *buf = NULL; + char AUTO_KVFREE(buf); struct btrfs_key key; u32 nritems; int slot; @@ -358,10 +359,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; path = btrfs_alloc_path(); - if (!path) { - kvfree(buf); + if (!path) return ret; - } path->reada = READA_FORWARD; /* Clone data */ @@ -611,7 +610,6 @@ process_slot: } out: - kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); return ret; @@ -792,6 +790,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } + /* Can only reflink encrypted files if both files are encrypted. 
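+	 * Otherwise a clone could splice plaintext extents into an
+	 * encrypted file (or vice versa), so mixed cases are rejected
+	 * with -EINVAL below.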
*/ + if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode)) + return -EINVAL; + /* Don't make the dst file partly checksummed */ if ((inode_in->flags & BTRFS_INODE_NODATASUM) != (inode_out->flags & BTRFS_INODE_NODATASUM)) { @@ -868,6 +870,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, bool same_inode = dst_inode == src_inode; int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))) + return -EIO; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0765e06d00b8..5bfefc3e9c06 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -511,7 +511,7 @@ static void __del_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; - struct mapping_node *node = NULL; + struct mapping_node AUTO_KFREE(node); struct reloc_control *rc = fs_info->reloc_ctl; bool put_ref = false; @@ -544,7 +544,6 @@ static void __del_reloc_root(struct btrfs_root *root) spin_unlock(&fs_info->trans_lock); if (put_ref) btrfs_put_root(root); - kfree(node); } /* @@ -586,10 +585,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct extent_buffer *eb; - struct btrfs_root_item *root_item; + struct btrfs_root_item AUTO_KFREE(root_item); struct btrfs_key root_key; int ret = 0; - bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); if (!root_item) @@ -615,17 +613,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress); btrfs_err(fs_info, - "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)", - objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset); - ret = -EUCLEAN; - goto fail; + "cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT, + objectid, BTRFS_KEY_FMT_VALUE(&cpu_key)); + return ERR_PTR(-EUCLEAN); } /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) - goto fail; + return ERR_PTR(ret); /* * Set the last_snapshot field to the generation of the commit @@ -648,14 +645,13 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) - goto fail; + return ERR_PTR(ret); } /* * We have changed references at this point, we must abort the - * transaction if anything fails. + * transaction if anything fails (i.e. 'goto abort'). 
*/ - must_abort = true; memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); @@ -675,9 +671,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); if (ret) - goto fail; - - kfree(root_item); + goto abort; reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); if (IS_ERR(reloc_root)) { @@ -687,11 +681,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); btrfs_set_root_last_trans(reloc_root, trans->transid); return reloc_root; -fail: - kfree(root_item); + abort: - if (must_abort) - btrfs_abort_transaction(trans, ret); + btrfs_abort_transaction(trans, ret); return ERR_PTR(ret); } @@ -2947,7 +2939,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; u64 cur_file_offset = cluster->start - offset; - struct file_ra_state *ra; + struct file_ra_state AUTO_KFREE(ra); int cluster_nr = 0; int ret = 0; @@ -2960,13 +2952,13 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) ret = prealloc_file_extent_cluster(rc); if (ret) - goto out; + return ret; file_ra_state_init(ra, inode->i_mapping); ret = setup_relocation_extent_mapping(rc); if (ret) - goto out; + return ret; while (cur_file_offset < cluster->end - offset) { ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); @@ -2975,8 +2967,6 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) } if (ret == 0) WARN_ON(cluster_nr != cluster->nr); -out: - kfree(ra); return ret; } @@ -3175,8 +3165,8 @@ again: key.offset = blocksize; } - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) return ret; @@ -3368,8 +3358,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) @@ -3882,8 +3872,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, struct inode *inode; struct btrfs_path *path; int ret; - int rw = 0; - int err = 0; + bool bg_is_ro = false; /* * This only gets set if we had a half-deleted snapshot on mount. 
We @@ -3925,24 +3914,20 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, } ret = reloc_chunk_start(fs_info); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_put_bg; - } rc->extent_root = extent_root; rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group, true); - if (ret) { - err = ret; + if (ret) goto out; - } - rw = 1; + bg_is_ro = true; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -3954,14 +3939,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, else ret = PTR_ERR(inode); - if (ret && ret != -ENOENT) { - err = ret; + if (ret && ret != -ENOENT) goto out; - } rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); + ret = PTR_ERR(rc->data_inode); rc->data_inode = NULL; goto out; } @@ -3982,8 +3965,6 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) - err = ret; finishes_stage = rc->stage; /* @@ -3996,16 +3977,18 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, * out of the loop if we hit an error. */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, - (u64)-1); - if (ret) - err = ret; + int wb_ret; + + wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, + (u64)-1); + if (wb_ret && ret == 0) + ret = wb_ret; invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } - if (err < 0) + if (ret < 0) goto out; if (rc->extents_found == 0) @@ -4021,14 +4004,14 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, WARN_ON(rc->block_group->reserved > 0); WARN_ON(rc->block_group->used > 0); out: - if (err && rw) + if (ret && bg_is_ro) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); free_reloc_control(rc); - return err; + return ret; } static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d07eab70f759..6a7e297ab0a7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -147,8 +147,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (unlikely(ret > 0)) { btrfs_crit(fs_info, - "unable to find root key (%llu %u %llu) in tree %llu", - key->objectid, key->type, key->offset, btrfs_root_id(root)); + "unable to find root key " BTRFS_KEY_FMT " in tree %llu", + BTRFS_KEY_FMT_VALUE(key), btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ba20d9286a34..a40ee41f42c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -463,10 +463,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - sctx->extent_path.search_commit_root = 1; - sctx->extent_path.skip_locking = 1; - sctx->csum_path.search_commit_root = 1; - sctx->csum_path.skip_locking = 1; + sctx->extent_path.search_commit_root = true; + sctx->extent_path.skip_locking = true; + sctx->csum_path.search_commit_root = true; + sctx->csum_path.skip_locking = true; for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; @@ -505,7 +505,7 @@ static int 
scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->fs_info; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_root *local_root; struct btrfs_key key; @@ -569,7 +569,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); - free_ipath(ipath); return 0; err: @@ -580,7 +579,6 @@ err: swarn->physical, root, inum, offset, ret); - free_ipath(ipath); return 0; } @@ -777,10 +775,10 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, -"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT, logical, stripe->mirror_num, - CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), - CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); return; } if (stripe->sectors[sector_nr].generation != @@ -929,10 +927,11 @@ static int calc_next_mirror(int mirror, int num_copies) static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, int sector_nr) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); int ret; - ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize, offset_in_page(kaddr)); /* * Caller should ensure the bbio has enough size. @@ -942,7 +941,21 @@ static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *st * to create the minimal amount of bio vectors, for fs block size < page * size cases. */ - ASSERT(ret == bbio->fs_info->sectorsize); + ASSERT(ret == fs_info->sectorsize); +} + +static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info, + unsigned int nr_vecs, blk_opf_t opf, + u64 logical, + btrfs_bio_end_io_t end_io, void *private) +{ + struct btrfs_bio *bbio; + + bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode), + logical, end_io, private); + bbio->is_scrub = true; + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + return bbio; } static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, @@ -953,8 +966,9 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; - ASSERT(stripe->mirror_num >= 1); - ASSERT(atomic_read(&stripe->pending_io) == 0); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); + ASSERT(atomic_read(&stripe->pending_io) == 0, + "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io)); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { /* The current sector cannot be merged, submit the bio. 
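+		 * A sector can only be appended to the current bbio if it
+		 * directly follows the last sector added; any gap in the error
+		 * bitmap forces a submit here and a fresh allocation in the
+		 * !bbio branch below.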
*/ @@ -968,12 +982,10 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio = NULL; } - if (!bbio) { - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_repair_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = (stripe->logical + - (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; - } + if (!bbio) + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, + stripe->logical + (i << fs_info->sectorsize_bits), + scrub_repair_read_endio, stripe); scrub_bio_add_sector(bbio, stripe, i); } @@ -1019,7 +1031,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, int ret; /* For scrub, our mirror_num should always start at 1. */ - ASSERT(stripe->mirror_num >= 1); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, NULL, NULL); @@ -1159,7 +1171,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) int mirror; int i; - ASSERT(stripe->mirror_num > 0); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); @@ -1284,7 +1296,7 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); - for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); } @@ -1352,13 +1364,10 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); bbio = NULL; } - if (!bbio) { - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, - fs_info, scrub_write_endio, stripe); - bbio->bio.bi_iter.bi_sector = (stripe->logical + - (sector_nr << fs_info->sectorsize_bits)) >> - SECTOR_SHIFT; - } + if (!bbio) + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE, + stripe->logical + (sector_nr << fs_info->sectorsize_bits), + scrub_write_endio, stripe); scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) @@ -1478,7 +1487,7 @@ static int compare_extent_item_range(struct btrfs_path *path, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY); + key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type); if (key.type == BTRFS_METADATA_ITEM_KEY) len = fs_info->nodesize; else @@ -1583,7 +1592,7 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || - key.type == BTRFS_EXTENT_ITEM_KEY); + key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type); *extent_start_ret = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) *size_ret = path->nodes[0]->fs_info->nodesize; @@ -1681,7 +1690,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, scrub_stripe_reset_bitmaps(stripe); /* The range must be inside the bg. 
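+	 * I.e. the assertion below checks:
+	 *   bg->start <= logical_start && logical_end <= bg->start + bg->length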
*/ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length, + "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", + bg->start, logical_start, logical_end, bg->start + bg->length); ret = find_first_extent_item(extent_root, extent_path, logical_start, logical_len); @@ -1849,9 +1860,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) continue; } - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, + logical, scrub_read_endio, stripe); } scrub_bio_add_sector(bbio, stripe, i); @@ -1888,10 +1898,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, - scrub_read_endio, stripe); - - bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; + bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, + stripe->logical, scrub_read_endio, stripe); /* Read the whole range inside the chunk boundary. */ for (unsigned int cur = 0; cur < nr_sectors; cur++) scrub_bio_add_sector(bbio, stripe, cur); @@ -2069,37 +2077,135 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * return 0; } +/* + * Return 0 if we should not cancel the scrub. + * Return <0 if we need to cancel the scrub, returned value will + * indicate the reason: + * - -ECANCELED - Being explicitly canceled through ioctl. + * - -EINTR - Being interrupted by signal or fs/process freezing. + */ +static int should_cancel_scrub(const struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) + return -ECANCELED; + + /* + * The user (e.g. fsfreeze command) or power management (PM) + * suspend/hibernate can freeze the fs. And PM suspend/hibernate will + * also freeze all user processes. + * + * A user process can only be frozen when it is in user space, thus we + * have to cancel the run so that the process can return to the user + * space. + * + * Furthermore we have to check both filesystem and process freezing, + * as PM can be configured to freeze the filesystems before processes. + * + * If we only check fs freezing, then suspend without fs freezing + * will timeout, as the process is still in kernel space. + * + * If we only check process freezing, then suspend with fs freezing + * will timeout, as the running scrub will prevent the fs from being frozen. 
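+	 *
+	 * Callers poll this helper once per unit of work and bail out on a
+	 * negative return, as scrub_simple_mirror() does per extent range:
+	 *
+	 *	ret = should_cancel_scrub(sctx);
+	 *	if (ret < 0)
+	 *		return ret;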
+ */ + if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || + freezing(current) || signal_pending(current)) + return -EINTR; + return 0; +} + +static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + struct btrfs_chunk_map *map, + u64 full_stripe_start, + unsigned long *extent_bitmap) +{ + DECLARE_COMPLETION_ONSTACK(io_done); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_io_context *bioc = NULL; + struct btrfs_raid_bio *rbio; + struct bio bio; + const int data_stripes = nr_data_stripes(map); + u64 length = btrfs_stripe_nr_to_offset(data_stripes); + int ret; + + bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); + bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; + bio.bi_private = &io_done; + bio.bi_end_io = raid56_scrub_wait_endio; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL); + if (ret < 0) + goto out; + /* For RAID56 write there must be an @bioc allocated. */ + ASSERT(bioc); + rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + btrfs_put_bioc(bioc); + if (!rbio) { + ret = -ENOMEM; + goto out; + } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_folios(rbio, stripe->folios, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } + raid56_parity_submit_scrub_rbio(rbio); + wait_for_completion_io(&io_done); + ret = blk_status_to_errno(bio.bi_status); +out: + btrfs_bio_counter_dec(fs_info); + bio_uninit(&bio); + return ret; +} + static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, u64 full_stripe_start) { - DECLARE_COMPLETION_ONSTACK(io_done); struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_raid_bio *rbio; - struct btrfs_io_context *bioc = NULL; struct btrfs_path extent_path = { 0 }; struct btrfs_path csum_path = { 0 }; - struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; - u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); + ret = should_cancel_scrub(sctx); + if (ret < 0) + return ret; + + if (atomic_read(&fs_info->scrub_pause_req)) + scrub_blocked_if_needed(fs_info); + + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + return 0; + } + spin_unlock(&bg->lock); + /* * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. */ - extent_path.search_commit_root = 1; - extent_path.skip_locking = 1; - csum_path.search_commit_root = 1; - csum_path.skip_locking = 1; + extent_path.search_commit_root = true; + extent_path.skip_locking = true; + csum_path.search_commit_root = true; + csum_path.skip_locking = true; for (int i = 0; i < data_stripes; i++) { int stripe_index; @@ -2194,45 +2300,11 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, } /* Now we can check and regenerate the P/Q stripe. 
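+	 * That work now lives in scrub_raid56_cached_parity() above: map the
+	 * full stripe for write, build a scrub rbio, seed it with the data
+	 * stripes read above as cache, then submit and wait for completion.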
*/ - bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; - bio->bi_private = &io_done; - bio->bi_end_io = raid56_scrub_wait_endio; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL); - if (ret < 0) { - bio_put(bio); - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - goto out; - } - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, - BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - btrfs_put_bioc(bioc); - if (!rbio) { - ret = -ENOMEM; - bio_put(bio); - btrfs_bio_counter_dec(fs_info); - goto out; - } - /* Use the recovered stripes as cache to avoid read them from disk again. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - raid56_parity_cache_data_folios(rbio, stripe->folios, - full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); - } - raid56_parity_submit_scrub_rbio(rbio); - wait_for_completion_io(&io_done); - ret = blk_status_to_errno(bio->bi_status); - bio_put(bio); - btrfs_bio_counter_dec(fs_info); - + ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, + &extent_bitmap); +out: btrfs_release_path(&extent_path); btrfs_release_path(&csum_path); -out: return ret; } @@ -2263,18 +2335,13 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; - /* Canceled? */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; + ret = should_cancel_scrub(sctx); + if (ret < 0) break; - } - /* Paused? */ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* Push queued extents */ + + if (atomic_read(&fs_info->scrub_pause_req)) scrub_blocked_if_needed(fs_info); - } - /* Block group removed? */ + spin_lock(&bg->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { spin_unlock(&bg->lock); @@ -2529,8 +2596,6 @@ out: } if (sctx->is_dev_replace && ret >= 0) { - int ret2; - ret2 = sync_write_pointer_for_zoned(sctx, chunk_logical + offset, map->stripes[stripe_index].physical, @@ -2623,8 +2688,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, return -ENOMEM; path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = scrub_dev->devid; key.type = BTRFS_DEV_EXTENT_KEY; @@ -3039,6 +3104,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, unsigned int nofs_flag; bool need_commit = false; + /* Set the basic fallback @last_physical before we got a sctx. */ + if (progress) + progress->last_physical = start; + if (btrfs_fs_closing(fs_info)) return -EAGAIN; @@ -3057,6 +3126,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, sctx = scrub_setup_ctx(fs_info, is_dev_replace); if (IS_ERR(sctx)) return PTR_ERR(sctx); + sctx->stat.last_physical = start; ret = scrub_workers_get(fs_info); if (ret) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 96a030d28e09..2522faa97478 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -47,28 +47,30 @@ * It allows fast adding of path elements on the right side (normal path) and * fast adding to the left side (reversed path). A reversed path can also be * unreversed if needed. + * + * The definition of struct fs_path relies on -fms-extensions to allow + * including a tagged struct as an anonymous member. 
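+ *
+ * The members of struct __fs_path thus become direct members of
+ * struct fs_path, so existing users keep accessing p->start, p->buf
+ * and friends without naming the embedded struct.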
*/ +struct __fs_path { + char *start; + char *end; + + char *buf; + unsigned short buf_len:15; + unsigned short reversed:1; +}; +static_assert(sizeof(struct __fs_path) < 256); struct fs_path { - union { - struct { - char *start; - char *end; - - char *buf; - unsigned short buf_len:15; - unsigned short reversed:1; - char inline_buf[]; - }; - /* - * Average path length does not exceed 200 bytes, we'll have - * better packing in the slab and higher chance to satisfy - * an allocation later during send. - */ - char pad[256]; - }; + struct __fs_path; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * an allocation later during send. + */ + char inline_buf[256 - sizeof(struct __fs_path)]; }; #define FS_PATH_INLINE_SIZE \ - (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf)) + sizeof_field(struct fs_path, inline_buf) /* reused for each extent */ @@ -305,7 +307,6 @@ struct send_ctx { struct btrfs_lru_cache dir_created_cache; struct btrfs_lru_cache dir_utimes_cache; - /* Must be last as it ends in a flexible-array member. */ struct fs_path cur_inode_path; }; @@ -633,9 +634,9 @@ static struct btrfs_path *alloc_path_for_send(void) path = btrfs_alloc_path(); if (!path) return NULL; - path->search_commit_root = 1; - path->skip_locking = 1; - path->need_commit_sem = 1; + path->search_commit_root = true; + path->skip_locking = true; + path->need_commit_sem = true; return path; } @@ -1053,10 +1054,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } if (unlikely(start < p->buf)) { btrfs_err(root->fs_info, - "send: path ref buffer underflow for key (%llu %u %llu)", - found_key->objectid, - found_key->type, - found_key->offset); + "send: path ref buffer underflow for key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(found_key)); ret = -EINVAL; goto out; } @@ -1136,12 +1135,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { - if (name_len > XATTR_NAME_MAX) { + if (unlikely(name_len > XATTR_NAME_MAX)) { ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len > - BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + if (unlikely(name_len + data_len > + BTRFS_MAX_XATTR_SIZE(root->fs_info))) { ret = -E2BIG; goto out; } @@ -1149,7 +1148,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len > PATH_MAX) { + if (unlikely(name_len + data_len > PATH_MAX)) { ret = -ENAMETOOLONG; goto out; } @@ -2460,7 +2459,7 @@ static int send_subvol_begin(struct send_ctx *sctx) struct btrfs_key key; struct btrfs_root_ref *ref; struct extent_buffer *leaf; - char *name = NULL; + char AUTO_KFREE(name); int namelen; path = btrfs_alloc_path(); @@ -2478,18 +2477,15 @@ static int send_subvol_begin(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, &key, path, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret) + return -ENOENT; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || key.objectid != btrfs_root_id(send_root)) { - ret = -ENOENT; - goto out; + return -ENOENT; } ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); namelen = btrfs_root_ref_name_len(leaf, ref); @@ -2499,11 +2495,11 @@ static int send_subvol_begin(struct send_ctx *sctx) if (parent_root) { ret = 
begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); if (ret < 0) - goto out; + return ret; } else { ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); if (ret < 0) - goto out; + return ret; } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); @@ -2531,8 +2527,6 @@ static int send_subvol_begin(struct send_ctx *sctx) ret = send_cmd(sctx); tlv_put_failure: -out: - kfree(name); return ret; } @@ -4079,7 +4073,7 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) */ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) { - char *name; + char AUTO_KFREE(name); int ret; name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); @@ -4089,17 +4083,16 @@ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) fs_path_reset(ref->full_path); ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); if (ret < 0) - goto out; + return ret; ret = fs_path_add(ref->full_path, name, ref->name_len); if (ret < 0) - goto out; + return ret; /* Update the reference's base name pointer. */ set_ref_path(ref, ref->full_path); -out: - kfree(name); - return ret; + + return 0; } static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node) @@ -4951,6 +4944,7 @@ struct find_xattr_ctx { int found_idx; char *found_data; int found_data_len; + bool copy_data; }; static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, @@ -4962,9 +4956,11 @@ static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); - if (!ctx->found_data) - return -ENOMEM; + if (ctx->copy_data) { + ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); + if (!ctx->found_data) + return -ENOMEM; + } return 1; } return 0; @@ -4984,6 +4980,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_idx = -1; ctx.found_data = NULL; ctx.found_data_len = 0; + ctx.copy_data = (data != NULL); ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) @@ -4995,7 +4992,7 @@ static int find_xattr(struct btrfs_root *root, *data = ctx.found_data; *data_len = ctx.found_data_len; } else { - kfree(ctx.found_data); + ASSERT(ctx.found_data == NULL); } return ctx.found_idx; } @@ -5008,8 +5005,8 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, { int ret; struct send_ctx *sctx = ctx; - char *found_data = NULL; - int found_data_len = 0; + char AUTO_KFREE(found_data); + int found_data_len = 0; ret = find_xattr(sctx->parent_root, sctx->right_path, sctx->cmp_key, name, name_len, &found_data, @@ -5027,7 +5024,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, } } - kfree(found_data); return ret; } @@ -5138,7 +5134,7 @@ static int process_verity(struct send_ctx *sctx) if (ret < 0) goto iput; - if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) { ret = -EMSGSIZE; goto iput; } @@ -5182,14 +5178,14 @@ static int put_data_header(struct send_ctx *sctx, u32 len) * Since v2, the data attribute header doesn't include a length, * it is implicitly to the end of the command. 
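+	 *
+	 * Illustrative layout of the data attribute:
+	 *
+	 *	v1: [__le16 type][__le16 len][data ...]
+	 *	v2: [__le16 type][data ... up to the end of the command]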
*/ - if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)) return -EOVERFLOW; put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); sctx->send_size += sizeof(__le16); } else { struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); @@ -5589,8 +5585,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * between the beginning of the command and the file data. */ data_offset = PAGE_ALIGN(sctx->send_size); - if (data_offset > sctx->send_max_size || - sctx->send_max_size - data_offset < disk_num_bytes) { + if (unlikely(data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes)) { ret = -EOVERFLOW; goto out; } @@ -5643,14 +5639,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* - * Do not go through encoded read for bs > ps cases. - * - * Encoded send is using vmallocated pages as buffer, which we can - * not ensure every folio is large enough to contain a block. - */ - if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && - (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); @@ -5764,7 +5753,7 @@ static int send_capabilities(struct send_ctx *sctx) struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; - char *buf = NULL; + char AUTO_KFREE(buf); int buf_len; int ret = 0; @@ -5776,28 +5765,23 @@ static int send_capabilities(struct send_ctx *sctx) XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); if (!di) { /* There is no xattr for this inode */ - goto out; + return 0; } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; + return PTR_ERR(di); } leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) { - ret = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); -out: - kfree(buf); return ret; } @@ -7274,8 +7258,8 @@ static int search_key_again(const struct send_ctx *sctx, if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, -"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", - key->objectid, key->type, key->offset, +"send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d", + BTRFS_KEY_FMT_VALUE(key), (root == sctx->parent_root ? 
"parent" : "send"), btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); @@ -7643,10 +7627,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; + left_path->search_commit_root = true; + left_path->skip_locking = true; + right_path->search_commit_root = true; + right_path->skip_locking = true; /* * Strategy: Go to the first items of both trees. Then do diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 97452fb5d29b..6babbe333741 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -15,6 +15,7 @@ #include "accessors.h" #include "extent-tree.h" #include "zoned.h" +#include "delayed-inode.h" /* * HOW DOES SPACE RESERVATION WORK @@ -67,7 +68,7 @@ * Assume we are unable to simply make the reservation because we do not have * enough space * - * -> __reserve_bytes + * -> reserve_bytes * create a reserve_ticket with ->bytes set to our reservation, add it to * the tail of space_info->tickets, kick async flush thread * @@ -172,15 +173,14 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, - bool may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - s_info->bytes_zone_unusable + - (may_use_included ? s_info->bytes_may_use : 0); -} +struct reserve_ticket { + u64 bytes; + int error; + bool steal; + struct list_head list; + wait_queue_head_t wait; + spinlock_t lock; +}; /* * after adding space to the filesystem, we need to clear the full flags @@ -192,7 +192,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct btrfs_space_info *found; list_for_each_entry(found, head, list) - found->full = 0; + found->full = false; } /* @@ -211,7 +211,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (btrfs_is_zoned(fs_info)) return fs_info->zone_size; - ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK, "flags=%llu", flags); if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; @@ -262,8 +262,9 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag struct btrfs_space_info *sub_group; int ret; - ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); - ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY, + "parent->subgroup_id=%d", parent->subgroup_id); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY, "id=%d", id); sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); if (!sub_group) @@ -274,7 +275,7 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag sub_group->parent = parent; sub_group->subgroup_id = id; - ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + ret = btrfs_sysfs_add_space_info_type(sub_group); if (ret) { kfree(sub_group); parent->sub_group[index] = NULL; @@ -308,7 +309,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) return ret; } - ret = btrfs_sysfs_add_space_info_type(info, space_info); + ret = btrfs_sysfs_add_space_info_type(space_info); if (ret) return ret; @@ -372,8 +373,8 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, space_info->bytes_readonly += block_group->bytes_super; btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 
0) - space_info->full = 0; - btrfs_try_granting_tickets(info, space_info); + space_info->full = false; + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); block_group->space_info = space_info; @@ -421,10 +422,10 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) return min_t(u64, data_chunk_size, SZ_1G); } -static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, - enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(const struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 profile; u64 avail; u64 data_chunk_size; @@ -490,44 +491,77 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +static inline bool check_can_overcommit(const struct btrfs_space_info *space_info, + u64 space_info_used_bytes, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + const u64 avail = calc_available_free_space(space_info, flush); + + return (space_info_used_bytes + bytes < space_info->total_bytes + avail); +} + +static inline bool can_overcommit(const struct btrfs_space_info *space_info, + u64 space_info_used_bytes, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return check_can_overcommit(space_info, space_info_used_bytes, bytes, flush); +} + +bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { - u64 avail; u64 used; /* Don't overcommit when in mixed mode */ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; + return false; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; + return check_can_overcommit(space_info, used, bytes, flush); } static void remove_ticket(struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) + struct reserve_ticket *ticket, int error) { + lockdep_assert_held(&space_info->lock); + if (!list_empty(&ticket->list)) { list_del_init(&ticket->list); - ASSERT(space_info->reclaim_size >= ticket->bytes); + ASSERT(space_info->reclaim_size >= ticket->bytes, + "space_info->reclaim_size=%llu ticket->bytes=%llu", + space_info->reclaim_size, ticket->bytes); space_info->reclaim_size -= ticket->bytes; } + + spin_lock(&ticket->lock); + /* + * If we are called from a task waiting on the ticket, it may happen + * that before it sets an error on the ticket, a reclaim task was able + * to satisfy the ticket. In that case ignore the error. + */ + if (error && ticket->bytes > 0) + ticket->error = error; + else + ticket->bytes = 0; + + wake_up(&ticket->wait); + spin_unlock(&ticket->lock); } /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. 
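+ *
+ * Tickets are granted strictly in queue order; the loop stops at the
+ * first ticket that still cannot be satisfied, so a large ticket at
+ * the head blocks smaller ones queued behind it.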
*/ -void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +void btrfs_try_granting_tickets(struct btrfs_space_info *space_info) { struct list_head *head; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + u64 used = btrfs_space_info_used(space_info, true); lockdep_assert_held(&space_info->lock); @@ -535,19 +569,18 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, again: while (!list_empty(head)) { struct reserve_ticket *ticket; - u64 used = btrfs_space_info_used(space_info, true); + u64 used_after; ticket = list_first_entry(head, struct reserve_ticket, list); + used_after = used + ticket->bytes; /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || - btrfs_can_overcommit(fs_info, space_info, ticket->bytes, - flush)) { + if (used_after <= space_info->total_bytes || + can_overcommit(space_info, used, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); - remove_ticket(space_info, ticket); - ticket->bytes = 0; + remove_ticket(space_info, ticket, 0); space_info->tickets_id++; - wake_up(&ticket->wait); + used = used_after; } else { break; } @@ -594,9 +627,9 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_space_info *info) { + const struct btrfs_fs_info *fs_info = info->fs_info; const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -613,16 +646,16 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, info->bytes_readonly, info->bytes_zone_unusable); } -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, +void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups) { + struct btrfs_fs_info *fs_info = info->fs_info; struct btrfs_block_group *cache; u64 total_avail = 0; int index = 0; spin_lock(&info->lock); - __btrfs_dump_space_info(fs_info, info); + __btrfs_dump_space_info(info); dump_global_block_rsv(fs_info); spin_unlock(&info->lock); @@ -670,11 +703,11 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void shrink_delalloc(struct btrfs_space_info *space_info, u64 to_reclaim, bool wait_ordered, bool for_preempt) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 ordered_bytes; @@ -801,10 +834,10 @@ skip_async: * and may fail for various reasons. The caller is supposed to examine the * state of @space_info to detect the outcome. 
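+ * E.g. after flush_space(space_info, num_bytes, state, false) returns,
+ * a flusher re-takes space_info->lock and checks whether its ticket
+ * list shrank or space_info->tickets_id advanced.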
*/ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - enum btrfs_flush_state state, bool for_preempt) +static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes, + enum btrfs_flush_state state, bool for_preempt) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; @@ -833,7 +866,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, case FLUSH_DELALLOC_FULL: if (state == FLUSH_DELALLOC_FULL) num_bytes = U64_MAX; - shrink_delalloc(fs_info, space_info, num_bytes, + shrink_delalloc(space_info, num_bytes, state != FLUSH_DELALLOC, for_preempt); break; case FLUSH_DELAYED_REFS_NR: @@ -900,8 +933,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -909,8 +941,7 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, lockdep_assert_held(&space_info->lock); - avail = calc_available_free_space(fs_info, space_info, - BTRFS_RESERVE_FLUSH_ALL); + avail = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL); used = btrfs_space_info_used(space_info, true); /* @@ -925,18 +956,25 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; } -static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info) +static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; u64 thresh; u64 used; - thresh = mult_perc(space_info->total_bytes, 90); - lockdep_assert_held(&space_info->lock); + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + + thresh = mult_perc(space_info->total_bytes, 90); + /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -957,13 +995,6 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, return false; /* - * We have tickets queued, bail so we don't compete with the async - * flushers. - */ - if (space_info->reclaim_size) - return false; - - /* * If we have over half of the free space occupied by reservations or * pinned then we want to start flushing. * @@ -992,8 +1023,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * much delalloc we need for the background flusher to kick in. 
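+	 *
+	 * Illustratively: with reservations already past ~90% of the total
+	 * bytes the async reclaimer would only slow us down, while a large
+	 * delalloc backlog relative to the block reserves is the signal to
+	 * start flushing early, before real tickets queue up.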
*/ - thresh = calc_available_free_space(fs_info, space_info, - BTRFS_RESERVE_FLUSH_ALL); + thresh = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; if (used < space_info->total_bytes) @@ -1037,13 +1067,15 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static bool steal_from_global_rsv(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + lockdep_assert_held(&space_info->lock); + if (!ticket->steal) return false; @@ -1057,21 +1089,19 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; } global_rsv->reserved -= ticket->bytes; - remove_ticket(space_info, ticket); - ticket->bytes = 0; - wake_up(&ticket->wait); - space_info->tickets_id++; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); + remove_ticket(space_info, ticket, 0); + space_info->tickets_id++; + return true; } /* * We've exhausted our flushing, start failing tickets. * - * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * * We call this when we've exhausted our flushing ability and haven't made @@ -1084,47 +1114,44 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, * other tickets, or if it stumbles across a ticket that was smaller than the * first ticket. */ -static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; - const bool aborted = BTRFS_FS_ERROR(fs_info); + const int abort_error = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); - __btrfs_dump_space_info(fs_info, space_info); + __btrfs_dump_space_info(space_info); } while (!list_empty(&space_info->tickets) && tickets_id == space_info->tickets_id) { ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); + if (unlikely(abort_error)) { + remove_ticket(space_info, ticket, abort_error); + } else { + if (steal_from_global_rsv(space_info, ticket)) + return true; - if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) - return true; - - if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_info(fs_info, "failing ticket with %llu bytes", - ticket->bytes); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_info(fs_info, "failing ticket with %llu bytes", + ticket->bytes); - remove_ticket(space_info, ticket); - if (aborted) - ticket->error = -EIO; - else - ticket->error = -ENOSPC; - wake_up(&ticket->wait); + remove_ticket(space_info, ticket, -ENOSPC); - /* - * We're just throwing tickets away, so more flushing may not - * trip over btrfs_try_granting_tickets, so we need to call it - * here to see if we can make progress with the next ticket in - * the list. 
- */ - if (!aborted) - btrfs_try_granting_tickets(fs_info, space_info); + /* + * We're just throwing tickets away, so more flushing may + * not trip over btrfs_try_granting_tickets, so we need + * to call it here to see if we can make progress with + * the next ticket in the list. + */ + btrfs_try_granting_tickets(space_info); + } } return (tickets_id != space_info->tickets_id); } @@ -1144,9 +1171,9 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) final_state = COMMIT_TRANS; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); if (!to_reclaim) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1155,15 +1182,14 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state, false); + flush_space(space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -1197,11 +1223,11 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { - if (maybe_fail_all_tickets(fs_info, space_info)) { + if (maybe_fail_all_tickets(space_info)) { flush_state = FLUSH_DELAYED_ITEMS_NR; commit_cycles--; } else { - space_info->flush = 0; + space_info->flush = false; } } else { flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -1257,14 +1283,15 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv = &fs_info->trans_block_rsv; spin_lock(&space_info->lock); - while (need_preemptive_reclaim(fs_info, space_info)) { + while (need_preemptive_reclaim(space_info)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); + const u64 bytes_may_use = space_info->bytes_may_use; + const u64 bytes_pinned = space_info->bytes_pinned; - loops++; - + spin_unlock(&space_info->lock); /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting @@ -1276,8 +1303,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) btrfs_block_rsv_reserved(delayed_block_rsv) + btrfs_block_rsv_reserved(delayed_refs_rsv) + btrfs_block_rsv_reserved(trans_rsv); - if (block_rsv_size < space_info->bytes_may_use) - delalloc_size = space_info->bytes_may_use - block_rsv_size; + if (block_rsv_size < bytes_may_use) + delalloc_size = bytes_may_use - block_rsv_size; /* * We don't want to include the global_rsv in our calculation, @@ -1294,10 +1321,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) if (delalloc_size > block_rsv_size) { to_reclaim = delalloc_size; flush = FLUSH_DELALLOC; - } else if (space_info->bytes_pinned > + } else if (bytes_pinned > (btrfs_block_rsv_reserved(delayed_block_rsv) + btrfs_block_rsv_reserved(delayed_refs_rsv))) { - to_reclaim = space_info->bytes_pinned; + to_reclaim = bytes_pinned; flush = COMMIT_TRANS; } else if 
(btrfs_block_rsv_reserved(delayed_block_rsv) > btrfs_block_rsv_reserved(delayed_refs_rsv)) { @@ -1308,7 +1335,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = FLUSH_DELAYED_REFS_NR; } - spin_unlock(&space_info->lock); + loops++; /* * We don't want to reclaim everything, just a portion, so scale @@ -1318,7 +1345,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim >>= 2; if (!to_reclaim) to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); - flush_space(fs_info, space_info, to_reclaim, flush, true); + flush_space(space_info, to_reclaim, flush, true); cond_resched(); spin_lock(&space_info->lock); } @@ -1383,7 +1410,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1391,27 +1418,27 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } /* Something happened, fail everything and bail. */ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); } while (flush_state < ARRAY_SIZE(data_flush_states)) { - flush_space(fs_info, space_info, U64_MAX, + flush_space(space_info, U64_MAX, data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1425,16 +1452,16 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) if (flush_state >= ARRAY_SIZE(data_flush_states)) { if (space_info->full) { - if (maybe_fail_all_tickets(fs_info, space_info)) + if (maybe_fail_all_tickets(space_info)) flush_state = 0; else - space_info->flush = 0; + space_info->flush = false; } else { flush_state = 0; } /* Something happened, fail everything and bail. 
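+		 * An fs error at this point means a transaction aborted, so no
+		 * flush state can make further progress; aborted_fs fails every
+		 * queued ticket and clears space_info->flush.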
*/ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) goto aborted_fs; } @@ -1443,8 +1470,8 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) return; aborted_fs: - maybe_fail_all_tickets(fs_info, space_info); - space_info->flush = 0; + maybe_fail_all_tickets(space_info); + space_info->flush = false; spin_unlock(&space_info->lock); } @@ -1489,40 +1516,47 @@ static const enum btrfs_flush_state evict_flush_states[] = { RESET_ZONES, }; -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, - const enum btrfs_flush_state *states, - int states_nr) +static bool is_ticket_served(struct reserve_ticket *ticket) { + bool ret; + + spin_lock(&ticket->lock); + ret = (ticket->bytes == 0); + spin_unlock(&ticket->lock); + + return ret; +} + +static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; int flush_state = 0; - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); + spin_unlock(&space_info->lock); while (flush_state < states_nr) { - spin_unlock(&space_info->lock); - flush_space(fs_info, space_info, to_reclaim, states[flush_state], - false); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + flush_space(space_info, to_reclaim, states[flush_state], false); + if (is_ticket_served(ticket)) return; - } + flush_state++; } + spin_lock(&space_info->lock); /* * Attempt to steal from the global rsv if we can, except if the fs was * turned into error mode due to a transaction abort when flushing space @@ -1531,48 +1565,38 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ - if (BTRFS_FS_ERROR(fs_info)) { - ticket->error = BTRFS_FS_ERROR(fs_info); - remove_ticket(space_info, ticket); - } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { - ticket->error = -ENOSPC; - remove_ticket(space_info, ticket); - } + if (unlikely(BTRFS_FS_ERROR(fs_info))) + remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info)); + else if (!steal_from_global_rsv(space_info, ticket)) + remove_ticket(space_info, ticket, -ENOSPC); /* * We must run try_granting_tickets here because we could be a large * ticket in front of a smaller ticket that can now be satisfied with * the available space. 
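+	 * E.g. a 4MiB ticket failing at the head may leave room for a 64KiB
+	 * ticket queued right behind it.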
*/ - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } -static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void priority_reclaim_data_space(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { - spin_lock(&space_info->lock); - /* We could have been granted before we got here. */ - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); while (!space_info->full) { spin_unlock(&space_info->lock); - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); } - ticket->error = -ENOSPC; - remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); + remove_ticket(space_info, ticket, -ENOSPC); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -1581,11 +1605,13 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, { DEFINE_WAIT(wait); - int ret = 0; - spin_lock(&space_info->lock); + spin_lock(&ticket->lock); while (ticket->bytes > 0 && ticket->error == 0) { + int ret; + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + spin_unlock(&ticket->lock); if (ret) { /* * Delete us from the list. After we unlock the space @@ -1595,24 +1621,23 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * despite getting an error, resulting in a space leak * (bytes_may_use counter of our space_info). */ - remove_ticket(space_info, ticket); - ticket->error = -EINTR; - break; + spin_lock(&space_info->lock); + remove_ticket(space_info, ticket, -EINTR); + spin_unlock(&space_info->lock); + return; } - spin_unlock(&space_info->lock); schedule(); finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); + spin_lock(&ticket->lock); } - spin_unlock(&space_info->lock); + spin_unlock(&ticket->lock); } /* * Do the appropriate flushing and waiting for a ticket. * - * @fs_info: the filesystem * @space_info: space info for the reservation * @ticket: ticket for the reservation * @start_ns: timestamp when the reservation started @@ -1622,8 +1647,7 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. 
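 *
 * (Editor's note: wait_reserve_ticket() above now sleeps under the
 * per-ticket lock instead of space_info->lock, so flushers no longer
 * contend with sleeping waiters.  The shape is the standard killable
 * wait loop, taken from the hunk above:
 *
 *	spin_lock(&ticket->lock);
 *	while (ticket->bytes > 0 && ticket->error == 0) {
 *		int ret = prepare_to_wait_event(&ticket->wait, &wait,
 *						TASK_KILLABLE);
 *		spin_unlock(&ticket->lock);
 *		if (ret) {
 *			/* Fatal signal: unlist the ticket with -EINTR. */
 *			return;
 *		}
 *		schedule();
 *		finish_wait(&ticket->wait, &wait);
 *		spin_lock(&ticket->lock);
 *	}
 *	spin_unlock(&ticket->lock);
 *
 * with removal from the list still done under space_info->lock.)
 *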
*/ -static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static int handle_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket, u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) @@ -1637,20 +1661,20 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: - priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_reclaim_metadata_space(space_info, ticket, priority_flush_states, ARRAY_SIZE(priority_flush_states)); break; case BTRFS_RESERVE_FLUSH_EVICT: - priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_reclaim_metadata_space(space_info, ticket, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: - priority_reclaim_data_space(fs_info, space_info, ticket); + priority_reclaim_data_space(space_info, ticket); break; default: - ASSERT(0); + ASSERT(0, "flush=%d", flush); break; } @@ -1662,9 +1686,10 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * releasing reserved space (if an error happens the expectation is that * space wasn't reserved at all). */ - ASSERT(!(ticket->bytes == 0 && ticket->error)); - trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, - start_ns, flush, ticket->error); + ASSERT(!(ticket->bytes == 0 && ticket->error), + "ticket->bytes=%llu ticket->error=%d", ticket->bytes, ticket->error); + trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags, + orig_bytes, start_ns, flush, ticket->error); return ret; } @@ -1678,9 +1703,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } -static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static inline void maybe_clamp_preempt(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); @@ -1715,7 +1740,6 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) /* * Try to reserve bytes from the block_rsv's space. * - * @fs_info: the filesystem * @space_info: space info we want to allocate from * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1727,10 +1751,10 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. 
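 *
 * (Editor's note: ASSERT() calls throughout these hunks gain a
 * printf-style message.  The macro definition is not part of this diff;
 * as an assumption it is variadic along these lines, with
 * btrfs_assert_fail() standing in for whatever messages.h really calls:
 *
 *	#define ASSERT(cond, fmt...)					\
 *	do {								\
 *		if (unlikely(!(cond)))					\
 *			btrfs_assert_fail(#cond, __FILE__, __LINE__,	\
 *					  ##fmt);			\
 *	} while (0)
 *
 * so a failing assertion also logs the offending values.)
 *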
*/ -static int __reserve_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct work_struct *async_work; struct reserve_ticket ticket; u64 start_ns = 0; @@ -1738,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, int ret = -ENOSPC; bool pending_tickets; - ASSERT(orig_bytes); + ASSERT(orig_bytes, "orig_bytes=%llu", orig_bytes); /* * If have a transaction handle (current->journal_info != NULL), then * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor @@ -1747,9 +1771,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (current->journal_info) { /* One assert per line for easier debugging. */ - ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); - ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); - ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL, "flush=%d", flush); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL, "flush=%d", flush); + ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT, "flush=%d", flush); } if (flush == BTRFS_RESERVE_FLUSH_DATA) @@ -1777,7 +1801,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { + can_overcommit(space_info, used, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1788,7 +1812,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * left to allocate for the block. */ if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { - used = btrfs_space_info_used(space_info, false); + used -= space_info->bytes_may_use; if (used + orig_bytes <= space_info->total_bytes) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; @@ -1807,6 +1831,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); + spin_lock_init(&ticket.lock); ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); @@ -1823,9 +1848,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * preemptive flushing in order to keep up with * the workload. */ - maybe_clamp_preempt(fs_info, space_info); + maybe_clamp_preempt(space_info); - space_info->flush = 1; + space_info->flush = true; trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, @@ -1844,7 +1869,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && !work_busy(&fs_info->preempt_reclaim_work) && - need_preemptive_reclaim(fs_info, space_info)) { + need_preemptive_reclaim(space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_dfl_wq, @@ -1855,14 +1880,12 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!ret || !can_ticket(flush)) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, - orig_bytes, flush); + return handle_reserve_ticket(space_info, &ticket, start_ns, orig_bytes, flush); } /* * Try to reserve metadata bytes from the block_rsv's space. 
* - * @fs_info: the filesystem * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1874,20 +1897,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * regain reservations will be made and this will fail if there is not enough * space already. */ -int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); + ret = reserve_bytes(space_info, orig_bytes, flush); if (ret == -ENOSPC) { + struct btrfs_fs_info *fs_info = space_info->fs_info; + trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, orig_bytes, false); + btrfs_dump_space_info(space_info, orig_bytes, false); } return ret; } @@ -1895,7 +1919,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, /* * Try to reserve data bytes for an allocation. * - * @fs_info: the filesystem + * @space_info: the space_info we're allocating for * @bytes: number of bytes we need * @flush: how we are allowed to flush * @@ -1910,15 +1934,17 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || - flush == BTRFS_RESERVE_NO_FLUSH); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA, + "current->journal_info=0x%lx flush=%d", + (unsigned long)current->journal_info, flush); - ret = __reserve_bytes(fs_info, space_info, bytes, flush); + ret = reserve_bytes(space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, bytes, false); + btrfs_dump_space_info(space_info, bytes, false); } return ret; } @@ -1931,7 +1957,7 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) btrfs_info(fs_info, "dumping space info:"); list_for_each_entry(space_info, &fs_info->space_info, list) { spin_lock(&space_info->lock); - __btrfs_dump_space_info(fs_info, space_info); + __btrfs_dump_space_info(space_info); spin_unlock(&space_info->lock); } dump_global_block_rsv(fs_info); @@ -1948,7 +1974,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) int factor; /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) + if (data_race(list_empty(&sinfo->ro_bgs))) return 0; spin_lock(&sinfo->lock); @@ -2187,7 +2213,7 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; + global_rsv->full = true; len -= to_add; } spin_unlock(&global_rsv->lock); @@ -2195,5 +2221,5 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) grant: /* Add to any tickets we may have. 
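 *
 * (Editor's note: with fs_info gone from the reservation prototypes, a
 * metadata reservation now reads, per the header changes below:
 *
 *	ret = btrfs_reserve_metadata_bytes(space_info, num_bytes,
 *					   BTRFS_RESERVE_FLUSH_ALL);
 *	if (ret == -ENOSPC)
 *		return ret;
 *
 * with the fs_info recovered internally via space_info->fs_info.)
 *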
*/ if (len) - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 679f22efb407..446c0614ad4a 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -142,11 +142,11 @@ struct btrfs_space_info { flushing. The value is >> clamp, so turns out to be a 2^clamp divisor. */ - unsigned int full:1; /* indicates that we cannot allocate any more + bool full; /* indicates that we cannot allocate any more chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + bool chunk_alloc; /* set if we are allocating a chunk */ - unsigned int flush:1; /* set if we are trying to make space */ + bool flush; /* set if we are trying to make space */ unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -224,14 +224,6 @@ struct btrfs_space_info { s64 reclaimable_bytes; }; -struct reserve_ticket { - u64 bytes; - int error; - bool steal; - struct list_head list; - wait_queue_head_t wait; -}; - static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && @@ -266,6 +258,17 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); +static inline u64 btrfs_space_info_used(const struct btrfs_space_info *s_info, + bool may_use_included) +{ + lockdep_assert_held(&s_info->lock); + + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + + (may_use_included ? s_info->bytes_may_use : 0); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group); @@ -273,21 +276,15 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, - bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, +void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups); -int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); -void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info); -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush); +void btrfs_try_granting_tickets(struct btrfs_space_info *space_info); +bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_space_info *space_info, @@ -295,7 +292,7 @@ static inline void btrfs_space_info_free_bytes_may_use( { spin_lock(&space_info->lock); btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); - btrfs_try_granting_tickets(space_info->fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } int 
btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 5ca8d4db6722..f82e71f5d88b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -180,13 +180,14 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); + IS_ALIGNED(len, fs_info->sectorsize), "start=%llu len=%u", start, len); /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. */ if (folio->mapping) - ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio), + ASSERT(folio_pos(folio) <= start && + start + len <= folio_next_pos(folio), "start=%llu len=%u folio_pos=%llu folio_size=%zu", start, len, folio_pos(folio), folio_size(folio)); } @@ -194,12 +195,11 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int __start_bit; \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ - __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ + __start_bit += __bpf * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -217,7 +217,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start; + *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, @@ -250,7 +250,9 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, clear_bit(bit, bfs->bitmaps); cleared++; } - ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared, + "atomic_read(&bfs->nr_locked)=%d cleared=%d", + atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); return last; @@ -329,7 +331,9 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } - ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared, + "atomic_read(&bfs->nr_locked)=%d cleared=%d", + atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); if (last) @@ -338,24 +342,20 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, #define subpage_test_bitmap_all_set(fs_info, folio, name) \ ({ \ - struct btrfs_folio_state *bfs = folio_get_private(folio); \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + struct btrfs_folio_state *__bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_set(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + bitmap_test_range_all_set(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) #define subpage_test_bitmap_all_zero(fs_info, folio, name) 
\ ({ \ - struct btrfs_folio_state *bfs = folio_get_private(folio); \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + struct btrfs_folio_state *__bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_zero(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + bitmap_test_range_all_zero(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, @@ -445,6 +445,7 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; + bool keep_write; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); @@ -455,18 +456,9 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, * assume writeback is complete, and exit too early — violating sync * ordering guarantees. */ + keep_write = folio_test_dirty(folio); if (!folio_test_writeback(folio)) - __folio_start_writeback(folio, true); - if (!folio_test_dirty(folio)) { - struct address_space *mapping = folio_mapping(folio); - XA_STATE(xas, &mapping->i_pages, folio->index); - unsigned long flags; - - xas_lock_irqsave(&xas, flags); - xas_load(&xas); - xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); - xas_unlock_irqrestore(&xas, flags); - } + __folio_start_writeback(folio, keep_write); spin_unlock_irqrestore(&bfs->lock, flags); } @@ -672,27 +664,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, #define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ - const struct btrfs_folio_state *bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_folio_state *__bfs = folio_get_private(folio); \ \ - ASSERT(blocks_per_folio <= BITS_PER_LONG); \ - *dst = bitmap_read(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + ASSERT(__bpf <= BITS_PER_LONG); \ + *dst = bitmap_read(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ unsigned long bitmap; \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ - start, len, folio_pos(folio), \ - blocks_per_folio, &bitmap); \ + start, len, folio_pos(folio), __bpf, &bitmap); \ } /* diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ad0552db7c7d..d81a0ade559f 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -7,7 +7,6 @@ #include <linux/atomic.h> #include <linux/sizes.h> #include "btrfs_inode.h" -#include "fs.h" struct address_space; struct folio; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 430e7419349c..1999533b52be 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -807,17 +807,15 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); char *name = NULL, *ptr; u64 dirid; int 
len; int ret; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto err; - } + if (!path) + return ERR_PTR(-ENOMEM); name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { @@ -905,7 +903,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, fs_root = NULL; } - btrfs_free_path(path); if (ptr == name + PATH_MAX - 1) { name[0] = '/'; name[1] = '\0'; @@ -916,7 +913,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, err: btrfs_put_root(fs_root); - btrfs_free_path(path); kfree(name); return ERR_PTR(ret); } @@ -1614,7 +1610,7 @@ static inline void btrfs_descending_sort_devices( static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_device_info *devices_info; + struct btrfs_device_info AUTO_KFREE(devices_info); struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 type; @@ -1712,7 +1708,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, nr_devices--; } - kfree(devices_info); *free_bytes = avail_space; return 0; } @@ -2430,6 +2425,66 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont return 0; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev }; + bool can_rw; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + device = btrfs_find_device(fs_info->fs_devices, &lookup_args); + if (!device) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* Device not found, should not affect the running fs, just give a warning. */ + btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev); + return 0; + } + /* + * The to-be-removed device is already missing? + * + * That's weird but no special handling needed and can exit right now. + */ + if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid); + return 0; + } + + device->fs_devices->missing_devices++; + if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + list_del_init(&device->dev_alloc_list); + WARN_ON(device->fs_devices->rw_devices < 1); + device->fs_devices->rw_devices--; + } + can_rw = btrfs_check_rw_degradable(fs_info, device); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * Now device is considered missing, btrfs_device_name() won't give a + * meaningful result anymore, so only output the devid. 
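+ *
+ * (Editor's note: ->remove_bdev is the experimental super operation
+ * wired into btrfs_super_ops further below.  As an assumption, the
+ * block layer reaches it on surprise removal roughly as:
+ *
+ *	if (sb->s_op->remove_bdev)
+ *		err = sb->s_op->remove_bdev(sb, bdev);
+ *
+ * i.e. a yanked device, as opposed to a clean "btrfs device remove".)
+ *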
+ */ + if (unlikely(!can_rw)) { + btrfs_crit(fs_info, + "btrfs device id %llu has gone missing, can not maintain read-write", + device->devid); + return -EIO; + } + btrfs_warn(fs_info, + "btrfs device id %llu has gone missing, continue as degraded", + device->devid); + btrfs_set_opt(fs_info->mount_opt, DEGRADED); + return 0; +} + +static void btrfs_shutdown(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_force_shutdown(fs_info); +} +#endif + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2445,6 +2500,10 @@ static const struct super_operations btrfs_super_ops = { .unfreeze_fs = btrfs_unfreeze, .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + .remove_bdev = btrfs_remove_bdev, + .shutdown = btrfs_shutdown, +#endif }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81f52c1f55ce..1f64c132b387 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,6 +10,7 @@ #include <linux/completion.h> #include <linux/bug.h> #include <linux/list.h> +#include <linux/string_choices.h> #include <crypto/hash.h> #include "messages.h" #include "ctree.h" @@ -25,6 +26,7 @@ #include "misc.h" #include "fs.h" #include "accessors.h" +#include "zoned.h" /* * Structure name Path @@ -1187,6 +1189,56 @@ static ssize_t btrfs_commit_stats_store(struct kobject *kobj, } BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); +static ssize_t btrfs_zoned_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_block_group *bg; + size_t ret = 0; + + + if (!btrfs_is_zoned(fs_info)) + return ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + ret += sysfs_emit_at(buf, ret, "active block-groups: %zu\n", + list_count_nodes(&fs_info->zone_active_bgs)); + spin_unlock(&fs_info->zone_active_bgs_lock); + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + ret += sysfs_emit_at(buf, ret, "\treclaimable: %zu\n", + list_count_nodes(&fs_info->reclaim_bgs)); + ret += sysfs_emit_at(buf, ret, "\tunused: %zu\n", + list_count_nodes(&fs_info->unused_bgs)); + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + + ret += sysfs_emit_at(buf, ret, "\tneed reclaim: %s\n", + str_true_false(btrfs_zoned_should_reclaim(fs_info))); + + if (fs_info->data_reloc_bg) + ret += sysfs_emit_at(buf, ret, + "data relocation block-group: %llu\n", + fs_info->data_reloc_bg); + if (fs_info->treelog_bg) + ret += sysfs_emit_at(buf, ret, + "tree-log block-group: %llu\n", + fs_info->treelog_bg); + + spin_lock(&fs_info->zone_active_bgs_lock); + ret += sysfs_emit_at(buf, ret, "active zones:\n"); + list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) { + ret += sysfs_emit_at(buf, ret, + "\tstart: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n", + bg->start, bg->alloc_offset, bg->used, + bg->reserved, bg->zone_unusable); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + return ret; +} +BTRFS_ATTR(, zoned_stats, btrfs_zoned_stats_show); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1599,6 +1651,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), + 
BTRFS_ATTR_PTR(, zoned_stats), #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif @@ -1981,13 +2034,12 @@ static const char *alloc_name(struct btrfs_space_info *space_info) * Create a sysfs entry for a space info type at path * /sys/fs/btrfs/UUID/allocation/TYPE */ -int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info) { int ret; ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - fs_info->space_info_kobj, "%s", + space_info->fs_info->space_info_kobj, "%s", alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 0f94ae923210..05498e5346c3 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -37,8 +37,7 @@ void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); -int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info); +int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info); void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); void btrfs_sysfs_update_devid(struct btrfs_device *device); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index b19328d077d3..a0187d6163df 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -505,7 +505,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - unsigned long *bitmap = NULL; + unsigned long AUTO_KFREE(bitmap); struct extent_buffer *eb = NULL; int ret; @@ -551,7 +551,6 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); - kfree(bitmap); btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 42af6c737c6e..0b9f25dd1a68 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1013,7 +1013,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { struct btrfs_chunk_map *map; - u64 *logical = NULL; + u64 AUTO_KFREE(logical); int i, out_ndaddrs, out_stripe_len; int ret; @@ -1046,7 +1046,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, if (ret) { test_err("error adding chunk map to mapping tree"); btrfs_free_chunk_map(map); - goto out_free; + return ret; } ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1), @@ -1079,8 +1079,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: btrfs_remove_chunk_map(fs_info, map); -out_free: - kfree(logical); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 3fc8dc3fd980..05cfda8af422 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -20,7 +20,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, struct btrfs_extent_item *item; struct btrfs_extent_inline_ref *iref; struct btrfs_tree_block_info *block_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); @@ -41,7 +41,6 
@@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { test_err("couldn't insert ref %d", ret); - btrfs_free_path(path); return ret; } @@ -61,7 +60,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_free_path(path); return 0; } @@ -70,7 +68,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, { struct btrfs_trans_handle trans; struct btrfs_extent_item *item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 refs; int ret; @@ -90,7 +88,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); - btrfs_free_path(path); return ret; } @@ -112,7 +109,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); if (ret) test_err("failed to insert backref"); - btrfs_free_path(path); return ret; } @@ -121,7 +117,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, { struct btrfs_trans_handle trans; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; btrfs_init_dummy_trans(&trans, NULL); @@ -139,11 +135,9 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { test_err("didn't find our key %d", ret); - btrfs_free_path(path); return ret; } btrfs_del_item(&trans, root, path); - btrfs_free_path(path); return 0; } @@ -152,7 +146,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, { struct btrfs_trans_handle trans; struct btrfs_extent_item *item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 refs; int ret; @@ -172,7 +166,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); - btrfs_free_path(path); return ret; } @@ -198,7 +191,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } btrfs_del_item(&trans, root, path); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 89ae0c7a610a..05ee4391c83a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,6 +32,8 @@ #include "ioctl.h" #include "relocation.h" #include "scrub.h" +#include "ordered-data.h" +#include "delayed-inode.h" static struct kmem_cache *btrfs_trans_handle_cachep; @@ -138,7 +140,6 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); @@ -185,7 +186,8 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. 
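 *
 * (Editor's note: the WARN_ON(refcount_read(...) == 0) dropped from
 * btrfs_put_transaction() above is redundant -- refcount_t saturates
 * and warns on underflow by itself -- so the put path reduces to:
 *
 *	if (refcount_dec_and_test(&transaction->use_count)) {
 *		BUG_ON(!list_empty(&transaction->list));
 *		...
 *	}
 *
 * as in the hunk above.)
 *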
*/ - ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING, + "cur_trans->state=%d", cur_trans->state); down_write(&fs_info->commit_root_sem); @@ -575,7 +577,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + ret = btrfs_reserve_metadata_bytes(si, bytes, flush); /* * If we are an emergency flush, which can steal from the global block @@ -585,7 +587,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + ret = btrfs_reserve_metadata_bytes(si, bytes, flush); } return ret; @@ -1024,13 +1026,18 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->block_rsv) { - ASSERT(!trans->bytes_reserved); - ASSERT(!trans->delayed_refs_bytes_reserved); + ASSERT(trans->bytes_reserved == 0, + "trans->bytes_reserved=%llu", trans->bytes_reserved); + ASSERT(trans->delayed_refs_bytes_reserved == 0, + "trans->delayed_refs_bytes_reserved=%llu", + trans->delayed_refs_bytes_reserved); return; } if (!trans->bytes_reserved) { - ASSERT(!trans->delayed_refs_bytes_reserved); + ASSERT(trans->delayed_refs_bytes_reserved == 0, + "trans->delayed_refs_bytes_reserved=%llu", + trans->delayed_refs_bytes_reserved); return; } @@ -1229,7 +1236,8 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) bool errors = false; int ret; - ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID, + "root_id(log_root)=%llu", btrfs_root_id(log_root)); ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY_LOG1) && @@ -1334,7 +1342,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ - ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING, + "trans->transaction->state=%d", trans->transaction->state); eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, @@ -1468,7 +1477,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ - ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING, + "trans->transaction->state=%d", trans->transaction->state); spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1486,9 +1496,15 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) * At this point we can neither have tasks logging inodes * from a root nor trying to commit a log tree. 
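 *
 * (Editor's note: as elsewhere in this series, compound assertions are
 * split so that a failure report names the exact counter, e.g.:
 *
 *	ASSERT(atomic_read(&root->log_writers) == 0,
 *	       "atomic_read(&root->log_writers)=%d",
 *	       atomic_read(&root->log_writers));
 *
 * one assertion per counter, per the hunk that follows.)
 *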
*/ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); + ASSERT(atomic_read(&root->log_writers) == 0, + "atomic_read(&root->log_writers)=%d", + atomic_read(&root->log_writers)); + ASSERT(atomic_read(&root->log_commit[0]) == 0, + "atomic_read(&root->log_commit[0])=%d", + atomic_read(&root->log_commit[0])); + ASSERT(atomic_read(&root->log_commit[1]) == 0, + "atomic_read(&root->log_commit[1])=%d", + atomic_read(&root->log_commit[1])); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), @@ -2157,7 +2173,8 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP, + "cur_trans->state=%d", cur_trans->state); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2184,7 +2201,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *prev_trans = NULL; int ret; - ASSERT(refcount_read(&trans->use_count) == 1); + ASSERT(refcount_read(&trans->use_count) == 1, + "refcount_read(&trans->use_count)=%d", refcount_read(&trans->use_count)); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9f7c777af635..18ef069197e5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -14,10 +14,6 @@ #include <linux/wait.h> #include "btrfs_inode.h" #include "delayed-ref.h" -#include "extent-io-tree.h" -#include "block-rsv.h" -#include "messages.h" -#include "misc.h" struct dentry; struct inode; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c10b4c242acf..c21c21adf61e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -186,7 +186,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || - key->type == BTRFS_EXTENT_DATA_KEY); + key->type == BTRFS_EXTENT_DATA_KEY, "key->type=%u", key->type); /* * Only subvolume trees along with their reloc trees need this check. 
@@ -1618,10 +1618,9 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(prev_end > key->objectid)) { extent_err(leaf, slot, - "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", - prev_key->objectid, prev_key->type, - prev_key->offset, key->objectid, key->type, - key->offset); + "previous extent " BTRFS_KEY_FMT " overlaps current extent " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(prev_key), + BTRFS_KEY_FMT_VALUE(key)); return -EUCLEAN; } } @@ -2060,10 +2059,9 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) /* Make sure the keys are in the right order */ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { generic_err(leaf, slot, - "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", - prev_key.objectid, prev_key.type, - prev_key.offset, key.objectid, key.type, - key.offset); + "bad key order, prev " BTRFS_KEY_FMT " current " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&prev_key), + BTRFS_KEY_FMT_VALUE(&key)); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } @@ -2181,10 +2179,9 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { generic_err(node, slot, - "bad key order, current (%llu %u %llu) next (%llu %u %llu)", - key.objectid, key.type, key.offset, - next_key.objectid, next_key.type, - next_key.offset); + "bad key order, current " BTRFS_KEY_FMT " next " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key), + BTRFS_KEY_FMT_VALUE(&next_key)); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 30f3c3b849c1..fff37c8d96a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "orphan.h" #include "print-tree.h" #include "tree-checker.h" +#include "delayed-inode.h" #define MAX_CONFLICT_INODES 10 @@ -198,9 +199,9 @@ static void do_abort_log_replay(struct walk_control *wc, const char *function, if (wc->log_leaf) { btrfs_crit(fs_info, - "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", +"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):", btrfs_root_id(wc->root), wc->log_slot, - wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + BTRFS_KEY_FMT_VALUE(&wc->log_key)); btrfs_print_leaf(wc->log_leaf); } @@ -262,7 +263,7 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r struct btrfs_inode *inode; /* Only meant to be called for subvolume roots and not for log roots. */ - ASSERT(btrfs_is_fstree(btrfs_root_id(root))); + ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root)); /* * We're holding a transaction handle whether we are logging or @@ -501,7 +502,7 @@ static int overwrite_item(struct walk_control *wc) * the leaf before writing into the log tree. See the comments at * copy_items() for more details. 
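 *
 * (Editor's note: BTRFS_KEY_FMT / BTRFS_KEY_FMT_VALUE used in these
 * hunks replace the hand-rolled "(%llu %u %llu)" triplets.  Judging by
 * the strings they substitute for, the definitions are presumably:
 *
 *	#define BTRFS_KEY_FMT		"(%llu %u %llu)"
 *	#define BTRFS_KEY_FMT_VALUE(key)			\
 *		(key)->objectid, (key)->type, (key)->offset
 *
 * keeping every printed key in one consistent format.)
 *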
*/ - ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root)); item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); @@ -510,9 +511,9 @@ static int overwrite_item(struct walk_control *wc) ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); return ret; } @@ -601,9 +602,9 @@ static int overwrite_item(struct walk_control *wc) insert: btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - wc->subvol_path->skip_release_on_error = 1; + wc->subvol_path->skip_release_on_error = true; ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); - wc->subvol_path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = false; dst_eb = wc->subvol_path->nodes[0]; dst_slot = wc->subvol_path->slots[0]; @@ -618,9 +619,8 @@ insert: btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { btrfs_abort_log_replay(wc, ret, - "failed to insert item for key (%llu %u %llu)", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset); + "failed to insert item for key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&wc->log_key)); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -829,9 +829,9 @@ static noinline int replay_one_extent(struct walk_control *wc) &wc->log_key, sizeof(*item)); if (ret) { btrfs_abort_log_replay(wc, ret, - "failed to insert item with key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to insert item with key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); goto out; } dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], @@ -1348,9 +1348,9 @@ again: ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - search_key.objectid, search_key.type, - search_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&search_key), + btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1483,9 +1483,9 @@ again: } if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); goto out; } @@ -2282,7 +2282,8 @@ static noinline int replay_one_dir_item(struct walk_control *wc) struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. 
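 *
 * (Editor's note: path flags such as skip_release_on_error,
 * search_commit_root and skip_locking become proper bools in this
 * series.  The insert-or-extend idiom above then reads, roughly:
 *
 *	path->skip_release_on_error = true;
 *	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
 *	path->skip_release_on_error = false;
 *	/* On -EEXIST the path still points at the existing item. */
 *
 * a type cleanup with no behavioural change.)
 *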
*/ - ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY, + "wc->log_key.type=%u", wc->log_key.type); di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); ret = replay_one_name(wc, di); @@ -2434,7 +2435,7 @@ static noinline int check_item_in_log(struct walk_control *wc, * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). */ - ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type); eb = wc->subvol_path->nodes[0]; slot = wc->subvol_path->slots[0]; @@ -2647,7 +2648,7 @@ static noinline int replay_dir_deletes(struct walk_control *wc, int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; - struct btrfs_path *log_path; + BTRFS_PATH_AUTO_FREE(log_path); struct btrfs_inode *dir; dir_key.objectid = dirid; @@ -2664,7 +2665,6 @@ static noinline int replay_dir_deletes(struct walk_control *wc, * we replay the deletes before we copy in the inode item from the log. */ if (IS_ERR(dir)) { - btrfs_free_path(log_path); ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; @@ -2700,10 +2700,9 @@ static noinline int replay_dir_deletes(struct walk_control *wc, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search root %llu for key (%llu %u %llu)", + "failed to search root %llu for key " BTRFS_KEY_FMT, btrfs_root_id(root), - dir_key.objectid, dir_key.type, - dir_key.offset); + BTRFS_KEY_FMT_VALUE(&dir_key)); goto out; } @@ -2745,7 +2744,6 @@ static noinline int replay_dir_deletes(struct walk_control *wc, ret = 0; out: btrfs_release_path(wc->subvol_path); - btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; } @@ -3340,7 +3338,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); return ctx->log_ret; } - ASSERT(log_transid == root->log_transid); + ASSERT(log_transid == root->log_transid, + "log_transid=%d root->log_transid=%d", log_transid, root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ @@ -3480,7 +3479,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = root_log_ctx.log_ret; goto out; } - ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid, + "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d", + root_log_ctx.log_transid, log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { @@ -3584,7 +3585,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. */ - ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid, + "last_log_commit(root)=%d log_transid=%d", + btrfs_get_root_last_log_commit(root), log_transid); btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: @@ -3895,10 +3898,10 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * or the entire directory. 
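 *
 * (Editor's note: both log-deletion helpers below now derive the root
 * from the directory inode ("struct btrfs_root *root = dir->root;")
 * instead of taking it as a parameter, so an illustrative caller is:
 *
 *	btrfs_del_dir_entries_in_log(trans, &fname.disk_name, dir, index);
 *
 * where fname is a caller-side fscrypt_name, assumed here.)
 *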
*/ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { + struct btrfs_root *root = dir->root; BTRFS_PATH_AUTO_FREE(path); int ret; @@ -3933,11 +3936,11 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, - struct btrfs_inode *inode, u64 dirid) + struct btrfs_inode *inode, + struct btrfs_inode *dir) { - struct btrfs_root *log; + struct btrfs_root *root = dir->root; int ret; ret = inode_logged(trans, inode, NULL); @@ -3952,10 +3955,10 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) return; - log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL); + ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode), + btrfs_ino(dir), NULL); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); @@ -4017,7 +4020,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int count) { struct btrfs_root *log = inode->root->log_root; - char *ins_data = NULL; + char AUTO_KFREE(ins_data); struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; @@ -4028,7 +4031,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int ret; int i; - ASSERT(count > 0); + ASSERT(count > 0, "count=%d", count); batch.nr = count; if (count == 1) { @@ -4062,7 +4065,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) - goto out; + return ret; dst = dst_path->nodes[0]; /* @@ -4081,7 +4084,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, btrfs_release_path(dst_path); last_index = batch.keys[count - 1].offset; - ASSERT(last_index > inode->last_dir_index_offset); + ASSERT(last_index > inode->last_dir_index_offset, + "last_index=%llu inode->last_dir_index_offset=%llu", + last_index, inode->last_dir_index_offset); /* * If for some unexpected reason the last item's index is not greater @@ -4094,8 +4099,6 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, if (btrfs_get_first_dir_index_to_log(inode) == 0) btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); -out: - kfree(ins_data); return ret; } @@ -4154,7 +4157,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; - int ret; btrfs_item_key_to_cpu(src, &key, i); @@ -4224,8 +4226,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } if (batch_size > 0) { - int ret; - ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) @@ -4410,7 +4410,9 @@ done: * change in the current transaction), then we don't need to log * a range, last_old_dentry_offset is == to last_offset. 
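 *
 * (Editor's note: AUTO_KFREE(), used above for ins_data, is scope-based
 * cleanup.  Given the declaration style "char AUTO_KFREE(ins_data);",
 * it presumably wraps linux/cleanup.h in the same way as
 * BTRFS_PATH_AUTO_FREE():
 *
 *	#define AUTO_KFREE(name)	*name __free(kfree) = NULL
 *
 * which is why the early returns added in these hunks no longer need a
 * kfree() on every exit path.)
 *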
*/ - ASSERT(last_old_dentry_offset <= last_offset); + ASSERT(last_old_dentry_offset <= last_offset, + "last_old_dentry_offset=%llu last_offset=%llu", + last_old_dentry_offset, last_offset); if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, @@ -4765,7 +4767,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; - char *ins_data; + char AUTO_KFREE(ins_data); int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4893,7 +4895,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr + extent_num_bytes - 1, &ordered_sums, false); if (ret < 0) - goto out; + return ret; ret = 0; list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { @@ -4903,7 +4905,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, kfree(sums); } if (ret) - goto out; + return ret; add_to_batch: ins_sizes[dst_index] = btrfs_item_size(src, src_slot); @@ -4917,11 +4919,11 @@ add_to_batch: * so we don't need to do anything. */ if (batch.nr == 0) - goto out; + return 0; ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) - goto out; + return ret; dst_index = 0; for (int i = 0; i < nr; i++) { @@ -4974,8 +4976,6 @@ copy_item: } btrfs_release_path(dst_path); -out: - kfree(ins_data); return ret; } @@ -5414,12 +5414,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } @@ -5694,9 +5694,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, struct btrfs_inode *inode, u64 *other_ino, u64 *other_parent) { - int ret; BTRFS_PATH_AUTO_FREE(search_path); - char *name = NULL; + char AUTO_KFREE(name); u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); u32 cur_offset = 0; @@ -5705,8 +5704,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path = btrfs_alloc_path(); if (!search_path) return -ENOMEM; - search_path->search_commit_root = 1; - search_path->skip_locking = 1; + search_path->search_commit_root = true; + search_path->skip_locking = true; while (cur_offset < item_size) { u64 parent; @@ -5739,10 +5738,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, char *new_name; new_name = krealloc(name, this_name_len, GFP_NOFS); - if (!new_name) { - ret = -ENOMEM; - goto out; - } + if (!new_name) + return -ENOMEM; name_len = this_name_len; name = new_name; } @@ -5760,28 +5757,24 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, di, &di_key); if (di_key.type == BTRFS_INODE_ITEM_KEY) { if (di_key.objectid != key->objectid) { - ret = 1; *other_ino = di_key.objectid; *other_parent = parent; + return 1; } else { - ret = 0; + return 0; } } else { - ret = -EAGAIN; + return -EAGAIN; } - goto out; } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; + return PTR_ERR(di); } btrfs_release_path(search_path); cur_offset += this_len; } - ret = 0; -out: - kfree(name); - return ret; + + return 0; } /* @@ -6031,8 +6024,8 @@ static int 
conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (WARN_ON_ONCE(ret > 0)) { @@ -6052,8 +6045,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, } btrfs_release_path(path); - path->search_commit_root = 0; - path->skip_locking = 0; + path->search_commit_root = false; + path->skip_locking = false; return ret; } @@ -6543,7 +6536,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, curr = list_next_entry(curr, log_list); } - ASSERT(batch.nr >= 1); + ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr); ret = insert_delayed_items_batch(trans, log, path, &batch, first); curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, @@ -6587,7 +6580,9 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, } last_dir_index = curr->index; - ASSERT(last_dir_index >= first_dir_index); + ASSERT(last_dir_index >= first_dir_index, + "last_dir_index=%llu first_dir_index=%llu", + last_dir_index, first_dir_index); ret = insert_dir_log_key(trans, inode->root->log_root, path, ino, first_dir_index, last_dir_index); @@ -6681,7 +6676,9 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, goto next_batch; last_dir_index = last->index; - ASSERT(last_dir_index >= first_dir_index); + ASSERT(last_dir_index >= first_dir_index, + "last_dir_index=%llu first_dir_index=%llu", + last_dir_index, first_dir_index); /* * If this range starts right after where the previous one ends, * then we want to reuse the previous range item and change its @@ -6748,7 +6745,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, */ lockdep_assert_not_held(&inode->log_mutex); - ASSERT(!ctx->logging_new_delayed_dentries); + ASSERT(!ctx->logging_new_delayed_dentries, + "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries); ctx->logging_new_delayed_dentries = true; list_for_each_entry(item, delayed_ins_list, log_list) { @@ -7169,8 +7167,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; @@ -7203,28 +7201,24 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { - struct btrfs_key inode_key; + u64 dir_id; struct btrfs_inode *dir_inode; - inode_key.type = BTRFS_INODE_ITEM_KEY; - inode_key.offset = 0; - if (key.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *) (ptr + cur_offset); - inode_key.objectid = btrfs_inode_extref_parent( - leaf, extref); + dir_id = btrfs_inode_extref_parent(leaf, extref); cur_offset += sizeof(*extref); cur_offset += btrfs_inode_extref_name_len(leaf, extref); } else { - inode_key.objectid = key.offset; + dir_id = key.offset; cur_offset = item_size; } - dir_inode = btrfs_iget_logging(inode_key.objectid, root); + dir_inode = btrfs_iget_logging(dir_id, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. 
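/*
 * Sketch, not from this series: these hunks upgrade bare ASSERT(cond)
 * calls to ASSERT(cond, fmt, ...) so a failure logs the offending values
 * rather than only the stringified condition. The real btrfs macro
 * surely differs in detail (and keeps the message optional, since some
 * call sites in this patch still pass just the condition), but the shape
 * is roughly:
 */
#ifdef CONFIG_BTRFS_ASSERT
#define ASSERT(cond, fmt, ...)						\
do {									\
	if (unlikely(!(cond))) {					\
		pr_err("assertion failed: %s, " fmt "\n",		\
		       #cond, ##__VA_ARGS__);				\
		BUG();							\
	}								\
} while (0)
#else
#define ASSERT(cond, fmt, ...)	((void)(cond))
#endif

/* e.g. ASSERT(count > 0, "count=%d", count); as in flush_dir_items_batch() */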
This is to prevent @@ -7965,7 +7959,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct fscrypt_name fname; - ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX, + "old_dir_index=%llu", old_dir_index); ret = fscrypt_setup_filename(&old_dir->vfs_inode, &old_dentry->d_name, 0, &fname); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index dc313e6bb2fa..41e47fda036d 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -8,8 +8,7 @@ #include <linux/list.h> #include <linux/fs.h> -#include "messages.h" -#include "ctree.h" +#include <linux/fscrypt.h> #include "transaction.h" struct inode; @@ -80,13 +79,12 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, - struct btrfs_inode *inode, u64 dirid); + struct btrfs_inode *inode, + struct btrfs_inode *dir); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 17b5e81123a1..e3a1310fa7d5 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -27,32 +27,26 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, u8 type, u64 subid) { int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; u32 item_size; unsigned long offset; struct btrfs_key key; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -ENOENT; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -ENOENT; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; btrfs_uuid_to_key(uuid, type, &key); ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return -ENOENT; eb = path->nodes[0]; slot = path->slots[0]; @@ -64,7 +58,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); - goto out; + return ret; } while (item_size) { __le64 data; @@ -78,8 +72,6 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, item_size -= sizeof(data); } -out: - btrfs_free_path(path); return ret; } @@ -89,7 +81,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; int slot; @@ -100,18 +92,14 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ if (ret != -ENOENT) return ret; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -EINVAL; btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); 
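/*
 * Sketch, not from this series: BTRFS_PATH_AUTO_FREE(path) in the
 * uuid-tree conversions above does for btrfs paths what AUTO_KFREE does
 * for heap buffers -- the path is released when it leaves scope, so the
 * "out: btrfs_free_path(path); return ret;" ladders disappear.
 * Presumably built on <linux/cleanup.h> roughly as:
 */
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, if (_T) btrfs_free_path(_T))

#define BTRFS_PATH_AUTO_FREE(path_name)					\
	struct btrfs_path *path_name __free(btrfs_free_path) = NULL

static int path_demo(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* every return from here on frees the path automatically */
	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}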
@@ -134,15 +122,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, key.objectid, key.offset, type); - goto out; + return ret; } - ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, @@ -151,7 +136,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; int slot; @@ -161,29 +146,23 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 unsigned long move_src; unsigned long move_len; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -EINVAL; btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for uuid item!", ret); - goto out; - } - if (ret > 0) { - ret = -ENOENT; - goto out; + return ret; } + if (ret > 0) + return -ENOENT; eb = path->nodes[0]; slot = path->slots[0]; @@ -192,8 +171,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); - ret = -ENOENT; - goto out; + return -ENOENT; } while (item_size) { __le64 read_subid; @@ -205,16 +183,12 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 item_size -= sizeof(read_subid); } - if (!item_size) { - ret = -ENOENT; - goto out; - } + if (!item_size) + return -ENOENT; item_size = btrfs_item_size(eb, slot); - if (item_size == sizeof(subid)) { - ret = btrfs_del_item(trans, uuid_root, path); - goto out; - } + if (item_size == sizeof(subid)) + return btrfs_del_item(trans, uuid_root, path); move_dst = offset; move_src = offset + sizeof(subid); @@ -222,9 +196,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 memmove_extent_buffer(eb, move_dst, move_src, move_len); btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); -out: - btrfs_free_path(path); - return ret; + return 0; } static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, @@ -293,7 +265,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret = 0; struct extent_buffer *leaf; int slot; @@ -301,10 +273,8 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) unsigned long offset; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = 0; @@ -312,17 +282,15 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) again_search_slot: ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; while (1) { - if (btrfs_fs_closing(fs_info)) { - ret = 
-EINTR; - goto out; - } + if (btrfs_fs_closing(fs_info)) + return -EINTR; + cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; @@ -353,7 +321,7 @@ again_search_slot: ret = btrfs_check_uuid_tree_entry(fs_info, uuid, key.type, subid_cpu); if (ret < 0) - goto out; + return ret; if (ret > 0) { btrfs_release_path(path); ret = btrfs_uuid_iter_rem(root, uuid, key.type, @@ -369,7 +337,7 @@ again_search_slot: goto again_search_slot; } if (ret < 0 && ret != -ENOENT) - goto out; + return ret; key.offset++; goto again_search_slot; } @@ -386,8 +354,6 @@ skip: break; } -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..06dfcb461f53 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -109,7 +109,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) { struct btrfs_trans_handle *trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int count = 0; int ret; @@ -121,10 +121,8 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) while (1) { /* 1 for the item being dropped */ trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); /* * Walk backwards through all the items until we find one that @@ -143,7 +141,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) path->slots[0]--; } else if (ret < 0) { btrfs_end_transaction(trans); - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -161,17 +159,14 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) ret = btrfs_del_items(trans, root, path, path->slots[0], 1); if (ret) { btrfs_end_transaction(trans); - goto out; + return ret; } count++; btrfs_release_path(path); btrfs_end_transaction(trans); } - ret = count; btrfs_end_transaction(trans); -out: - btrfs_free_path(path); - return ret; + return count; } /* @@ -217,7 +212,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, const char *src, u64 len) { struct btrfs_trans_handle *trans; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_key key; @@ -233,10 +228,8 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, while (len > 0) { /* 1 for the new item being inserted */ trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); key.objectid = btrfs_ino(inode); key.type = key_type; @@ -267,7 +260,6 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, btrfs_end_transaction(trans); } - btrfs_free_path(path); return ret; } @@ -296,7 +288,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, char *dest, u64 len, struct folio *dest_folio) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_key key; @@ -404,7 +396,6 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, } } out: - btrfs_free_path(path); if (!ret) ret = copied; return ret; @@ -587,6 +578,9 @@ static int btrfs_begin_enable_verity(struct file *filp) btrfs_assert_inode_locked(inode); + if (IS_ENCRYPTED(&inode->vfs_inode)) + 
return -EOPNOTSUPP; + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba3..ae1742a35e76 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -739,7 +739,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; struct path new = { .mnt = NULL, .dentry = NULL }; - char *old_path = NULL; + char AUTO_KFREE(old_path); bool is_same = false; int ret; @@ -765,7 +765,6 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) if (path_equal(&old, &new)) is_same = true; out: - kfree(old_path); path_put(&old); path_put(&new); return is_same; @@ -1681,7 +1680,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u64 search_start; u64 hole_size; u64 max_hole_start; @@ -1711,8 +1710,8 @@ again: } path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = device->devid; key.type = BTRFS_DEV_EXTENT_KEY; @@ -1812,7 +1811,6 @@ next: "max_hole_start=%llu max_hole_size=%llu search_end=%llu", max_hole_start, max_hole_size, search_end); out: - btrfs_free_path(path); *start = max_hole_start; if (len) *len = max_hole_size; @@ -1826,7 +1824,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf = NULL; @@ -1845,7 +1843,7 @@ again: ret = btrfs_previous_item(root, path, key.objectid, BTRFS_DEV_EXTENT_KEY); if (ret) - goto out; + return ret; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1860,7 +1858,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - goto out; + return ret; } *dev_extent_len = btrfs_dev_extent_length(leaf, extent); @@ -1868,8 +1866,6 @@ again: ret = btrfs_del_item(trans, root, path); if (ret == 0) set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); -out: - btrfs_free_path(path); return ret; } @@ -1897,7 +1893,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -1909,13 +1905,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } ret = btrfs_previous_item(fs_info->chunk_root, path, @@ -1928,10 +1923,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, path->slots[0]); *devid_ret = found_key.offset + 1; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -1942,7 +1934,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; - struct btrfs_path *path; + 
BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; @@ -1961,7 +1953,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, &key, sizeof(*dev_item)); btrfs_trans_release_chunk_metadata(trans); if (ret) - goto out; + return ret; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); @@ -1987,10 +1979,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -2002,14 +1991,11 @@ out: static void update_dev_time(const char *device_path) { struct path path; - int ret; - ret = kern_path(device_path, LOOKUP_FOLLOW, &path); - if (ret) - return; - - inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); - path_put(&path); + if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) { + vfs_utimes(&path, NULL); + path_put(&path); + } } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, @@ -2017,7 +2003,7 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = device->fs_info->chunk_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -2031,16 +2017,12 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); btrfs_trans_release_chunk_metadata(trans); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; + if (ret < 0) + return ret; - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } /* @@ -2626,7 +2608,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_dev_item *dev_item; struct btrfs_device *device; @@ -2648,7 +2630,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) ret = btrfs_search_slot(trans, root, &key, path, 0, 1); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) - goto error; + return ret; leaf = path->nodes[0]; next_slot: @@ -2657,7 +2639,7 @@ next_slot: if (ret > 0) break; if (ret < 0) - goto error; + return ret; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); @@ -2688,10 +2670,7 @@ next_slot: path->slots[0]++; goto next_slot; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) @@ -2946,7 +2925,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; @@ -2962,12 +2941,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct 
btrfs_dev_item); @@ -2981,8 +2958,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); -out: - btrfs_free_path(path); return ret; } @@ -3035,7 +3010,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -3048,23 +3023,21 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - else if (unlikely(ret > 0)) { /* Logic error or corruption */ + return ret; + if (unlikely(ret > 0)) { + /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = btrfs_del_item(trans, root, path); if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } -out: - btrfs_free_path(path); return ret; } @@ -3501,7 +3474,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) { struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_chunk *chunk; struct btrfs_key key; @@ -3525,7 +3498,7 @@ again: ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); - goto error; + return ret; } if (unlikely(ret == 0)) { /* @@ -3535,9 +3508,8 @@ again: * offset (one less than the previous one, wrong * alignment and size). */ - ret = -EUCLEAN; mutex_unlock(&fs_info->reclaim_bgs_lock); - goto error; + return -EUCLEAN; } ret = btrfs_previous_item(chunk_root, path, key.objectid, @@ -3545,7 +3517,7 @@ again: if (ret) mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) - goto error; + return ret; if (ret > 0) break; @@ -3579,8 +3551,6 @@ again: } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } -error: - btrfs_free_path(path); return ret; } @@ -4081,7 +4051,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_root *chunk_root = fs_info->chunk_root; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -4252,7 +4222,6 @@ loop: goto again; } error: - btrfs_free_path(path); if (enospc_errors) { btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); @@ -4410,7 +4379,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) { u32 size_buf = 1024; char tmp_buf[192] = {'\0'}; - char *buf; + char AUTO_KFREE(buf); char *bp; u32 size_bp = size_buf; int ret; @@ -4458,8 +4427,6 @@ out_overflow: btrfs_info(fs_info, "balance: %s %s", (bctl->flags & BTRFS_BALANCE_RESUME) ? 
"resume" : "start", buf); - - kfree(buf); } /* @@ -4660,12 +4627,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); + mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); - sb_end_write(fs_info->sb); return ret; } @@ -4709,7 +4676,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4724,17 +4691,14 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0) { /* ret = -ENOENT; */ - ret = 0; - goto out; + return 0; } bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out; - } + if (!bctl) + return -ENOMEM; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); @@ -4771,8 +4735,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) fs_info->balance_ctl = bctl; spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); -out: - btrfs_free_path(path); return ret; } @@ -5593,9 +5555,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct btrfs_device_info *devices_info = NULL; + struct btrfs_device_info AUTO_KFREE(devices_info); struct alloc_chunk_ctl ctl; - struct btrfs_block_group *block_group; int ret; lockdep_assert_held(&info->chunk_mutex); @@ -5628,22 +5589,14 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); ret = gather_device_info(fs_devices, &ctl, devices_info); - if (ret < 0) { - block_group = ERR_PTR(ret); - goto out; - } + if (ret < 0) + return ERR_PTR(ret); ret = decide_stripe_size(fs_devices, &ctl, devices_info); - if (ret < 0) { - block_group = ERR_PTR(ret); - goto out; - } - - block_group = create_chunk(trans, &ctl, devices_info); + if (ret < 0) + return ERR_PTR(ret); -out: - kfree(devices_info); - return block_group; + return create_chunk(trans, &ctl, devices_info); } /* @@ -6076,12 +6029,7 @@ struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, { struct btrfs_io_context *bioc; - bioc = kzalloc( - /* The size of btrfs_io_context */ - sizeof(struct btrfs_io_context) + - /* Plus the variable array for the stripes */ - sizeof(struct btrfs_io_stripe) * (total_stripes), - GFP_NOFS); + bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS); if (!bioc) return NULL; @@ -6807,6 +6755,8 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { + if (args->devt) + return device->devt == args->devt; if (args->missing) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && !device->bdev) @@ -7455,7 +7405,7 @@ static void readahead_tree_node_children(struct extent_buffer *node) int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct 
extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; @@ -7494,7 +7444,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * chunk tree, to keep it simple, just skip locking on the chunk tree. */ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); - path->skip_locking = 1; + path->skip_locking = true; /* * Read all device items, and then all the chunk items. All @@ -7572,8 +7522,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) ret = 0; error: mutex_unlock(&uuid_mutex); - - btrfs_free_path(path); return ret; } @@ -7673,7 +7621,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int ret = 0; path = btrfs_alloc_path(); @@ -7695,8 +7643,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) } out: mutex_unlock(&fs_devices->device_list_mutex); - - btrfs_free_path(path); return ret; } @@ -7705,7 +7651,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_stats_item *ptr; @@ -7724,7 +7670,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, btrfs_warn(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); - goto out; + return ret; } if (ret == 0 && @@ -7735,7 +7681,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, btrfs_warn(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); - goto out; + return ret; } ret = 1; } @@ -7749,7 +7695,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, btrfs_warn(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); - goto out; + return ret; } } @@ -7758,8 +7704,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); -out: - btrfs_free_path(path); return ret; } @@ -8049,7 +7993,7 @@ out: */ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; u64 prev_devid = 0; @@ -8080,17 +8024,15 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) path->reada = READA_FORWARD; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? 
Not good */ - if (unlikely(ret > 0)) { - ret = -EUCLEAN; - goto out; - } + if (unlikely(ret > 0)) + return -EUCLEAN; } while (1) { struct extent_buffer *leaf = path->nodes[0]; @@ -8116,20 +8058,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) - goto out; + return ret; prev_devid = devid; prev_dev_ext_end = physical_offset + physical_len; ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret > 0) { ret = 0; break; @@ -8137,10 +8078,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) } /* Ensure all chunks have corresponding dev extents */ - ret = verify_chunk_dev_extent_mapping(fs_info); -out: - btrfs_free_path(path); - return ret; + return verify_chunk_dev_extent_mapping(fs_info); } /* @@ -8177,12 +8115,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); - sb_end_write(fs_info->sb); return -EBUSY; } @@ -8210,7 +8148,6 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2cbf8080eade..34b854c1a303 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -45,7 +45,7 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN_SHIFT (16) #define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) -static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); +static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); /* Used by sanity check for btrfs_raid_types. */ #define const_ffs(n) (__builtin_ctzll(n) + 1) @@ -58,8 +58,7 @@ static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); */ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); -static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > - ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); +static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); /* ilog2() can handle both constants and variables */ #define BTRFS_BG_FLAG_TO_INDEX(profile) \ @@ -662,6 +661,11 @@ struct btrfs_dev_lookup_args { u64 devid; u8 *uuid; u8 *fsid; + /* + * If devt is specified, all other members will be ignored as it is + * enough to uniquely locate a device. 
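/*
 * Sketch, not from this series: balance_kthread() and
 * relocating_repair_kthread() above swap explicit
 * sb_start_write()/sb_end_write() pairs for guard(super_write)(sb),
 * which ends write access automatically on every return path. Assuming
 * the guard is declared with the stock <linux/cleanup.h> helper:
 */
DEFINE_GUARD(super_write, struct super_block *,
	     sb_start_write(_T), sb_end_write(_T))

static int guarded_kthread_body(struct super_block *sb)
{
	guard(super_write)(sb);		/* sb_start_write(sb) */

	if (!try_get_exclusive_op())	/* hypothetical precondition */
		return -EBUSY;		/* sb_end_write(sb) runs here */

	/* ... do the work ... */
	return 0;			/* and here */
}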
+ */ + dev_t devt; bool missing; }; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 79fb1614bd0c..ab55d10bd71f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -29,9 +29,8 @@ int btrfs_getxattr(const struct inode *inode, const char *name, { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; - int ret = 0; unsigned long data_ptr; path = btrfs_alloc_path(); @@ -41,26 +40,19 @@ int btrfs_getxattr(const struct inode *inode, const char *name, /* lookup the xattr by name */ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, strlen(name), 0); - if (!di) { - ret = -ENODATA; - goto out; - } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (!di) + return -ENODATA; + if (IS_ERR(di)) + return PTR_ERR(di); leaf = path->nodes[0]; /* if size is 0, that means we want the size of the attr */ - if (!size) { - ret = btrfs_dir_data_len(leaf, di); - goto out; - } + if (!size) + return btrfs_dir_data_len(leaf, di); /* now get the data out of our dir_item */ - if (btrfs_dir_data_len(leaf, di) > size) { - ret = -ERANGE; - goto out; - } + if (btrfs_dir_data_len(leaf, di) > size) + return -ERANGE; /* * The way things are packed into the leaf is like this @@ -73,11 +65,7 @@ int btrfs_getxattr(const struct inode *inode, const char *name, btrfs_dir_name_len(leaf, di)); read_extent_buffer(leaf, buffer, data_ptr, btrfs_dir_data_len(leaf, di)); - ret = btrfs_dir_data_len(leaf, di); - -out: - btrfs_free_path(path); - return ret; + return btrfs_dir_data_len(leaf, di); } int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, @@ -85,7 +73,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); size_t name_len = strlen(name); int ret = 0; @@ -97,7 +85,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->skip_release_on_error = 1; + path->skip_release_on_error = true; if (!value) { di = btrfs_lookup_xattr(trans, root, path, @@ -212,7 +200,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ } out: - btrfs_free_path(path); if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -278,7 +265,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; @@ -354,8 +341,6 @@ next: else ret = total_size; - btrfs_free_path(path); - return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d1db7fa1fe58..359a98e6de85 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -37,8 +37,8 @@ #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) -#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) -#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) +#define BTRFS_SB_LOG_FIRST_SHIFT ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT ilog2(BTRFS_SB_LOG_SECOND_OFFSET) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 @@ -93,7 +93,8 @@ static int sb_write_pointer(struct 
block_device *bdev, struct blk_zone *zones, sector_t sector; for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { - ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); + ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL, + "zones[%d].type=%d", i, zones[i].type); empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); full[i] = sb_zone_is_full(&zones[i]); } @@ -166,14 +167,14 @@ static inline u32 sb_zone_number(int shift, int mirror) { u64 zone = U64_MAX; - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX, "mirror=%d", mirror); switch (mirror) { case 0: zone = 0; break; case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - ASSERT(zone <= U32_MAX); + ASSERT(zone <= U32_MAX, "zone=%llu", zone); return (u32)zone; } @@ -240,7 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, unsigned int i; u32 zno; - ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + ASSERT(IS_ALIGNED(pos, zinfo->zone_size), + "pos=%llu zinfo->zone_size=%llu", pos, zinfo->zone_size); zno = pos >> zinfo->zone_size_shift; /* * We cannot report zones beyond the zone end. So, it is OK to @@ -264,8 +266,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, } } - ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, - copy_zone_info_cb, zones); + ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT, + *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { btrfs_err(device->fs_info, "zoned: failed to read zone %llu on %s (devid %llu)", @@ -494,6 +496,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_ACTIVE: __set_bit(nreported, zone_info->active_zones); nactive++; break; @@ -896,9 +899,9 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, if (sb_zone + 1 >= nr_zones) return -ENOENT; - ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), - BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, - zones); + ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev), + BTRFS_NR_SB_LOG_ZONES, + copy_zone_info_cb, zones); if (ret < 0) return ret; if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES)) @@ -1055,8 +1058,10 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, bool have_sb; int i; - ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); - ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size), + "hole_start=%llu zinfo->zone_size=%llu", hole_start, zinfo->zone_size); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size), + "num_bytes=%llu zinfo->zone_size=%llu", num_bytes, zinfo->zone_size); while (pos < hole_end) { begin = pos >> shift; @@ -1172,8 +1177,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) u64 pos; int ret; - ASSERT(IS_ALIGNED(start, zinfo->zone_size)); - ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + ASSERT(IS_ALIGNED(start, zinfo->zone_size), + "start=%llu, zinfo->zone_size=%llu", start, zinfo->zone_size); + ASSERT(IS_ALIGNED(size, zinfo->zone_size), + "size=%llu, zinfo->zone_size=%llu", size, zinfo->zone_size); if (begin + nbits > zinfo->nr_zones) return -ERANGE; @@ -1628,7 +1635,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; - struct zone_info 
*zone_info = NULL; + struct zone_info AUTO_KFREE(zone_info); int ret; int i; unsigned long *active = NULL; @@ -1782,7 +1789,6 @@ out: cache->physical_map = NULL; } bitmap_free(active); - kfree(zone_info); return ret; } @@ -1809,14 +1815,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; if (!btrfs_is_zoned(fs_info)) return false; - if (!inode || !is_data_inode(inode)) + if (!is_data_inode(inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) @@ -1867,7 +1873,7 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, em = btrfs_search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); /* The em should be a new COW extent, thus it should not have an offset. */ - ASSERT(em->offset == 0); + ASSERT(em->offset == 0, "em->offset=%llu", em->offset); em->disk_bytenr = logical; btrfs_free_extent_map(em); write_unlock(&em_tree->lock); @@ -2578,7 +2584,8 @@ again: struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0]; int factor; - ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC, + "reloc_sinfo->subgroup_id=%d", reloc_sinfo->subgroup_id); factor = btrfs_bg_type_to_factor(bg->flags); down_write(&space_info->groups_sem); @@ -2592,9 +2599,9 @@ again: space_info->disk_total -= bg->length * factor; space_info->disk_total -= bg->zone_unusable; /* There is no allocation ever happened. */ - ASSERT(bg->used == 0); + ASSERT(bg->used == 0, "bg->used=%llu", bg->used); /* No super block in a block group on the zoned setup. */ - ASSERT(bg->bytes_super == 0); + ASSERT(bg->bytes_super == 0, "bg->bytes_super=%llu", bg->bytes_super); spin_unlock(&space_info->lock); bg->space_info = reloc_sinfo; @@ -2620,7 +2627,8 @@ again: /* Allocate new BG in the data relocation space_info. */ space_info = data_sinfo->sub_group[0]; - ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC, + "space_info->subgroup_id=%d", space_info->subgroup_id); ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret == 1) { @@ -2750,10 +2758,9 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) return ret < 0 ? ret : 1; } -int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool do_finish) +int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_block_group *bg; int index; @@ -2962,7 +2969,8 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num * This holds because we currently reset fully used then freed * block group. 
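/*
 * Not from this series, just restating the recurring pattern: several
 * functions in these hunks stop taking a separate fs_info (or root)
 * argument once an object they already receive can supply it --
 * space_info->fs_info for btrfs_zoned_activate_one_bg(),
 * inode->root->fs_info in btrfs_use_zone_append(), dir->root in the
 * tree-log helpers. The callee derives the context itself:
 *
 *	int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info,
 *					bool do_finish)
 *	{
 *		struct btrfs_fs_info *fs_info = space_info->fs_info;
 *		...
 *	}
 *
 * leaving callers one fewer way to pass a mismatched pair.
 */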
*/ - ASSERT(reclaimed == bg->zone_capacity); + ASSERT(reclaimed == bg->zone_capacity, + "reclaimed=%llu bg->zone_capacity=%llu", reclaimed, bg->zone_capacity); bg->free_space_ctl->free_space += reclaimed; space_info->bytes_zone_unusable -= reclaimed; spin_unlock(&bg->lock); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 17c5656580dd..5cefdeb08b7b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -15,7 +15,6 @@ #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" -#include "fs.h" struct block_device; struct extent_buffer; @@ -94,8 +93,7 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); -int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, bool do_finish); +int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ @@ -262,8 +260,7 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) return 1; } -static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static inline int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish) { /* Consider all the block groups are active */ diff --git a/fs/buffer.c b/fs/buffer.c index 6a8752f7bbed..838c0c571022 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -611,9 +611,9 @@ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, return err; ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); @@ -2732,7 +2732,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, loff_t i_size = i_size_read(inode); /* Is the folio fully inside i_size? */ - if (folio_pos(folio) + folio_size(folio) <= i_size) + if (folio_next_pos(folio) <= i_size) return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? 
(truncate in progress) */ diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 3e63cfe15874..a08250d244ea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -9,6 +9,7 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/file.h> +#include <linux/namei.h> #include <linux/falloc.h> #include <trace/events/fscache.h> #include "internal.h" @@ -428,11 +429,13 @@ static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie) if (!old_tmpfile) { struct cachefiles_volume *volume = object->volume; struct dentry *fan = volume->fanout[(u8)cookie->key_hash]; + struct dentry *obj; - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); - cachefiles_bury_object(volume->cache, object, fan, - old_file->f_path.dentry, - FSCACHE_OBJECT_INVALIDATED); + obj = start_removing_dentry(fan, old_file->f_path.dentry); + if (!IS_ERR(obj)) + cachefiles_bury_object(volume->cache, object, + fan, obj, + FSCACHE_OBJECT_INVALIDATED); } fput(old_file); } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d1edb2ac3837..e5ec90dccc27 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -93,12 +93,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); retry: ret = cachefiles_inject_read_error(); if (ret == 0) - subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir); + subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname)); else subdir = ERR_PTR(ret); trace_cachefiles_lookup(NULL, dir, subdir); @@ -129,10 +128,12 @@ retry: if (ret < 0) goto mkdir_error; ret = cachefiles_inject_write_error(); - if (ret == 0) - subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - else + if (ret == 0) { + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL); + } else { + end_creating(subdir); subdir = ERR_PTR(ret); + } if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); @@ -141,7 +142,7 @@ retry: trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { - dput(subdir); + end_creating(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -154,7 +155,7 @@ retry: /* Tell rmdir() it's not allowed to delete the subdir */ inode_lock(d_inode(subdir)); - inode_unlock(d_inode(dir)); + end_creating_keep(subdir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", @@ -196,14 +197,11 @@ mark_error: return ERR_PTR(-EBUSY); mkdir_error: - inode_unlock(d_inode(dir)); - if (!IS_ERR(subdir)) - dput(subdir); + end_creating(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); @@ -263,6 +261,8 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, * - File backed objects are unlinked * - Directory backed objects are stuffed into the graveyard for userspace to * delete + * On entry dir must be locked. It will be unlocked on exit. + * On entry there must be at least 2 refs on rep, one will be dropped on exit. 
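/*
 * Sketch, not from this series: the fs/cachefiles conversions here
 * replace open-coded "inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 * dget(rep); ... dput(rep); inode_unlock(...)" sequences with the newer
 * start_removing*()/end_removing() helpers. Judging by the conversions,
 * start_removing_dentry(dir, de) locks the parent, revalidates that de
 * is still a positive child of dir, and returns a referenced dentry (or
 * an ERR_PTR); end_removing() drops the reference and unlocks. A caller
 * then reads as:
 */
static int remove_child_demo(struct dentry *dir, struct dentry *child)
{
	struct dentry *victim;
	int ret;

	victim = start_removing_dentry(dir, child);	/* parent now locked */
	if (IS_ERR(victim))
		return PTR_ERR(victim);

	ret = do_the_unlink(dir, victim);	/* hypothetical unlink step */
	end_removing(victim);			/* unref + unlock parent */
	return ret;
}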
*/ int cachefiles_bury_object(struct cachefiles_cache *cache, struct cachefiles_object *object, @@ -278,27 +278,23 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); if (rep->d_parent != dir) { - inode_unlock(d_inode(dir)); + end_removing(rep); _leave(" = -ESTALE"); return -ESTALE; } /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { - dget(rep); /* Stop the dentry being negated if it's only pinned - * by a file struct. - */ ret = cachefiles_unlink(cache, object, dir, rep, why); - dput(rep); + end_removing(rep); - inode_unlock(d_inode(dir)); _leave(" = %d", ret); return ret; } /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - inode_unlock(d_inode(dir)); + end_removing(rep); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -425,13 +421,12 @@ int cachefiles_delete_object(struct cachefiles_object *object, _enter(",OBJ%x{%pD}", object->debug_id, object->file); - /* Stop the dentry being negated if it's only pinned by a file struct. */ - dget(dentry); - - inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT); - ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); - inode_unlock(d_backing_inode(fan)); - dput(dentry); + dentry = start_removing_dentry(fan, dentry); + if (IS_ERR(dentry)) + ret = PTR_ERR(dentry); + else + ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); + end_removing(dentry); return ret; } @@ -644,9 +639,13 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) if (!d_is_reg(dentry)) { pr_err("%pd is not a file\n", dentry); - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); - ret = cachefiles_bury_object(volume->cache, object, fan, dentry, - FSCACHE_OBJECT_IS_WEIRD); + struct dentry *de = start_removing_dentry(fan, dentry); + if (IS_ERR(de)) + ret = PTR_ERR(de); + else + ret = cachefiles_bury_object(volume->cache, object, + fan, de, + FSCACHE_OBJECT_IS_WEIRD); dput(dentry); if (ret < 0) return false; @@ -679,36 +678,41 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, _enter(",%pD", object->file); - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); + dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name)); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); - goto out_unlock; + goto out; } - if (!d_is_negative(dentry)) { + /* + * This loop will only execute more than once if some other thread + * races to create the object we are trying to create. 
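/*
 * Sketch, not from this series: the creation side gets matching helpers.
 * start_creating(idmap, parent, name) appears to lock the parent and
 * return the (possibly negative) child dentry, end_creating() unlocks
 * and drops it, and end_creating_keep() -- used by
 * cachefiles_get_directory() above -- releases the parent while keeping
 * the child. The lookup/unlink/retry loop in cachefiles_commit_tmpfile()
 * then looks, in outline, like:
 */
static struct dentry *claim_slot_demo(struct dentry *fan, const char *name)
{
	struct dentry *de;

	de = start_creating(&nop_mnt_idmap, fan, &QSTR(name));
	while (!IS_ERR(de) && !d_is_negative(de)) {
		/* evict the stale occupant, then look the name up again */
		if (unlink_stale(fan, de) < 0) {	/* hypothetical */
			end_creating(de);
			return ERR_PTR(-EBUSY);
		}
		end_creating(de);
		de = start_creating(&nop_mnt_idmap, fan, &QSTR(name));
	}
	return de;	/* negative child with parent locked, or ERR_PTR */
}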
+ */ + while (!d_is_negative(dentry)) { ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) - goto out_dput; + goto out_end; + + end_creating(dentry); - dput(dentry); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); + dentry = start_creating(&nop_mnt_idmap, fan, + &QSTR(object->d_name)); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); - goto out_unlock; + goto out; } } @@ -729,10 +733,9 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, success = true; } -out_dput: - dput(dentry); -out_unlock: - inode_unlock(d_inode(fan)); +out_end: + end_creating(dentry); +out: _leave(" = %u", success); return success; } @@ -748,26 +751,20 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, struct dentry *victim; int ret = -ENOENT; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename)); - victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir); if (IS_ERR(victim)) goto lookup_error; - if (d_is_negative(victim)) - goto lookup_put; if (d_inode(victim)->i_flags & S_KERNEL_FILE) goto lookup_busy; return victim; lookup_busy: ret = -EBUSY; -lookup_put: - inode_unlock(d_inode(dir)); - dput(victim); + end_removing(victim); return ERR_PTR(ret); lookup_error: - inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ @@ -815,18 +812,17 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, ret = cachefiles_bury_object(cache, NULL, dir, victim, FSCACHE_OBJECT_WAS_CULLED); + dput(victim); if (ret < 0) goto error; fscache_count_culled(); - dput(victim); _leave(" = 0"); return 0; error_unlock: - inode_unlock(d_inode(dir)); + end_removing(victim); error: - dput(victim); if (ret == -ENOENT) return -ESTALE; /* Probably got retired by the netfs */ diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c index 781aac4ef274..90ba926f488e 100644 --- a/fs/cachefiles/volume.c +++ b/fs/cachefiles/volume.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/slab.h> +#include <linux/namei.h> #include "internal.h" #include <trace/events/fscache.h> @@ -58,9 +59,11 @@ retry: if (ret < 0) { if (ret != -ESTALE) goto error_dir; - inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT); - cachefiles_bury_object(cache, NULL, cache->store, vdentry, - FSCACHE_VOLUME_IS_WEIRD); + vdentry = start_removing_dentry(cache->store, vdentry); + if (!IS_ERR(vdentry)) + cachefiles_bury_object(cache, NULL, cache->store, + vdentry, + FSCACHE_VOLUME_IS_WEIRD); cachefiles_put_directory(volume->dentry); cond_resched(); goto retry; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 930fbd54d2c8..f678bab189d8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -26,7 +26,7 @@ void 
ceph_fscache_register_inode_cookie(struct inode *inode) return; /* Only new inodes! */ - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return; WARN_ON_ONCE(ci->netfs.cache); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index 7026e794813c..928746b92512 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -329,7 +329,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) out: kfree(cryptbuf); if (dir != parent) { - if ((dir->i_state & I_NEW)) + if ((inode_state_read_once(dir) & I_NEW)) discard_new_inode(dir); else iput(dir); @@ -438,7 +438,7 @@ out: fscrypt_fname_free_buffer(&_tname); out_inode: if (dir != fname->dir) { - if ((dir->i_state & I_NEW)) + if ((inode_state_read_once(dir) & I_NEW)) discard_new_inode(dir); else iput(dir); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d18c0eaef9b7..bf50c6e7a029 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2155,7 +2155,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, " rfiles: %20lld\n" " rsubdirs: %20lld\n" "rbytes: %20lld\n" - "rctime: %10lld.%09ld\n", + "rctime: %ptSp\n", ci->i_files + ci->i_subdirs, ci->i_files, ci->i_subdirs, @@ -2163,8 +2163,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, ci->i_rfiles, ci->i_rsubdirs, ci->i_rbytes, - ci->i_rctime.tv_sec, - ci->i_rctime.tv_nsec); + &ci->i_rctime); } if (*ppos >= dfi->dir_info_len) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 99b30f784ee2..983390069f73 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -740,7 +740,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode, vino.ino, ceph_ino(dir), dentry->d_name.name); ceph_dir_clear_ordered(dir); ceph_init_inode_acls(inode, as_ctx); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { /* * If it's not I_NEW, then someone created this before * we got here. 
Assume the server is aware of it at @@ -901,7 +901,7 @@ retry: new_inode = NULL; goto out_req; } - WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); + WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW)); spin_lock(&dentry->d_lock); di->flags |= CEPH_DENTRY_ASYNC_CREATE; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a6e260d9e420..a596cb53f1ac 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -132,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, goto out_err; } - inode->i_state = 0; + inode_state_assign_raw(inode, 0); inode->i_mode = *mode; err = ceph_security_init_secctx(dentry, *mode, as_ctx); @@ -201,7 +201,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, doutc(cl, "on %llx=%llx.%llx got %p new %d\n", ceph_present_inode(inode), ceph_vinop(inode), inode, - !!(inode->i_state & I_NEW)); + !!(inode_state_read_once(inode) & I_NEW)); return inode; } @@ -228,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) goto err; } - if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { + if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", inode->i_mode); goto err; @@ -261,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) } } #endif - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ @@ -270,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) return inode; err: - if ((inode->i_state & I_NEW)) + if ((inode_state_read_once(inode) & I_NEW)) discard_new_inode(inode); else iput(inode); @@ -744,7 +744,7 @@ void ceph_evict_inode(struct inode *inode) netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_NETFS_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); @@ -879,7 +879,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 iatime = inode_get_atime(inode); struct timespec64 ictime = inode_get_ctime(inode); + struct timespec64 imtime = inode_get_mtime(inode); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -889,39 +891,26 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || timespec64_compare(ctime, &ictime) > 0) { - doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", - ictime.tv_sec, ictime.tv_nsec, - ctime->tv_sec, ctime->tv_nsec); + doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime); inode_set_ctime_to_ts(inode, *ctime); } if (ci->i_version == 0 || ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ - doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n", - inode_get_mtime_sec(inode), - inode_get_mtime_nsec(inode), - mtime->tv_sec, mtime->tv_nsec, - ci->i_time_warp_seq, (int)time_warp_seq); + doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime, + ci->i_time_warp_seq, (int)time_warp_seq); inode_set_mtime_to_ts(inode, *mtime); inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { - struct timespec64 ts; - /* nobody did utimes(); take the max */ - ts = inode_get_mtime(inode); - if (timespec64_compare(mtime, 
&ts) > 0) { - doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n", - ts.tv_sec, ts.tv_nsec, - mtime->tv_sec, mtime->tv_nsec); + if (timespec64_compare(mtime, &imtime) > 0) { + doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime); inode_set_mtime_to_ts(inode, *mtime); } - ts = inode_get_atime(inode); - if (timespec64_compare(atime, &ts) > 0) { - doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n", - ts.tv_sec, ts.tv_nsec, - atime->tv_sec, atime->tv_nsec); + if (timespec64_compare(atime, &iatime) > 0) { + doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime); inode_set_atime_to_ts(inode, *atime); } } else if (issued & CEPH_CAP_FILE_EXCL) { @@ -1013,7 +1002,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, le64_to_cpu(info->version), ci->i_version); /* Once I_NEW is cleared, we can't change type or dev numbers */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_mode = mode; } else { if (inode_wrong_type(inode, mode)) { @@ -1090,7 +1079,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, #ifdef CONFIG_FS_ENCRYPTION if (iinfo->fscrypt_auth_len && - ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) { + ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) { kfree(ci->fscrypt_auth); ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; ci->fscrypt_auth = iinfo->fscrypt_auth; @@ -1692,13 +1681,13 @@ retry_lookup: pr_err_client(cl, "badness %p %llx.%llx\n", in, ceph_vinop(in)); req->r_target_inode = NULL; - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) discard_new_inode(in); else iput(in); goto done; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); } @@ -1898,11 +1887,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, pr_err_client(cl, "inode badness on %p got %d\n", in, rc); err = rc; - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } - } else if (in->i_state & I_NEW) { + } else if (inode_state_read_once(in) & I_NEW) { unlock_new_inode(in); } @@ -2114,7 +2103,7 @@ retry_lookup: pr_err_client(cl, "badness on %p %llx.%llx\n", in, ceph_vinop(in)); if (d_really_is_negative(dn)) { - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } @@ -2124,7 +2113,7 @@ retry_lookup: err = ret; goto next_item; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); if (d_really_is_negative(dn)) { @@ -2703,10 +2692,8 @@ retry: if (ia_valid & ATTR_ATIME) { struct timespec64 atime = inode_get_atime(inode); - doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n", - inode, ceph_vinop(inode), - atime.tv_sec, atime.tv_nsec, - attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n", + inode, ceph_vinop(inode), &atime, &attr->ia_atime); if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; inode_set_atime_to_ts(inode, attr->ia_atime); @@ -2780,10 +2767,8 @@ retry: if (ia_valid & ATTR_MTIME) { struct timespec64 mtime = inode_get_mtime(inode); - doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n", - inode, ceph_vinop(inode), - mtime.tv_sec, mtime.tv_nsec, - attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n", + inode, ceph_vinop(inode), &mtime, &attr->ia_mtime); if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; inode_set_mtime_to_ts(inode, attr->ia_mtime); @@ 
-2804,13 +2789,11 @@ retry: /* these do nothing */ if (ia_valid & ATTR_CTIME) { + struct timespec64 ictime = inode_get_ctime(inode); bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; - doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n", - inode, ceph_vinop(inode), - inode_get_ctime_sec(inode), - inode_get_ctime_nsec(inode), - attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n", + inode, ceph_vinop(inode), &ictime, &attr->ia_ctime, only ? "ctime only" : "ignored"); if (only) { /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ad0cf177e75a..f6bf24b5c683 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1149,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, const char *path = fsc->mount_options->server_path ? fsc->mount_options->server_path + 1 : ""; - err = __ceph_open_session(fsc->client, started); + err = __ceph_open_session(fsc->client); if (err < 0) goto out; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 537165db4519..ad1f30bea175 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -249,8 +249,7 @@ static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_rctime.tv_sec, - ci->i_rctime.tv_nsec); + return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_rctime); } /* dir pin */ @@ -307,8 +306,7 @@ static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci) static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, size_t size) { - return ceph_fmt_xattr(val, size, "%lld.%09ld", ci->i_snap_btime.tv_sec, - ci->i_snap_btime.tv_nsec); + return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_snap_btime); } static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 62a3d2565c26..70bb0579b40c 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -70,7 +70,7 @@ retry: if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cii = ITOC(inode); /* we still need to set i_ino for things like stat(2) */ inode->i_ino = hash; @@ -148,7 +148,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) /* we should never see newly created inodes because we intentionally * fail in the initialization callback */ - BUG_ON(inode->i_state & I_NEW); + BUG_ON(inode_state_read_once(inode) & I_NEW); return inode; } diff --git a/fs/coredump.c b/fs/coredump.c index 5c1c381ee380..8feb9c1cf83d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -708,7 +708,7 @@ static bool coredump_sock_connect(struct core_name *cn, struct coredump_params * */ pidfs_coredump(cprm); - retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, + retval = kernel_connect(socket, (struct sockaddr_unsized *)(&addr), addr_len, O_NONBLOCK | SOCK_COREDUMP); if (retval) { @@ -1036,7 +1036,7 @@ static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm, static bool coredump_write(struct core_name *cn, struct coredump_params *cprm, - struct linux_binfmt *binfmt) + const struct linux_binfmt *binfmt) { if (dump_interrupted()) @@ -1086,119 +1086,119 @@ static inline bool coredump_skip(const struct coredump_params *cprm, return false; } -void vfs_coredump(const kernel_siginfo_t *siginfo) +static void do_coredump(struct core_name *cn, struct 
coredump_params *cprm, + size_t **argv, int *argc, const struct linux_binfmt *binfmt) { - struct cred *cred __free(put_cred) = NULL; - size_t *argv __free(kfree) = NULL; - struct core_state core_state; - struct core_name cn; - struct mm_struct *mm = current->mm; - struct linux_binfmt *binfmt = mm->binfmt; - const struct cred *old_cred; - int argc = 0; - struct coredump_params cprm = { - .siginfo = siginfo, - .limit = rlimit(RLIMIT_CORE), - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency of bit flags, since this flag is not protected - * by any locks. - * - * Note that we only care about MMF_DUMP* flags. - */ - .mm_flags = __mm_flags_get_dumpable(mm), - .vma_meta = NULL, - .cpu = raw_smp_processor_id(), - }; - - audit_core_dumps(siginfo->si_signo); - - if (coredump_skip(&cprm, binfmt)) - return; - - cred = prepare_creds(); - if (!cred) - return; - /* - * We cannot trust fsuid as being the "true" uid of the process - * nor do we know its entire history. We only know it was tainted - * so we dump it as root in mode 2, and only into a controlled - * environment (pipe handler or fully qualified path). - */ - if (coredump_force_suid_safe(&cprm)) - cred->fsuid = GLOBAL_ROOT_UID; - - if (coredump_wait(siginfo->si_signo, &core_state) < 0) - return; - - old_cred = override_creds(cred); - - if (!coredump_parse(&cn, &cprm, &argv, &argc)) { + if (!coredump_parse(cn, cprm, argv, argc)) { coredump_report_failure("format_corename failed, aborting core"); - goto close_fail; + return; } - switch (cn.core_type) { + switch (cn->core_type) { case COREDUMP_FILE: - if (!coredump_file(&cn, &cprm, binfmt)) - goto close_fail; + if (!coredump_file(cn, cprm, binfmt)) + return; break; case COREDUMP_PIPE: - if (!coredump_pipe(&cn, &cprm, argv, argc)) - goto close_fail; + if (!coredump_pipe(cn, cprm, *argv, *argc)) + return; break; case COREDUMP_SOCK_REQ: fallthrough; case COREDUMP_SOCK: - if (!coredump_socket(&cn, &cprm)) - goto close_fail; + if (!coredump_socket(cn, cprm)) + return; break; default: WARN_ON_ONCE(true); - goto close_fail; + return; } /* Don't even generate the coredump. */ - if (cn.mask & COREDUMP_REJECT) - goto close_fail; + if (cn->mask & COREDUMP_REJECT) + return; /* get us an unshared descriptor table; almost always a no-op */ /* The cell spufs coredump code reads the file descriptor tables */ if (unshare_files()) - goto close_fail; + return; - if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt)) - goto close_fail; + if ((cn->mask & COREDUMP_KERNEL) && !coredump_write(cn, cprm, binfmt)) + return; - coredump_sock_shutdown(cprm.file); + coredump_sock_shutdown(cprm->file); /* Let the parent know that a coredump was generated. */ - if (cn.mask & COREDUMP_USERSPACE) - cn.core_dumped = true; + if (cn->mask & COREDUMP_USERSPACE) + cn->core_dumped = true; /* * When core_pipe_limit is set we wait for the coredump server * or usermodehelper to finish before exiting so it can e.g., * inspect /proc/<pid>. 
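[Note: the vfs_coredump() split in this hunk leans on the scope-based cleanup helpers from <linux/cleanup.h>: argv is tagged __free(kfree), the credentials come from CLASS(prepare_creds, cred)(), and the override_creds()/revert_creds() pair collapses into scoped_with_creds(). A minimal sketch of the __free() idiom, assuming the usual cleanup.h semantics; setup() and use() are hypothetical stand-ins:

#include <linux/cleanup.h>
#include <linux/slab.h>

static int setup(size_t *argv);		/* hypothetical */
static void use(size_t *argv);		/* hypothetical */

static void demo(void)
{
	size_t *argv __free(kfree) = kcalloc(32, sizeof(*argv), GFP_KERNEL);

	if (!argv)
		return;
	if (setup(argv) < 0)
		return;		/* kfree(argv) runs automatically here */
	use(argv);		/* ...and on fallthrough out of scope */
}

The payoff is exactly what the hunk shows: every "goto close_fail" unwind path disappears because freeing is attached to scope exit rather than to labels.]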
*/ - if (cn.mask & COREDUMP_WAIT) { - switch (cn.core_type) { + if (cn->mask & COREDUMP_WAIT) { + switch (cn->core_type) { case COREDUMP_PIPE: - wait_for_dump_helpers(cprm.file); + wait_for_dump_helpers(cprm->file); break; case COREDUMP_SOCK_REQ: fallthrough; case COREDUMP_SOCK: - coredump_sock_wait(cprm.file); + coredump_sock_wait(cprm->file); break; default: break; } } +} + +void vfs_coredump(const kernel_siginfo_t *siginfo) +{ + size_t *argv __free(kfree) = NULL; + struct core_state core_state; + struct core_name cn; + const struct mm_struct *mm = current->mm; + const struct linux_binfmt *binfmt = mm->binfmt; + int argc = 0; + struct coredump_params cprm = { + .siginfo = siginfo, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + * + * Note that we only care about MMF_DUMP* flags. + */ + .mm_flags = __mm_flags_get_dumpable(mm), + .vma_meta = NULL, + .cpu = raw_smp_processor_id(), + }; + + audit_core_dumps(siginfo->si_signo); + + if (coredump_skip(&cprm, binfmt)) + return; + + CLASS(prepare_creds, cred)(); + if (!cred) + return; + /* + * We cannot trust fsuid as being the "true" uid of the process + * nor do we know its entire history. We only know it was tainted + * so we dump it as root in mode 2, and only into a controlled + * environment (pipe handler or fully qualified path). + */ + if (coredump_force_suid_safe(&cprm)) + cred->fsuid = GLOBAL_ROOT_UID; + + if (coredump_wait(siginfo->si_signo, &core_state) < 0) + return; -close_fail: + scoped_with_creds(cred) + do_coredump(&cn, &cprm, &argv, &argc, binfmt); coredump_cleanup(&cn, &cprm); - revert_creds(old_cred); return; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index ca54bf24b719..e54ebe402df7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -95,7 +95,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; switch (cramfs_inode->mode & S_IFMT) { diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 3adbd7167055..5e939ea3ac28 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -945,7 +945,7 @@ static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4bd3918f50e3..40fa05688d3a 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -834,7 +834,7 @@ int fscrypt_drop_inode(struct inode *inode) * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ - if (inode->i_state & I_DIRTY_ALL) + if (inode_state_read(inode) & I_DIRTY_ALL) return 0; /* @@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) /* already zeroed? we're done. 
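[Note: the fs/dax.c hunks around this point track an iomap API change: iomap_iter_advance() used to take a u64 * that it wrote the remaining length back into; it now takes the advance by value, so loops that relied on the write-back must requery iomap_length(). The resulting loop shape, as a sketch with a hypothetical per-chunk worker process_one():

#include <linux/iomap.h>

static ssize_t process_one(struct iomap_iter *iter);	/* hypothetical */

static int walk(struct iomap_iter *iter)
{
	int ret;

	do {
		ssize_t done = process_one(iter);

		if (done < 0)
			return done;
		/* the advance is now passed by value ... */
		ret = iomap_iter_advance(iter, done);
		if (ret)
			return ret;
		/* ... so the remaining extent is re-read each pass */
	} while (iomap_length(iter) > 0);
	return 0;
}

This matches the dax_zero_iter() conversion below, where "while (length > 0)" becomes "while ((length = iomap_length(iter)) > 0)".]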
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); /* * invalidate the pages whose sharing state is to be changed @@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (ret < 0) return ret; - ret = iomap_iter_advance(iter, &length); + ret = iomap_iter_advance(iter, length); if (ret) return ret; - } while (length > 0); + } while ((length = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; @@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { done = iov_iter_zero(min(length, end - pos), iter); - return iomap_iter_advance(iomi, &done); + return iomap_iter_advance(iomi, done); } } @@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - length = xfer; - ret = iomap_iter_advance(iomi, &length); + ret = iomap_iter_advance(iomi, xfer); if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; + length = iomap_length(iomi); } dax_read_unlock(id); @@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) { - u64 length = PAGE_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (!(ret & VM_FAULT_ERROR)) + iter.status = iomap_iter_advance(&iter, PAGE_SIZE); } if (iomap_errp) @@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) { - u64 length = PMD_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (ret != VM_FAULT_FALLBACK) + iter.status = iomap_iter_advance(&iter, PMD_SIZE); } unlock_entry: @@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; - u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, dax_read_unlock(id); advance: - dest_len = len; - ret = iomap_iter_advance(it_src, &len); + ret = iomap_iter_advance(it_src, len); if (!ret) - ret = iomap_iter_advance(it_dest, &dest_len); + ret = iomap_iter_advance(it_dest, len); return ret; out_unlock: diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..9143fd502def 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __ro_after_init; +static struct kmem_cache *__dentry_cache __ro_after_init; +#define dentry_cache runtime_const_ptr(__dentry_cache) const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -794,7 +795,7 @@ void d_mark_dontcache(struct inode *inode) de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } - inode->i_state |= I_DONTCACHE; + inode_state_set(inode, I_DONTCACHE); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); @@ -1073,7 +1074,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode) spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no 
aliases left - if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { + if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { @@ -1980,14 +1981,8 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } @@ -2306,11 +2301,20 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; - if (d_unhashed(dentry)) - continue; if (dentry->d_name.hash_len != hashlen) continue; - if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)) + continue; + /* + * Check for the dentry being unhashed. + * + * As tempting as it is, we *can't* skip it because of a race window + * between us finding the dentry before it gets unhashed and loading + * the sequence counter after unhashing is finished. + * + * We can at least predict on it. + */ + if (unlikely(d_unhashed(dentry))) continue; *seqp = seq; return dentry; @@ -3222,9 +3226,10 @@ static void __init dcache_init(void) * but it is probably not worth it because of the cache nature * of the dcache. */ - dentry_cache = KMEM_CACHE_USERCOPY(dentry, + __dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); + runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 661a99a7dfbe..532bd7c46baf 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -403,7 +403,7 @@ static struct dentry *debugfs_start_creating(const char *name, return dentry; } -static struct dentry *failed_creating(struct dentry *dentry) +static struct dentry *debugfs_failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); @@ -411,7 +411,7 @@ static struct dentry *failed_creating(struct dentry *dentry) return ERR_PTR(-ENOMEM); } -static struct dentry *end_creating(struct dentry *dentry) +static struct dentry *debugfs_end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; @@ -435,7 +435,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - failed_creating(dentry); + debugfs_failed_creating(dentry); return ERR_PTR(-EPERM); } @@ -443,7 +443,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } inode->i_mode = mode; @@ -458,7 +458,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } 
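[Note: the dcache change above is the runtime-constant trick: instead of loading the dentry_cache pointer from memory on every allocation and free, runtime_const_ptr() sites are patched with the final value once dcache_init() has created the cache. A sketch of the pattern as used there; my_struct and my_cache_init() are illustrative stand-ins, and the runtime-const plumbing is assumed to come from the arch headers as it does for the existing dentry-hashtable users:

struct my_struct { int x; };

static struct kmem_cache *__my_cache __ro_after_init;
#define my_cache runtime_const_ptr(__my_cache)

static void __init my_cache_init(void)
{
	__my_cache = KMEM_CACHE(my_struct, SLAB_PANIC);
	/* patch every my_cache (runtime_const_ptr) load site */
	runtime_const_init(ptr, __my_cache);
}

After init, my_cache reads compile down to an immediate rather than a memory load, which is why __ro_after_init is a prerequisite.]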
struct dentry *debugfs_create_file_full(const char *name, umode_t mode, @@ -585,7 +585,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - failed_creating(dentry); + debugfs_failed_creating(dentry); return ERR_PTR(-EPERM); } @@ -593,7 +593,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; @@ -605,7 +605,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@ -632,7 +632,7 @@ struct dentry *debugfs_create_automount(const char *name, return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - failed_creating(dentry); + debugfs_failed_creating(dentry); return ERR_PTR(-EPERM); } @@ -640,7 +640,7 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } make_empty_dir_inode(inode); @@ -652,7 +652,7 @@ struct dentry *debugfs_create_automount(const char *name, d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); @@ -699,13 +699,13 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); @@ -842,7 +842,8 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . int error = 0; const char *new_name; struct name_snapshot old_name; - struct dentry *parent, *target; + struct dentry *target; + struct renamedata rd = {}; struct inode *dir; va_list ap; @@ -855,36 +856,31 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . 
if (!new_name) return -ENOMEM; - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock(dir); + rd.old_parent = dget_parent(dentry); + rd.new_parent = rd.old_parent; + rd.flags = RENAME_NOREPLACE; + target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent); + if (IS_ERR(target)) + return PTR_ERR(target); - take_dentry_name_snapshot(&old_name, dentry); - - if (WARN_ON_ONCE(dentry->d_parent != parent)) { - error = -EINVAL; - goto out; - } - if (strcmp(old_name.name.name, new_name) == 0) - goto out; - target = lookup_noperm(&QSTR(new_name), parent); - if (IS_ERR(target)) { - error = PTR_ERR(target); - goto out; - } - if (d_really_is_positive(target)) { - dput(target); - error = -EINVAL; + error = start_renaming_two_dentries(&rd, dentry, target); + if (error) { + if (error == -EEXIST && target == dentry) + /* it isn't an error to rename a thing to itself */ + error = 0; goto out; } - simple_rename_timestamp(dir, dentry, dir, target); - d_move(dentry, target); - dput(target); + + dir = d_inode(rd.old_parent); + take_dentry_name_snapshot(&old_name, dentry); + simple_rename_timestamp(dir, dentry, dir, rd.new_dentry); + d_move(dentry, rd.new_dentry); fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); -out: release_dentry_name_snapshot(&old_name); - inode_unlock(dir); - dput(parent); + end_renaming(&rd); +out: + dput(rd.old_parent); + dput(target); kfree_const(new_name); return error; } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9a0b6c2b6b01..b3958008ba3f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1126,7 +1126,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) static int sctp_bind_addrs(struct socket *sock, __be16 port) { struct sockaddr_storage localaddr; - struct sockaddr *addr = (struct sockaddr *)&localaddr; + struct sockaddr_unsized *addr = (struct sockaddr_unsized *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { @@ -1599,7 +1599,7 @@ static int dlm_connect(struct connection *con) log_print_ratelimited("connecting to %d", con->nodeid); make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); - result = kernel_connect(sock, (struct sockaddr *)&addr, addr_len, 0); + result = kernel_connect(sock, (struct sockaddr_unsized *)&addr, addr_len, 0); switch (result) { case -EINPROGRESS: /* not an error */ @@ -1813,7 +1813,7 @@ static int dlm_tcp_bind(struct socket *sock) memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); - result = kernel_bind(sock, (struct sockaddr *)&src_addr, + result = kernel_bind(sock, (struct sockaddr_unsized *)&src_addr, addr_len); if (result < 0) { /* This *may* not indicate a critical error */ @@ -1852,7 +1852,7 @@ static int dlm_tcp_listen_bind(struct socket *sock) /* Bind to our port */ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0], + return kernel_bind(sock, (struct sockaddr_unsized *)&dlm_local_addr[0], addr_len); } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 019a8b4eaaf9..49f56a598ecb 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -28,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) * inodes without pages but we deliberately won't in case * we need to reschedule to avoid softlockups. 
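[Note: debugfs_change_name() above, and ecryptfs_rename() further down, drop their open-coded lock_rename() plus parent/hashedness/ancestor-"trap" checks in favour of start_renaming_two_dentries() driven by struct renamedata, with end_renaming() as the single unwind. The caller shape implied by those two sites, as a sketch:

static int demo_rename(struct dentry *old_parent, struct dentry *old_dentry,
		       struct dentry *new_parent, struct dentry *new_dentry)
{
	struct renamedata rd = {};
	int err;

	rd.mnt_idmap  = &nop_mnt_idmap;
	rd.old_parent = old_parent;
	rd.new_parent = new_parent;

	/* locks both parents; validates hashedness and the ancestor trap */
	err = start_renaming_two_dentries(&rd, old_dentry, new_dentry);
	if (err)
		return err;	/* nothing held on failure */

	err = vfs_rename(&rd);
	end_renaming(&rd);	/* unlocks; assumed to drop pinned refs too */
	return err;
}

The -EINVAL/-ENOTEMPTY trap handling that ecryptfs used to spell out by hand is, by implication, folded into the helper.]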
*/ - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 1bdeaa6d5790..c2f4fb41b4e6 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -4,7 +4,7 @@ config ECRYPT_FS depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC - select CRYPTO_MD5 + select CRYPTO_LIB_MD5 help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/filesystems/ecryptfs.rst> to learn more about diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 69536cacdea8..260f8a4938b0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -9,7 +9,6 @@ * Michael C. Thompson <mcthomps@us.ibm.com> */ -#include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/fs.h> #include <linux/mount.h> @@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -/** - * ecryptfs_calculate_md5 - calculates the md5 of @src - * @dst: Pointer to 16 bytes of allocated memory - * @crypt_stat: Pointer to crypt_stat struct for the current inode - * @src: Data to be md5'd - * @len: Length of @src - * - * Uses the allocated crypto context that crypt_stat references to - * generate the MD5 sum of the contents of src. - */ -static int ecryptfs_calculate_md5(char *dst, - struct ecryptfs_crypt_stat *crypt_stat, - char *src, int len) -{ - int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out; - } -out: - return rc; -} - static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, char *cipher_name, char *chaining_modifier) @@ -104,13 +77,10 @@ out: * * Generate the initialization vector from the given root IV and page * offset. - * - * Returns zero on success; non-zero on error. */ -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; char src[ECRYPTFS_MAX_IV_BYTES + 16]; @@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, ecryptfs_printk(KERN_DEBUG, "source:\n"); ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); } - rc = ecryptfs_calculate_md5(dst, crypt_stat, src, - (crypt_stat->iv_bytes + 16)); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating IV for a page\n"); - goto out; - } + md5(src, crypt_stat->iv_bytes + 16, dst); memcpy(iv, dst, crypt_stat->iv_bytes); if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); } -out: - return rc; } /** @@ -151,29 +113,14 @@ out: * * Initialize the crypt_stat structure. 
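[Note: the ecryptfs conversion here swaps a per-inode crypto_shash MD5 transform for the synchronous MD5 library (CRYPTO_LIB_MD5): md5() is a one-shot digest that needs no allocated tfm and cannot fail, which is why ecryptfs_derive_iv() and ecryptfs_init_crypt_stat() lose their error returns. The pattern in sketch form:

#include <crypto/md5.h>		/* MD5_DIGEST_SIZE, md5() */
#include <linux/string.h>

static void derive_root_iv(const u8 *key, size_t key_len,
			   u8 *iv, size_t iv_bytes)
{
	u8 dst[MD5_DIGEST_SIZE];

	md5(key, key_len, dst);		/* one-shot; no tfm, no failure */
	memcpy(iv, dst, iv_bytes);	/* caller ensures iv_bytes <= 16 */
}

Removing the fallible tfm allocation is also what lets ecryptfs_alloc_inode() in super.c below stop handling an init failure path.]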
*/ -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) { - struct crypto_shash *tfm; - int rc; - - tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); - if (IS_ERR(tfm)) { - rc = PTR_ERR(tfm); - ecryptfs_printk(KERN_ERR, "Error attempting to " - "allocate crypto context; rc = [%d]\n", - rc); - return rc; - } - memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); INIT_LIST_HEAD(&crypt_stat->keysig_list); mutex_init(&crypt_stat->keysig_list_mutex); mutex_init(&crypt_stat->cs_mutex); mutex_init(&crypt_stat->cs_tfm_mutex); - crypt_stat->hash_tfm = tfm; crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; - - return 0; } /** @@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) struct ecryptfs_key_sig *key_sig, *key_sig_tmp; crypto_free_skcipher(crypt_stat->tfm); - crypto_free_shash(crypt_stat->hash_tfm); list_for_each_entry_safe(key_sig, key_sig_tmp, &crypt_stat->keysig_list, crypt_stat_list) { list_del(&key_sig->crypt_stat_list); @@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, int rc; extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size)); - rc = ecryptfs_derive_iv(extent_iv, crypt_stat, - (extent_base + extent_offset)); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " - "extent [0x%.16llx]; rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - goto out; - } + ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset); sg_init_table(&src_sg, 1); sg_init_table(&dst_sg, 1); @@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) */ int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); BUG_ON(crypt_stat->iv_bytes <= 0); if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { - rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Session key not valid; " "cannot generate root IV\n"); - goto out; - } - rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, - crypt_stat->key_size); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating root IV\n"); - goto out; - } - memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); -out: - if (rc) { memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + return -EINVAL; } - return rc; + md5(crypt_stat->key, crypt_stat->key_size, dst); + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); + return 0; } static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 9e6ab0b41337..62a2ea7f59ed 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -14,6 +14,7 @@ #ifndef ECRYPTFS_KERNEL_H #define ECRYPTFS_KERNEL_H +#include <crypto/md5.h> #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> @@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key) + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 -#define ECRYPTFS_DEFAULT_HASH "md5" -#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key 
*key) * ECRYPTFS_MAX_IV_BYTES */ #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ -#define MD5_DIGEST_SIZE 16 -#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE #define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ @@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat { unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct crypto_skcipher *tfm; - struct crypto_shash *hash_tfm; /* Crypto context for generating - * the initialization vectors */ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; @@ -558,7 +553,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_rotate_iv(unsigned char *iv); -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); @@ -693,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, char *data, size_t max_packet_size); int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, struct ecryptfs_mount_crypt_stat *mount_crypt_stat); -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset); +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ed1394da8d6b..3978248247dc 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -24,18 +24,26 @@ #include <linux/unaligned.h> #include "ecryptfs_kernel.h" -static int lock_parent(struct dentry *dentry, - struct dentry **lower_dentry, - struct inode **lower_dir) +static struct dentry *ecryptfs_start_creating_dentry(struct dentry *dentry) { - struct dentry *lower_dir_dentry; + struct dentry *parent = dget_parent(dentry); + struct dentry *ret; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - *lower_dir = d_inode(lower_dir_dentry); - *lower_dentry = ecryptfs_dentry_to_lower(dentry); + ret = start_creating_dentry(ecryptfs_dentry_to_lower(parent), + ecryptfs_dentry_to_lower(dentry)); + dput(parent); + return ret; +} - inode_lock_nested(*lower_dir, I_MUTEX_PARENT); - return (*lower_dentry)->d_parent == lower_dir_dentry ? 
0 : -EINVAL; +static struct dentry *ecryptfs_start_removing_dentry(struct dentry *dentry) +{ + struct dentry *parent = dget_parent(dentry); + struct dentry *ret; + + ret = start_removing_dentry(ecryptfs_dentry_to_lower(parent), + ecryptfs_dentry_to_lower(dentry)); + dput(parent); + return ret; } static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) @@ -95,7 +103,7 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, iput(lower_inode); return ERR_PTR(-EACCES); } - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) iput(lower_inode); return inode; @@ -106,7 +114,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode, { struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); - if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + if (!IS_ERR(inode) && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; @@ -141,15 +149,12 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *lower_dir; int rc; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - dget(lower_dentry); // don't even try to make the lower negative - if (!rc) { - if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, - NULL); - } + lower_dentry = ecryptfs_start_removing_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + + lower_dir = lower_dentry->d_parent->d_inode; + rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -158,8 +163,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); out_unlock: - dput(lower_dentry); - inode_unlock(lower_dir); + end_removing(lower_dentry); if (!rc) d_drop(dentry); return rc; @@ -186,10 +190,11 @@ ecryptfs_do_create(struct inode *directory_inode, struct inode *lower_dir; struct inode *inode; - rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_create(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, true); + lower_dentry = ecryptfs_start_creating_dentry(ecryptfs_dentry); + if (IS_ERR(lower_dentry)) + return ERR_CAST(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode, NULL); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -205,7 +210,7 @@ ecryptfs_do_create(struct inode *directory_inode, fsstack_copy_attr_times(directory_inode, lower_dir); fsstack_copy_inode_size(directory_inode, lower_dir); out_lock: - inode_unlock(lower_dir); + end_creating(lower_dentry); return inode; } @@ -364,7 +369,7 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, } } - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); return d_splice_alias(inode, dentry); } @@ -433,10 +438,12 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); - rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); - if (!rc) - rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, - lower_new_dentry, NULL); + lower_new_dentry = ecryptfs_start_creating_dentry(new_dentry); + if (IS_ERR(lower_new_dentry)) + return PTR_ERR(lower_new_dentry); + lower_dir = 
lower_new_dentry->d_parent->d_inode; + rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, + lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); @@ -448,7 +455,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); i_size_write(d_inode(new_dentry), file_size_save); out_lock: - inode_unlock(lower_dir); + end_creating(lower_new_dentry); return rc; } @@ -468,9 +475,11 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, size_t encoded_symlen; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (rc) - goto out_lock; + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + mount_crypt_stat = &ecryptfs_superblock_to_private( dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, @@ -480,7 +489,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, if (rc) goto out_lock; rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry, - encoded_symname); + encoded_symname, NULL); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) goto out_lock; @@ -490,7 +499,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out_lock: - inode_unlock(lower_dir); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -501,14 +510,16 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, { int rc; struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; struct inode *lower_dir; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (rc) - goto out; - + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return lower_dentry; + lower_dir_dentry = dget(lower_dentry->d_parent); + lower_dir = lower_dir_dentry->d_inode; lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, - lower_dentry, mode); + lower_dentry, mode, NULL); rc = PTR_ERR(lower_dentry); if (IS_ERR(lower_dentry)) goto out; @@ -522,7 +533,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_inode_size(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); out: - inode_unlock(lower_dir); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return ERR_PTR(rc); @@ -534,21 +545,18 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct inode *lower_dir; int rc; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - dget(lower_dentry); // don't even try to make the lower negative - if (!rc) { - if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); - } + lower_dentry = ecryptfs_start_removing_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); if (!rc) { clear_nlink(d_inode(dentry)); fsstack_copy_attr_times(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); } - dput(lower_dentry); - inode_unlock(lower_dir); + end_removing(lower_dentry); if (!rc) d_drop(dentry); return rc; @@ -562,10 +570,12 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry 
*lower_dentry; struct inode *lower_dir; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_mknod(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, dev); + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + + rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev, NULL); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -574,7 +584,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out: - inode_unlock(lower_dir); + end_removing(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -590,7 +600,6 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; - struct dentry *trap; struct inode *target_inode; struct renamedata rd = {}; @@ -605,31 +614,13 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, target_inode = d_inode(new_dentry); - trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - if (IS_ERR(trap)) - return PTR_ERR(trap); - dget(lower_new_dentry); - rc = -EINVAL; - if (lower_old_dentry->d_parent != lower_old_dir_dentry) - goto out_lock; - if (lower_new_dentry->d_parent != lower_new_dir_dentry) - goto out_lock; - if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry)) - goto out_lock; - /* source should not be ancestor of target */ - if (trap == lower_old_dentry) - goto out_lock; - /* target should not be ancestor of source */ - if (trap == lower_new_dentry) { - rc = -ENOTEMPTY; - goto out_lock; - } + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = lower_old_dir_dentry; + rd.new_parent = lower_new_dir_dentry; + rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry); + if (rc) + return rc; - rd.mnt_idmap = &nop_mnt_idmap; - rd.old_parent = lower_old_dir_dentry; - rd.old_dentry = lower_old_dentry; - rd.new_parent = lower_new_dir_dentry; - rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); if (rc) goto out_lock; @@ -640,8 +631,7 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: - dput(lower_new_dentry); - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + end_renaming(&rd); return rc; } @@ -903,11 +893,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap, struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) { - rc = ecryptfs_init_crypt_stat(crypt_stat); - if (rc) - return rc; - } + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 7f9f68c00ef6..bbf8603242fa 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -11,7 +11,6 @@ * Trevor S. 
Highland <trevor.highland@gmail.com> */ -#include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/string.h> #include <linux/pagemap.h> @@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct crypto_skcipher *skcipher_tfm; struct skcipher_request *skcipher_req; char iv[ECRYPTFS_MAX_IV_BYTES]; - char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - struct crypto_shash *hash_tfm; - struct shash_desc *hash_desc; + char hash[MD5_DIGEST_SIZE]; }; /* @@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "password tokens\n", __func__); goto out_free_unlock; } - s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0); - if (IS_ERR(s->hash_tfm)) { - rc = PTR_ERR(s->hash_tfm); - printk(KERN_ERR "%s: Error attempting to " - "allocate hash crypto context; rc = [%d]\n", - __func__, rc); - goto out_free_unlock; - } - - s->hash_desc = kmalloc(sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); - if (!s->hash_desc) { - rc = -ENOMEM; - goto out_release_free_unlock; - } - s->hash_desc->tfm = s->hash_tfm; - - rc = crypto_shash_digest(s->hash_desc, - (u8 *)s->auth_tok->token.password.session_key_encryption_key, - s->auth_tok->token.password.session_key_encryption_key_bytes, - s->hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out_release_free_unlock; - } + md5(s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes, + s->hash); for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { s->block_aligned_filename[s->j] = - s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; - if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) - == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { - rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash, - ECRYPTFS_TAG_70_DIGEST_SIZE, - s->tmp_hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; " - "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; - } - memcpy(s->hash, s->tmp_hash, - ECRYPTFS_TAG_70_DIGEST_SIZE); - } + s->hash[s->j % MD5_DIGEST_SIZE]; + if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1)) + md5(s->hash, MD5_DIGEST_SIZE, s->hash); if (s->block_aligned_filename[s->j] == '\0') s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; } @@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert filename memory to scatterlist; rc = [%d]. " "block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, s->dst_sg, 2); @@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert encrypted filename memory to scatterlist; " "rc = [%d]. block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } /* The characters in the first block effectively do the job * of the IV here, so we just use 0's for the IV. 
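[Note: the tag-70 packet code above previously juggled an allocated shash descriptor plus a tmp_hash buffer to generate filename padding; with the library call the loop reduces to recycling one MD5 digest as a byte stream and re-hashing it in place every 16 bytes. A condensed sketch of that generator; buf, nbytes, and the key arguments are stand-ins for the fields of the real packet state:

static void fill_padding(u8 *buf, size_t nbytes,
			 const u8 *session_key, size_t session_key_len)
{
	u8 hash[MD5_DIGEST_SIZE];
	size_t j;

	md5(session_key, session_key_len, hash);
	for (j = 0; j < nbytes; j++) {
		buf[j] = hash[j % MD5_DIGEST_SIZE];
		/* digest exhausted: rehash in place for the next 16 bytes */
		if ((j % MD5_DIGEST_SIZE) == MD5_DIGEST_SIZE - 1)
			md5(hash, MD5_DIGEST_SIZE, hash);
		if (buf[j] == '\0')
			buf[j] = ECRYPTFS_NON_NULL;	/* keep it NUL-free */
	}
}

Since md5() cannot fail, the out_release_free_unlock error label and the kfree_sensitive() of the descriptor both disappear, as the hunk shows.]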
Note the @@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, rc, s->auth_tok->token.password.session_key_encryption_key, mount_crypt_stat->global_default_fn_cipher_key_bytes); - goto out_release_free_unlock; + goto out_free_unlock; } skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg, s->block_aligned_filename_size, s->iv); @@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt filename; " "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; + goto out_free_unlock; } s->i += s->block_aligned_filename_size; (*packet_size) = s->i; (*remaining_bytes) -= (*packet_size); -out_release_free_unlock: - crypto_free_shash(s->hash_tfm); out_free_unlock: kfree_sensitive(s->block_aligned_filename); out_unlock: @@ -850,7 +808,6 @@ out: key_put(auth_tok_key); } skcipher_request_free(s->skcipher_req); - kfree_sensitive(s->hash_desc); kfree(s); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 16ea14dd2c62..c12dc680f8fe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -12,6 +12,7 @@ #include <linux/dcache.h> #include <linux/file.h> +#include <linux/fips.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> @@ -454,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out; } + if (fips_enabled) { + rc = -EINVAL; + err = "eCryptfs support is disabled due to FIPS"; + goto out; + } + s = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(s)) { rc = PTR_ERR(s); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e7b7f426fecf..3bc21d677564 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; - if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); - goto out; - } + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); mutex_init(&inode_info->lower_file_mutex); atomic_set(&inode_info->lower_file_count, 0); inode_info->lower_file = NULL; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1f4d8ce56667..6de97565d5f7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = { .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, .parameters = efivarfs_parameters, + .fs_flags = FS_POWER_FREEZE, }; static __init int efivarfs_init(void) diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 462619e59766..28407578f83a 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -62,7 +62,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) inode = iget_locked(super, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; in = INODE_INFO(inode); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8ca29962a3dd..bb13c4cb8455 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio) { trace_erofs_read_folio(folio, true); - return iomap_read_folio(folio, &erofs_iomap_ops); + iomap_bio_read_folio(folio, &erofs_iomap_ops); + return 0; } static void erofs_readahead(struct readahead_control *rac) @@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac) 
trace_erofs_readahead(rac->mapping->host, readahead_index(rac), readahead_count(rac), true); - return iomap_readahead(rac, &erofs_iomap_ops); + iomap_bio_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 301647b84ca9..932e8b353ba1 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -47,7 +47,6 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { - const struct cred *old_cred; struct iov_iter iter; int ret; @@ -61,9 +60,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); - old_cred = override_creds(rq->iocb.ki_filp->f_cred); - ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); - revert_creds(old_cred); + scoped_with_creds(rq->iocb.ki_filp->f_cred) + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret); } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index cb780c095d28..bce98c845a18 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,7 +295,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { int err = erofs_fill_inode(inode); if (err) { diff --git a/fs/eventfd.c b/fs/eventfd.c index af42b2c7d235..3219e0d596fe 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -378,9 +378,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); static int do_eventfd(unsigned int count, int flags) { - struct eventfd_ctx *ctx; - struct file *file; - int fd; + struct eventfd_ctx *ctx __free(kfree) = NULL; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); @@ -398,26 +396,19 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; - fd = get_unused_fd_flags(flags); - if (fd < 0) - goto err; - - file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, - ctx, flags, FMODE_NOWAIT); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto err; - } - fd_install(fd, file); - return fd; -err: - eventfd_free_ctx(ctx); - return fd; + + FD_PREPARE(fdf, flags, + anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, ctx, + flags, FMODE_NOWAIT)); + if (fdf.err) + return fdf.err; + + ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); + retain_and_null_ptr(ctx); + return fd_publish(fdf); } SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ee7c4b683ec3..6c36d9dc6926 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2165,9 +2165,8 @@ static void clear_tfile_check_list(void) */ static int do_epoll_create(int flags) { - int error, fd; - struct eventpoll *ep = NULL; - struct file *file; + int error; + struct eventpoll *ep; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -2184,26 +2183,15 @@ static int do_epoll_create(int flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. 
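[Note: do_eventfd() above and do_epoll_create() just below adopt the FD_PREPARE()/fd_publish() pair: reserve a descriptor and create the backing file in one declaration, check a single .err field, and make the fd visible only at the end. Judging from the call sites, an unpublished fdf is unwound automatically (put_unused_fd() plus dropping the file), though that cleanup behaviour is an assumption here. A caller modeled on those two sites; demo_fops is a hypothetical file_operations instance:

static int demo_create_fd(int flags)
{
	FD_PREPARE(fdf, flags,
		   anon_inode_getfile("[demo]", &demo_fops, NULL, flags));
	if (fdf.err)
		return fdf.err;

	/* the file can still be reached before it becomes visible */
	fd_prepare_file(fdf)->private_data = NULL;

	return fd_publish(fdf);	/* fd_install() + return the descriptor */
}

This is what lets eventpoll set ep->file via fd_prepare_file() and eventfd defer its fallible ida_alloc() until after the only remaining failure point.]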
*/ - fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); - if (fd < 0) { - error = fd; - goto out_free_ep; - } - file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, - O_RDWR | (flags & O_CLOEXEC)); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto out_free_fd; + FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC), + anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, + O_RDWR | (flags & O_CLOEXEC))); + if (fdf.err) { + ep_clear_and_put(ep); + return fdf.err; } - ep->file = file; - fd_install(fd, file); - return fd; - -out_free_fd: - put_unused_fd(fd); -out_free_ep: - ep_clear_and_put(ep); - return error; + ep->file = fd_prepare_file(fdf); + return fd_publish(fdf); } SYSCALL_DEFINE1(epoll_create1, int, flags) diff --git a/fs/exec.c b/fs/exec.c index 4298e7e08d5d..9d5ebc9d15b0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1280,10 +1280,9 @@ int begin_new_exec(struct linux_binprm * bprm) /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { - retval = get_unused_fd_flags(0); + retval = FD_ADD(0, bprm->executable); if (retval < 0) goto out_unlock; - fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } @@ -1775,7 +1774,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - rseq_set_notify_resume(current); + rseq_force_update(); current->in_execve = 0; return retval; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 7f9592856bf7..74d451f732c7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ - sb_min_blocksize(sb, 512); + if (!sb_min_blocksize(sb, 512)) { + exfat_err(sb, "unable to set blocksize"); + return -EINVAL; + } /* read boot sector */ sbi->boot_bh = sb_bread(sb, 0); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e10c376843d7..dbfe9098a124 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1398,7 +1398,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = EXT2_I(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e99306a8f47c..78ea864fa8cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -202,8 +202,7 @@ void ext4_evict_inode(struct inode *inode) * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. 
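do_eventfd() and do_epoll_create() above are both converted from the open-coded get_unused_fd_flags()/anon_inode_getfile()/fd_install() sequence to the new FD_PREPARE()/fd_publish() helpers, whose definitions are not part of this chunk. The userspace-visible behavior of the two syscalls is unchanged; for reference, a minimal runnable userspace sketch exercising the pair:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>

int main(void)
{
	int efd = eventfd(3, EFD_CLOEXEC);	/* counter starts at 3 */
	int epfd = epoll_create1(EPOLL_CLOEXEC);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = efd };
	uint64_t val;

	if (efd < 0 || epfd < 0)
		return 1;
	epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
	epoll_wait(epfd, &ev, 1, -1);		/* ready: counter is nonzero */
	read(efd, &val, sizeof(val));		/* val == 3, counter resets to 0 */
	printf("eventfd counter was %llu\n", (unsigned long long)val);
	close(efd);
	close(epfd);
	return 0;
}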
*/ - if (!list_empty_careful(&inode->i_io_list)) - inode_io_list_del(inode); + inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any @@ -425,7 +424,7 @@ void ext4_check_map_extents_env(struct inode *inode) if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; @@ -1319,8 +1318,8 @@ retry_grab: if (IS_ERR(folio)) return PTR_ERR(folio); - if (pos + len > folio_pos(folio) + folio_size(folio)) - len = folio_pos(folio) + folio_size(folio) - pos; + if (len > folio_next_pos(folio) - pos) + len = folio_next_pos(folio) - pos; from = offset_in_folio(folio, pos); to = from + len; @@ -2619,10 +2618,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; @@ -2704,7 +2700,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->start_pos = folio_pos(folio); - mpd->next_pos = folio_pos(folio) + folio_size(folio); + mpd->next_pos = folio_next_pos(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -3146,8 +3142,8 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); - if (pos + len > folio_pos(folio) + folio_size(folio)) - len = folio_pos(folio) + folio_size(folio) - pos; + if (len > folio_next_pos(folio) - pos) + len = folio_next_pos(folio) - pos; ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); @@ -3473,7 +3469,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; - return inode->i_state & I_DIRTY_DATASYNC; + return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, @@ -4552,7 +4548,7 @@ int ext4_truncate(struct inode *inode) * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. 
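Two small helpers appear in the ext4 hunks above (and again in f2fs and gfs2 further down) without being defined in this chunk. From the expressions they replace, they are presumably equivalent to:

	/* first byte after this folio in the file; replaces folio_pos() + folio_size() */
	static inline loff_t folio_next_pos(struct folio *folio)
	{
		return folio_pos(folio) + folio_size(folio);
	}

	/* replaces the open-coded sync_mode/tagged_writepages test at each call site */
	static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc)
	{
		if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
			return PAGECACHE_TAG_TOWRITE;
		return PAGECACHE_TAG_DIRTY;
	}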
*/ - if (!(inode->i_state & (I_NEW|I_FREEING))) + if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); @@ -5210,7 +5206,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); @@ -5549,7 +5545,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index ab1ff51302fb..6f57c181ff77 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -57,16 +57,12 @@ static int write_mmp_block_thawed(struct super_block *sb, static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { - int err; - /* * We protect against freezing so that we don't create dirty buffers * on frozen filesystem. */ - sb_start_write(sb); - err = write_mmp_block_thawed(sb, bh); - sb_end_write(sb); - return err; + scoped_guard(super_write, sb) + return write_mmp_block_thawed(sb, bh); } /* diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 82d5e7501455..5fd54adf0c88 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!sbi->s_journal || is_bad_inode(inode)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; @@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index d4d7f329d23f..fa8d81a30fb9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -9,6 +9,7 @@ * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ +#include <linux/fs_struct.h> #include <linux/f2fs_fs.h> #include "f2fs.h" #include "xattr.h" diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..be53e06caf3d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1329,7 +1329,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, } folio = page_folio(cc->rpages[last_index]); - psize = folio_pos(folio) + folio_size(folio); + psize = folio_next_pos(folio); err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 775aa4f63aa3..8bf4feda42b0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) @@ -4222,7 +4219,7 @@ static int f2fs_iomap_begin(struct 
inode *inode, loff_t offset, loff_t length, if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; - if ((inode->i_state & I_DIRTY_DATASYNC) || + if ((inode_state_read_once(inode) & I_DIRTY_DATASYNC) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8c4eafe9ffac..f1cda1900658 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -569,7 +569,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (is_meta_ino(sbi, ino)) { f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b882771e4699..af40282a6948 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -844,7 +844,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } else { if (file) @@ -1057,7 +1057,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto put_out_dir; spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; + inode_state_clear(whiteout, I_LINKABLE); spin_unlock(&whiteout->i_lock); iput(whiteout); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index db7afb806411..47489d48f2b9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1798,7 +1798,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { + if ((!inode_unhashed(inode) && inode_state_read(inode) & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ __iget(inode); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9648ed097816..0b6009cd1844 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -22,6 +22,7 @@ #include <linux/unaligned.h> #include <linux/random.h> #include <linux/iversion.h> +#include <linux/fs_struct.h> #include "fat.h" #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -1595,8 +1596,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc, setup(sb); /* flavour-specific stuff that needs options */ + error = -EINVAL; + if (!sb_min_blocksize(sb, 512)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } error = -EIO; - sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); diff --git a/fs/fcntl.c b/fs/fcntl.c index 72f8433d9109..f93dbca08435 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -445,6 +445,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + struct delegation deleg; int argi = (int)arg; struct flock flock; long err = -EINVAL; @@ -550,6 +551,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, arg); break; + case F_GETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_getdeleg(filp, &deleg); + if (!err && copy_to_user(argp, &deleg, sizeof(deleg))) + return -EFAULT; + break; + case F_SETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_setdeleg(fd, filp, &deleg); + break; default: break; } diff --git a/fs/fhandle.c b/fs/fhandle.c 
index 052f9c9368fb..3de1547ec9d4 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -404,32 +404,28 @@ out_path: return retval; } +static struct file *file_open_handle(struct path *path, int open_flag) +{ + const struct export_operations *eops; + + eops = path->mnt->mnt_sb->s_export_op; + if (eops->open) + return eops->open(path, open_flag); + + return file_open_root(path, "", open_flag, 0); +} + static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { - long retval = 0; + long retval; struct path path __free(path_put) = {}; - struct file *file; - const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; - CLASS(get_unused_fd, fd)(open_flag); - if (fd < 0) - return fd; - - eops = path.mnt->mnt_sb->s_export_op; - if (eops->open) - file = eops->open(&path, open_flag); - else - file = file_open_root(&path, "", open_flag, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - fd_install(fd, file); - return take_fd(fd); + return FD_ADD(open_flag, file_open_handle(&path, open_flag)); } /** diff --git a/fs/file.c b/fs/file.c index 28743b742e3c..0a4f3bdb2dec 100644 --- a/fs/file.c +++ b/fs/file.c @@ -641,6 +641,34 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); +/* + * Install a file pointer in the fd array while it is being resized. + * + * We need to make sure our update to the array does not get lost as the resizing + * thread can be copying the content as we modify it. + * + * We have two ways to do it: + * - go off CPU waiting for resize_in_progress to clear + * - take the spin lock + * + * The latter is trivial to implement and saves us from having to might_sleep() + * for debugging purposes. + * + * This is moved out of line from fd_install() to convince gcc to optimize that + * routine better. 
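The fhandle.c rework above factors the exportfs ->open() fallback into file_open_handle() and collapses fd reservation, file creation and installation into FD_ADD(). Syscall semantics are unchanged; for reference, a minimal userspace round-trip over the two handle syscalls (requires CAP_DAC_READ_SEARCH):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct file_handle *fh;
	int mount_id, mount_fd, fd;

	if (argc != 3)
		return 2;
	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	fh->handle_bytes = MAX_HANDLE_SZ;
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0)) {
		perror("name_to_handle_at");
		return 1;
	}
	mount_fd = open(argv[2], O_RDONLY | O_DIRECTORY);	/* any dir on the same mount */
	fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
	if (fd < 0) {
		perror("open_by_handle_at");
		return 1;
	}
	printf("reopened %s as fd %d\n", argv[1], fd);
	return 0;
}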
+ */ +static void noinline fd_install_slowpath(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in @@ -658,14 +686,9 @@ void fd_install(unsigned int fd, struct file *file) return; rcu_read_lock_sched(); - if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ @@ -1357,28 +1380,25 @@ out_unlock: */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { - int new_fd; int error; error = security_file_receive(file); if (error) return error; - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; + FD_PREPARE(fdf, o_flags, file); + if (fdf.err) + return fdf.err; + get_file(file); if (ufd) { - error = put_user(new_fd, ufd); - if (error) { - put_unused_fd(new_fd); + error = put_user(fd_prepare_fd(fdf), ufd); + if (error) return error; - } } - fd_install(new_fd, get_file(file)); - __receive_sock(file); - return new_fd; + __receive_sock(fd_prepare_file(fdf)); + return fd_publish(fdf); } EXPORT_SYMBOL_GPL(receive_fd); diff --git a/fs/file_attr.c b/fs/file_attr.c index 1dcec88c0680..4c4916632f11 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -316,7 +316,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp) err = put_user(fa.flags, argp); return err; } -EXPORT_SYMBOL(ioctl_getflags); int ioctl_setflags(struct file *file, unsigned int __user *argp) { @@ -337,7 +336,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { @@ -350,7 +348,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp) return err; } -EXPORT_SYMBOL(ioctl_fsgetxattr); int ioctl_fssetxattr(struct file *file, void __user *argp) { @@ -369,7 +366,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_fssetxattr); SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 20600e9ea202..21fc94b98209 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -258,7 +258,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) ip = iget_locked(sbp, ino); if (!ip) return ERR_PTR(-ENOMEM); - if (!(ip->i_state & I_NEW)) + if (!(inode_state_read_once(ip) & I_NEW)) return ip; vip = VXFS_INO(ip); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037fe..6800886c4d10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include <linux/sched/sysctl.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -32,11 +33,6 @@ #include "internal.h" /* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - -/* * Passed into wb_writeback(), essentially a subset of writeback_control */ 
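For the record, the MIN_WRITEBACK_PAGES constant deleted above encoded 4 MiB as a page count; the literal is in KiB and the shift converts KiB to pages:

	/*
	 * 4096 KiB expressed in pages:
	 *   PAGE_SHIFT = 12 (4 KiB pages):  4096 >> (12 - 10) = 1024 pages = 4 MiB
	 *   PAGE_SHIFT = 16 (64 KiB pages): 4096 >> (16 - 10) =   64 pages = 4 MiB
	 */

Chunk rounding now uses the per-superblock sb->s_min_writeback_pages instead (see writeback_chunk_size() further down), which lets each filesystem pick its own floor.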
struct wb_writeback_work { @@ -121,7 +117,7 @@ static bool inode_io_list_move_locked(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); list_move(&inode->i_io_list, head); @@ -200,6 +196,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " + "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -212,8 +221,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -304,9 +314,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); if (wb != &wb->bdi->wb) list_move(&inode->i_io_list, &wb->b_attached); else @@ -408,7 +418,7 @@ static bool inode_do_switch_wbs(struct inode *inode, * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) + if (unlikely(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -451,7 +461,7 @@ static bool inode_do_switch_wbs(struct inode *inode, if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; - if (inode->i_state & I_DIRTY_ALL) { + if (inode_state_read(inode) & I_DIRTY_ALL) { /* * We need to keep b_dirty list sorted by * dirtied_time_when. However properly sorting the @@ -476,10 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode, switched = true; skip_switch: /* - * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * Paired with an acquire fence in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. 
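wb_wait_for_completion_cb() reads done->wait_start and stamps done->progress_stamp, neither of which is shown being added to struct wb_completion in this chunk. Judging by the users here (the existing cnt and waitq fields are confirmed by the surrounding code), the struct presumably grows roughly like this:

	struct wb_completion {
		atomic_t		cnt;		/* outstanding works + 1 */
		wait_queue_head_t	*waitq;
		unsigned long		wait_start;	/* jiffies when the waiter parked */
		unsigned long		progress_stamp;	/* last time the wait condition ran */
	};

writeback_sb_inodes() (further down) wakes the waitq halfway to the hung-task timeout, apparently so the sleeper schedules, re-evaluates the condition, and emits the informational message rather than tripping the hung-task detector.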
*/ - smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + smp_wmb(); + inode_state_clear(inode, I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); @@ -600,12 +611,12 @@ static bool inode_prepare_wbs_switch(struct inode *inode, /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } - inode->i_state |= I_WB_SWITCH; + inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); @@ -635,7 +646,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ - if (inode->i_state & I_WB_SWITCH) + if (inode_state_read_once(inode) & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ @@ -807,9 +818,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. + * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) @@ -1236,9 +1247,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -1348,10 +1359,17 @@ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; + /* + * FIXME: ext4 can call here from ext4_evict_inode() after evict() already + * unlinked the inode. + */ + if (list_empty_careful(&inode->i_io_list)) + return; + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); @@ -1409,13 +1427,13 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); /* * When the inode is being freed just don't bother with dirty list * tracking. Flush worker will ignore this inode anyway and it will * trigger assertions in inode_io_list_move_locked(). */ - if (inode->i_state & I_FREEING) { + if (inode_state_read(inode) & I_FREEING) { list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); return; } @@ -1449,9 +1467,9 @@ static void inode_sync_complete(struct inode *inode) { assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC; + inode_state_clear(inode, I_SYNC); /* If inode is clean and unused, put it into LRU now... */ - inode_add_lru(inode); + inode_lru_list_add(inode); /* Called with inode->i_lock which ensures memory ordering.
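The i_state conversions running through this file (and the earlier per-filesystem hunks) rely on accessors that are not defined in this chunk. From the call sites, inode_state_read_once() is the lockless snapshot while the other three are only used under inode->i_lock, so a plausible shape is (the exact i_state type may differ):

	/* sketch only; the real definitions are outside this diff */
	static inline u32 inode_state_read_once(struct inode *inode)
	{
		return READ_ONCE(inode->i_state);	/* lockless, possibly stale */
	}

	static inline u32 inode_state_read(struct inode *inode)
	{
		lockdep_assert_held(&inode->i_lock);
		return inode->i_state;
	}

	static inline void inode_state_set(struct inode *inode, u32 flags)
	{
		lockdep_assert_held(&inode->i_lock);
		inode->i_state |= flags;
	}

	static inline void inode_state_clear(struct inode *inode, u32 flags)
	{
		lockdep_assert_held(&inode->i_lock);
		inode->i_state &= ~flags;
	}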
*/ inode_wake_up_bit(inode, __I_SYNC); } @@ -1493,7 +1511,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - inode->i_state |= I_SYNC_QUEUED; + inode_state_set(inode, I_SYNC_QUEUED); spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; @@ -1579,14 +1597,14 @@ void inode_wait_for_writeback(struct inode *inode) assert_spin_locked(&inode->i_lock); - if (!(inode->i_state & I_SYNC)) + if (!(inode_state_read(inode) & I_SYNC)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ - if (!(inode->i_state & I_SYNC)) + if (!(inode_state_read(inode) & I_SYNC)) break; spin_unlock(&inode->i_lock); schedule(); @@ -1612,7 +1630,7 @@ static void inode_sleep_on_writeback(struct inode *inode) wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ - sleep = !!(inode->i_state & I_SYNC); + sleep = !!(inode_state_read(inode) & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); @@ -1631,7 +1649,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc, unsigned long dirtied_before) { - if (inode->i_state & I_FREEING) + if (inode_state_read(inode) & I_FREEING) return; /* @@ -1639,7 +1657,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ - if ((inode->i_state & I_DIRTY) && + if ((inode_state_read(inode) & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; @@ -1650,7 +1668,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * is odd for clean inodes, it can happen for some * filesystems so handle that gracefully. */ - if (inode->i_state & I_DIRTY_ALL) + if (inode_state_read(inode) & I_DIRTY_ALL) redirty_tail_locked(inode, wb); else inode_cgwb_move_to_attached(inode, wb); @@ -1676,17 +1694,17 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, */ redirty_tail_locked(inode, wb); } - } else if (inode->i_state & I_DIRTY) { + } else if (inode_state_read(inode) & I_DIRTY) { /* * Filesystems can dirty the inode during writeback operations, * such as delayed allocation during submission or metadata * updates after data IO completion. */ redirty_tail_locked(inode, wb); - } else if (inode->i_state & I_DIRTY_TIME) { + } else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); } else { /* The inode is clean. Remove from writeback lists. */ inode_cgwb_move_to_attached(inode, wb); @@ -1712,7 +1730,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; - WARN_ON(!(inode->i_state & I_SYNC)); + WARN_ON(!(inode_state_read_once(inode) & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); @@ -1736,7 +1754,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * mark_inode_dirty_sync() to notify the filesystem about it and to * change I_DIRTY_TIME into I_DIRTY_SYNC. 
*/ - if ((inode->i_state & I_DIRTY_TIME) && + if ((inode_state_read_once(inode) & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { @@ -1751,8 +1769,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); - dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~dirty; + dirty = inode_state_read(inode) & I_DIRTY; + inode_state_clear(inode, dirty); /* * Paired with smp_mb() in __mark_inode_dirty(). This allows @@ -1768,10 +1786,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { - if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_NETFS_WB; + inode_state_set(inode, I_DIRTY_PAGES); + else if (unlikely(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + if (!(inode_state_read(inode) & I_DIRTY_PAGES)) { + inode_state_clear(inode, I_PINNING_NETFS_WB); wbc->unpinned_netfs_wb = true; dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } @@ -1807,11 +1825,11 @@ static int writeback_single_inode(struct inode *inode, spin_lock(&inode->i_lock); if (!icount_read(inode)) - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + WARN_ON(!(inode_state_read(inode) & (I_WILL_FREE | I_FREEING))); else - WARN_ON(inode->i_state & I_WILL_FREE); + WARN_ON(inode_state_read(inode) & I_WILL_FREE); - if (inode->i_state & I_SYNC) { + if (inode_state_read(inode) & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. For WB_SYNC_ALL, we @@ -1822,7 +1840,7 @@ static int writeback_single_inode(struct inode *inode, goto out; inode_wait_for_writeback(inode); } - WARN_ON(inode->i_state & I_SYNC); + WARN_ON(inode_state_read(inode) & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * @@ -1830,11 +1848,11 @@ static int writeback_single_inode(struct inode *inode, * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. */ - if (!(inode->i_state & I_DIRTY_ALL) && + if (!(inode_state_read(inode) & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; - inode->i_state |= I_SYNC; + inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); @@ -1847,18 +1865,18 @@ static int writeback_single_inode(struct inode *inode, * If the inode is freeing, its i_io_list shouldn't be updated * as it can be finally deleted at this moment. */ - if (!(inode->i_state & I_FREEING)) { + if (!(inode_state_read(inode) & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists.
*/ - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read(inode) & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED)) { - if ((inode->i_state & I_DIRTY)) + else if (!(inode_state_read(inode) & I_SYNC_QUEUED)) { + if ((inode_state_read(inode) & I_DIRTY)) redirty_tail_locked(inode, wb); - else if (inode->i_state & I_DIRTY_TIME) { + else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, @@ -1874,8 +1892,8 @@ out: return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1893,16 +1911,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) - pages = LONG_MAX; - else { - pages = min(wb->avg_write_bandwidth / 2, - global_wb_domain.dirty_limit / DIRTY_SCOPE); - pages = min(pages, work->nr_pages); - pages = round_down(pages + MIN_WRITEBACK_PAGES, - MIN_WRITEBACK_PAGES); - } + return LONG_MAX; - return pages; + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -1967,12 +1982,12 @@ static long writeback_sb_inodes(struct super_block *sb, * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } - if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + if ((inode_state_read(inode) & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to @@ -1994,17 +2009,17 @@ static long writeback_sb_inodes(struct super_block *sb, * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ - if (inode->i_state & I_SYNC) { + if (inode_state_read(inode) & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } - inode->i_state |= I_SYNC; + inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -2014,6 +2029,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress. 
*/ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; @@ -2039,7 +2060,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read(inode) & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); @@ -2545,10 +2566,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) * We tell ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - if (inode->i_state & I_DIRTY_TIME) { - inode->i_state &= ~I_DIRTY_TIME; + if (inode_state_read(inode) & I_DIRTY_TIME) { + inode_state_clear(inode, I_DIRTY_TIME); flags |= I_DIRTY_TIME; } spin_unlock(&inode->i_lock); @@ -2585,16 +2606,16 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if ((inode_state_read_once(inode) & flags) == flags) return; spin_lock(&inode->i_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + if ((inode_state_read(inode) & flags) != flags) { + const int was_dirty = inode_state_read(inode) & I_DIRTY; inode_attach_wb(inode, NULL); - inode->i_state |= flags; + inode_state_set(inode, flags); /* * Grab inode's wb early because it requires dropping i_lock and we @@ -2613,7 +2634,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * the inode it will place it on the appropriate superblock * list, based upon its state. 
*/ - if (inode->i_state & I_SYNC_QUEUED) + if (inode_state_read(inode) & I_SYNC_QUEUED) goto out_unlock; /* @@ -2624,7 +2645,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode_unhashed(inode)) goto out_unlock; } - if (inode->i_state & I_FREEING) + if (inode_state_read(inode) & I_FREEING) goto out_unlock; /* @@ -2639,7 +2660,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (dirtytime) inode->dirtied_time_when = jiffies; - if (inode->i_state & I_DIRTY) + if (inode_state_read(inode) & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; @@ -2736,7 +2757,7 @@ static void wait_sb_inodes(struct super_block *sb) spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); diff --git a/fs/fs_types.c b/fs/fs_dirent.c index 78365e5dc08c..e5e08f213816 100644 --- a/fs/fs_types.c +++ b/fs/fs_dirent.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/fs.h> +#include <linux/fs_dirent.h> #include <linux/export.h> /* diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 28be762ac1c6..b8c46c5a38a0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -146,12 +146,6 @@ int unshare_fs_struct(void) } EXPORT_SYMBOL_GPL(unshare_fs_struct); -int current_umask(void) -{ - return current->fs->umask; -} -EXPORT_SYMBOL(current_umask); - /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f6b12aebb8bb..f8c93dc45768 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1209,14 +1209,15 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, * User buffers are not mapped yet - the application does not have permission * to write to it - this has to be executed in ring task context. 
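current_umask() is removed from fs/fs_struct.c above with no replacement in this file; together with the new #include <linux/fs_struct.h> lines added to 9p, fat and f2fs earlier in this series, it has presumably become a static inline in that header, with the body unchanged:

	/* assumed new home: include/linux/fs_struct.h */
	static inline int current_umask(void)
	{
		return current->fs->umask;
	}

That would also explain why the EXPORT_SYMBOL could be dropped.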
*/ -static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) { + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); struct fuse_ring_queue *queue = ent->queue; int err; - if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + if (!tw.cancel) { err = fuse_uring_prepare_send(ent, ent->fuse_req); if (err) { fuse_uring_next_fuse_req(ent, queue, issue_flags); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ecaec0fea3a1..87a63ae93a45 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = fc->blkbits; + blkbits = inode->i_sb->s_blocksize_bits; stat->blksize = 1 << blkbits; } @@ -1397,27 +1397,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, if (!parent) return -ENOENT; - inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) - goto unlock; + goto put_parent; err = -ENOENT; dir = d_find_alias(parent); if (!dir) - goto unlock; + goto put_parent; - name->hash = full_name_hash(dir, name->name, name->len); - entry = d_lookup(dir, name); + entry = start_removing_noperm(dir, name); dput(dir); - if (!entry) - goto unlock; + if (IS_ERR(entry)) + goto put_parent; fuse_dir_changed(parent); if (!(flags & FUSE_EXPIRE_ONLY)) d_invalidate(entry); fuse_invalidate_entry_cache(entry); - if (child_nodeid != 0 && d_really_is_positive(entry)) { + if (child_nodeid != 0) { inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; @@ -1445,10 +1443,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, } else { err = 0; } - dput(entry); - unlock: - inode_unlock(parent); + end_removing(entry); + put_parent: iput(parent); return err; } @@ -2230,6 +2227,7 @@ static const struct file_operations fuse_dir_operations = { .fsync = fuse_dir_fsync, .unlocked_ioctl = fuse_dir_ioctl, .compat_ioctl = fuse_dir_compat_ioctl, + .setlease = simple_nosetlease, }; static const struct inode_operations fuse_common_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f1ef77a0be05..7bcb650a9f26 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -834,23 +834,142 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, return 0; } +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + +struct fuse_fill_read_data { + struct file *file; + + /* Fields below are used if sending the read request asynchronously */ + struct fuse_conn *fc; + struct fuse_io_args *ia; + unsigned int nr_bytes; +}; + +/* forward declarations */ +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write); +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async); + +static int fuse_handle_readahead(struct folio *folio, + struct readahead_control *rac, + struct fuse_fill_read_data *data, loff_t pos, + size_t len) +{ + struct fuse_io_args 
*ia = data->ia; + size_t off = offset_in_folio(folio, pos); + struct fuse_conn *fc = data->fc; + struct fuse_args_pages *ap; + unsigned int nr_pages; + + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, + false)) { + fuse_send_readpages(ia, data->file, data->nr_bytes, + fc->async_read); + data->nr_bytes = 0; + data->ia = NULL; + ia = NULL; + } + if (!ia) { + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + return -EAGAIN; + + nr_pages = min(fc->max_pages, readahead_count(rac)); + data->ia = fuse_io_alloc(NULL, nr_pages); + if (!data->ia) + return -ENOMEM; + ia = data->ia; + } + folio_get(folio); + ap = &ia->ap; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = off; + ap->descs[ap->num_folios].length = len; + data->nr_bytes += len; + ap->num_folios++; + + return 0; +} + +static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, + size_t len) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + struct folio *folio = ctx->cur_folio; + loff_t pos = iter->pos; + size_t off = offset_in_folio(folio, pos); + struct file *file = data->file; + int ret; + + if (ctx->rac) { + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); + } else { + /* + * for non-readahead read requests, do reads synchronously + * since it's not guaranteed that the server can handle + * out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); + } + return ret; +} + +static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + + if (data->ia) + fuse_send_readpages(data->ia, data->file, data->nr_bytes, + data->fc->async_read); +} + +static const struct iomap_read_ops fuse_iomap_read_ops = { + .read_folio_range = fuse_iomap_read_folio_range_async, + .submit_read = fuse_iomap_read_submit, +}; + static int fuse_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; - int err; + struct fuse_fill_read_data data = { + .file = file, + }; + struct iomap_read_folio_ctx ctx = { + .cur_folio = folio, + .ops = &fuse_iomap_read_ops, + .read_ctx = &data, - err = -EIO; - if (fuse_is_bad(inode)) - goto out; + }; - err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); - if (!err) - folio_mark_uptodate(folio); + if (fuse_is_bad(inode)) { + folio_unlock(folio); + return -EIO; + } + iomap_read_folio(&fuse_iomap_ops, &ctx); fuse_invalidate_atime(inode); - out: - folio_unlock(folio); - return err; + return 0; } static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, @@ -887,7 +1006,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { - folio_end_read(ap->folios[i], !err); + iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, + ap->descs[i].length, err); folio_put(ap->folios[i]); } if (ia->ff) @@ -897,7 +1017,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, - unsigned int count) + unsigned int count, bool async) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; @@ -919,7 +1039,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, 
fuse_read_args_fill(ia, file, pos, count, FUSE_READ); ia->read.attr_ver = fuse_get_attr_version(fm->fc); - if (fm->fc->async_read) { + if (async) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); @@ -936,81 +1056,20 @@ static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int max_pages, nr_pages; - struct folio *folio = NULL; + struct fuse_fill_read_data data = { + .file = rac->file, + .fc = fc, + }; + struct iomap_read_folio_ctx ctx = { + .ops = &fuse_iomap_read_ops, + .rac = rac, + .read_ctx = &data + }; if (fuse_is_bad(inode)) return; - max_pages = min_t(unsigned int, fc->max_pages, - fc->max_read / PAGE_SIZE); - - /* - * This is only accurate the first time through, since readahead_folio() - * doesn't update readahead_count() from the previous folio until the - * next call. Grab nr_pages here so we know how many pages we're going - * to have to process. This means that we will exit here with - * readahead_count() == folio_nr_pages(last_folio), but we will have - * consumed all of the folios, and read_pages() will call - * readahead_folio() again which will clean up the rac. - */ - nr_pages = readahead_count(rac); - - while (nr_pages) { - struct fuse_io_args *ia; - struct fuse_args_pages *ap; - unsigned cur_pages = min(max_pages, nr_pages); - unsigned int pages = 0; - - if (fc->num_background >= fc->congestion_threshold && - rac->ra->async_size >= readahead_count(rac)) - /* - * Congested and only async pages left, so skip the - * rest. - */ - break; - - ia = fuse_io_alloc(NULL, cur_pages); - if (!ia) - break; - ap = &ia->ap; - - while (pages < cur_pages) { - unsigned int folio_pages; - - /* - * This returns a folio with a ref held on it. - * The ref needs to be held until the request is - * completed, since the splice case (see - * fuse_try_move_page()) drops the ref after it's - * replaced in the page cache. - */ - if (!folio) - folio = __readahead_folio(rac); - - folio_pages = folio_nr_pages(folio); - if (folio_pages > cur_pages - pages) { - /* - * Large folios belonging to fuse will never - * have more pages than max_pages. - */ - WARN_ON(!pages); - break; - } - - ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].length = folio_size(folio); - ap->num_folios++; - pages += folio_pages; - folio = NULL; - } - fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); - nr_pages -= pages; - } - if (folio) { - folio_end_read(folio, false); - folio_put(folio); - } + iomap_readahead(&fuse_iomap_ops, &ctx); } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1397,20 +1456,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = { .read_folio_range = fuse_iomap_read_folio_range, }; -static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - iomap->type = IOMAP_MAPPED; - iomap->length = length; - iomap->offset = offset; - return 0; -} - -static const struct iomap_ops fuse_iomap_ops = { - .iomap_begin = fuse_iomap_begin, -}; - static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1834,7 +1879,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. 
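fuse now drives both read_folio and readahead through iomap_read_folio_ctx. The contract implied by fuse_iomap_read_folio_range_async() is that ->read_folio_range fills [pos, pos + len) of ctx->cur_folio and signals completion through iomap_finish_folio_read(). A stripped-down synchronous implementation of that contract, with my_fill_folio() standing in for the actual I/O (hypothetical helper, not part of this patch):

	static int demo_read_folio_range(const struct iomap_iter *iter,
					 struct iomap_read_folio_ctx *ctx,
					 size_t len)
	{
		struct folio *folio = ctx->cur_folio;
		size_t off = offset_in_folio(folio, iter->pos);
		int err;

		err = my_fill_folio(folio, off, len);	/* hypothetical I/O helper */
		if (!err)
			iomap_finish_folio_read(folio, off, len, 0);
		return err;
	}

The asynchronous readahead side instead batches folios into a fuse_io_args and completes each range from fuse_readpages_end(), as shown above.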
*/ - iomap_finish_folio_write(inode, ap->folios[i], 1); + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -2047,7 +2093,7 @@ struct fuse_fill_wb_data { struct fuse_file *ff; unsigned int max_folios; /* - * nr_bytes won't overflow since fuse_writepage_need_send() caps + * nr_bytes won't overflow since fuse_folios_need_send() caps * wb requests to never exceed fc->max_pages (which has an upper bound * of U16_MAX). */ @@ -2092,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode, spin_unlock(&fi->lock); } -static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, - unsigned len, struct fuse_args_pages *ap, - struct fuse_fill_wb_data *data) +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write) { struct folio *prev_folio; struct fuse_folio_desc prev_desc; - unsigned bytes = data->nr_bytes + len; + unsigned bytes = cur_bytes + len; loff_t prev_pos; + size_t max_bytes = write ? fc->max_write : fc->max_read; WARN_ON(!ap->num_folios); @@ -2107,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) return true; - /* Reached max write bytes */ - if (bytes > fc->max_write) + if (bytes > max_bytes) return true; /* Discontinuity */ @@ -2118,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, if (prev_pos != pos) return true; - /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_folios == data->max_folios && - !fuse_pages_realloc(data, fc->max_pages)) - return true; - return false; } @@ -2146,10 +2187,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, return -EIO; } - if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { - fuse_writepages_send(inode, data); - data->wpa = NULL; - data->nr_bytes = 0; + if (wpa) { + bool send = fuse_folios_need_send(fc, pos, len, ap, + data->nr_bytes, true); + + if (!send) { + /* + * Need to grow the pages array? If so, did the + * expansion fail? + */ + send = (ap->num_folios == data->max_folios) && + !fuse_pages_realloc(data, fc->max_pages); + } + + if (send) { + fuse_writepages_send(inode, data); + data->wpa = NULL; + data->nr_bytes = 0; + } } if (data->wpa == NULL) { @@ -2161,7 +2216,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, ap = &wpa->ia.ap; } - iomap_start_folio_write(inode, folio, 1); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c2f2a48156d6..f616c1991fed 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -981,14 +981,6 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; - - /* - * This is a workaround until fuse uses iomap for reads. - * For fuseblk servers, this represents the blocksize passed in at - * mount time and for regular fuse servers, this is equivalent to - * inode->i_blkbits. 
- */ - u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d1babf56f254..1a397be53f49 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -160,7 +160,7 @@ static void fuse_evict_inode(struct inode *inode) struct fuse_inode *fi = get_fuse_inode(inode); /* Will write inode on close/munmap and in all other dirtiers */ - WARN_ON(inode->i_state & I_DIRTY_INODE); + WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE); if (FUSE_IS_DAX(inode)) dax_break_layout_final(inode); @@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (attr->blksize) fi->cached_i_blkbits = ilog2(attr->blksize); else - fi->cached_i_blkbits = fc->blkbits; + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; /* * Don't set the sticky bit in i_mode, unless we want the VFS @@ -505,7 +505,7 @@ retry: if (!inode) return NULL; - if ((inode->i_state & I_NEW)) { + if ((inode_state_read_once(inode) & I_NEW)) { inode->i_flags |= S_NOATIME; if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; @@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; - /* - * This is a workaround until fuse hooks into iomap for reads. - * Use PAGE_SIZE for the blocksize else if the writeback cache - * is enabled, buffered writes go through iomap and a read may - * overwrite partially written data if blocksize < PAGE_SIZE - */ - fc->blkbits = sb->s_blocksize_bits; - if (ctx->blksize != PAGE_SIZE && - !sb_set_blocksize(sb, PAGE_SIZE)) - goto err; #endif fc->sync_fs = 1; } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6bc7c97b017d..b2f6486fe1d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) sprintf(buff, "%d", i); fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); - if (!fs->mqs_kobj) { + if (!fsvq->kobj) { ret = -ENOMEM; goto out_del; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..ff1cf335449a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -81,8 +81,7 @@ static int gfs2_write_jdata_folio(struct folio *folio, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
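With fc->blkbits gone, both fuse_fillattr() (earlier in this chunk) and fuse_change_attributes_common() fall back to the superblock's block size whenever the server reports attr->blksize == 0. A quick worked example of the fallback:

	/*
	 * server reports blksize = 0 on a 4 KiB superblock:
	 *   blkbits = sb->s_blocksize_bits = 12, stat->blksize = 1 << 12 = 4096
	 * server reports blksize = 1024:
	 *   blkbits = ilog2(1024) = 10, stat->blksize = 1 << 10 = 1024
	 */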
 	 */
-	if (folio_pos(folio) < i_size &&
-	    i_size < folio_pos(folio) + folio_size(folio))
+	if (folio_pos(folio) < i_size && i_size < folio_next_pos(folio))
 		folio_zero_segment(folio, offset_in_folio(folio, i_size),
 				   folio_size(folio));
@@ -311,10 +310,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 		range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag = PAGECACHE_TAG_TOWRITE;
-	else
-		tag = PAGECACHE_TAG_DIRTY;
+	tag = wbc_to_tag(wbc);
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -424,11 +420,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
 	struct inode *inode = folio->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
+	int error = 0;
 
 	if (!gfs2_is_jdata(ip) ||
 	    (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
-		error = iomap_read_folio(folio, &gfs2_iomap_ops);
+		iomap_bio_read_folio(folio, &gfs2_iomap_ops);
 	} else if (gfs2_is_stuffed(ip)) {
 		error = stuffed_read_folio(ip, folio);
 	} else {
@@ -503,7 +499,7 @@ static void gfs2_readahead(struct readahead_control *rac)
 	else if (gfs2_is_jdata(ip))
 		mpage_readahead(rac, gfs2_block_map);
 	else
-		iomap_readahead(rac, &gfs2_iomap_ops);
+		iomap_bio_readahead(rac, &gfs2_iomap_ops);
 }
 
 /**
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index bc67fa058c84..ee92f5910ae1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -744,7 +744,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	int sync_state = inode->i_state & I_DIRTY;
+	int sync_state = inode_state_read_once(inode) & I_DIRTY;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret = 0, ret1 = 0;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b677c0e6b9ab..c9712235e7a0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
 		ip = NULL;
 	spin_unlock(&gl->gl_lockref.lock);
 	if (ip) {
-		wait_on_inode(&ip->i_inode);
+		wait_on_new_inode(&ip->i_inode);
 		if (is_bad_inode(&ip->i_inode)) {
 			iput(&ip->i_inode);
 			ip = NULL;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0c0a80b3baca..c94e42b0c94d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -394,7 +394,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	u16 height, depth;
 	umode_t mode = be32_to_cpu(str->di_mode);
 	struct inode *inode = &ip->i_inode;
-	bool is_new = inode->i_state & I_NEW;
+	bool is_new = inode_state_read_once(inode) & I_NEW;
 
 	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) {
 		gfs2_consist_inode(ip);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8a7ed80d9f2d..890c87e3e365 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -127,7 +127,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 	ip = GFS2_I(inode);
 
-	if (inode->i_state & I_NEW) {
+	if (inode_state_read_once(inode) & I_NEW) {
 		struct gfs2_sbd *sdp = GFS2_SB(inode);
 		struct gfs2_glock *io_gl;
 		int extra_flags = 0;
@@ -924,7 +924,7 @@ fail_gunlock:
 	gfs2_dir_no_add(&da);
 	gfs2_glock_dq_uninit(&d_gh);
 	if (!IS_ERR_OR_NULL(inode)) {
-		if (inode->i_state & I_NEW)
+		if (inode_state_read_once(inode) & I_NEW)
 			iget_failed(inode);
 		else
 			iput(inode);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aa15183f9a16..889682f051ea 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1751,7 +1751,7 @@ static void gfs2_evict_inodes(struct super_block *sb)
 	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
-		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
+		if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) &&
 		    !need_resched()) {
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
diff --git a/fs/hfs/.kunitconfig b/fs/hfs/.kunitconfig
new file mode 100644
index 000000000000..5caa9af1e3bb
--- /dev/null
+++ b/fs/hfs/.kunitconfig
@@ -0,0 +1,7 @@
+CONFIG_KUNIT=y
+CONFIG_HFS_FS=y
+CONFIG_HFS_KUNIT_TEST=y
+CONFIG_BLOCK=y
+CONFIG_BUFFER_HEAD=y
+CONFIG_NLS=y
+CONFIG_LEGACY_DIRECT_IO=y
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index 5ea5cd8ecea9..7f3cbe43b4b7 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -13,3 +13,18 @@ config HFS_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called hfs.
+
+config HFS_KUNIT_TEST
+	tristate "KUnit tests for HFS filesystem" if !KUNIT_ALL_TESTS
+	depends on HFS_FS && KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  This builds KUnit tests for the HFS filesystem.
+
+	  KUnit tests run during boot and output the results to the debug
+	  log in TAP format (https://testanything.org/). Only useful for
+	  kernel devs running KUnit test harness and are not for inclusion
+	  into a production build.
+
+	  For more information on KUnit and unit tests in general please
+	  refer to the KUnit documentation in Documentation/dev-tools/kunit/.
diff --git a/fs/hfs/Makefile b/fs/hfs/Makefile
index b65459bf3dc4..a7c9ce6b4609 100644
--- a/fs/hfs/Makefile
+++ b/fs/hfs/Makefile
@@ -9,3 +9,5 @@ hfs-objs := bitmap.o bfind.o bnode.o brec.o btree.o \
 		catalog.o dir.o extent.o inode.o attr.o mdb.o \
 		part_tbl.o string.o super.o sysdep.o trans.o
 
+# KUnit tests
+obj-$(CONFIG_HFS_KUNIT_TEST) += string_test.o
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index c2f840c49e60..d56e47bdc517 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -167,7 +167,7 @@ release:
 	return res;
 }
 
-int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)
+int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len)
 {
 	int res;
 
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index fcfffe75d84e..13d58c51fc46 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -16,14 +16,14 @@
 #include "btree.h"
 
 static inline
-bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+bool is_bnode_offset_valid(struct hfs_bnode *node, u32 off)
 {
 	bool is_valid = off < node->tree->node_size;
 
 	if (!is_valid) {
 		pr_err("requested invalid offset: "
 		       "NODE: id %u, type %#x, height %u, "
-		       "node_size %u, offset %d\n",
+		       "node_size %u, offset %u\n",
 		       node->this, node->type, node->height,
 		       node->tree->node_size, off);
 	}
@@ -32,7 +32,7 @@ bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
 }
 
 static inline
-int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+u32 check_and_correct_requested_length(struct hfs_bnode *node, u32 off, u32 len)
 {
 	unsigned int node_size;
 
@@ -42,12 +42,12 @@ int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
 	node_size = node->tree->node_size;
 
 	if ((off + len) > node_size) {
-		int new_len = (int)node_size - off;
+		u32 new_len = node_size - off;
 
 		pr_err("requested length has been corrected: "
 		       "NODE: id %u, type %#x, height %u, "
-		       "node_size %u, offset %d, "
-		       "requested_len %d, corrected_len %d\n",
+		       "node_size %u, offset %u, "
+		       "requested_len %u, corrected_len %u\n",
 		       node->this, node->type, node->height,
 		       node->tree->node_size, off, len, new_len);
@@ -57,12 +57,12 @@ int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
 	return len;
 }
 
-void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len)
 {
 	struct page *page;
-	int pagenum;
-	int bytes_read;
-	int bytes_to_read;
+	u32 pagenum;
+	u32 bytes_read;
+	u32 bytes_to_read;
 
 	if (!is_bnode_offset_valid(node, off))
 		return;
@@ -70,7 +70,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 	if (len == 0) {
 		pr_err("requested zero length: "
 		       "NODE: id %u, type %#x, height %u, "
-		       "node_size %u, offset %d, len %d\n",
+		       "node_size %u, offset %u, len %u\n",
 		       node->this, node->type, node->height,
 		       node->tree->node_size, off, len);
 		return;
@@ -86,7 +86,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 		if (pagenum >= node->tree->pages_per_bnode)
 			break;
 		page = node->page[pagenum];
-		bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
+		bytes_to_read = min_t(u32, len - bytes_read, PAGE_SIZE - off);
 
 		memcpy_from_page(buf + bytes_read, page, off, bytes_to_read);
@@ -95,7 +95,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 	}
 }
 
-u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
+u16 hfs_bnode_read_u16(struct hfs_bnode *node, u32 off)
 {
 	__be16 data;
 	// optimize later...
@@ -103,7 +103,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 	return be16_to_cpu(data);
 }
 
-u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
+u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off)
 {
 	u8 data;
 	// optimize later...
@@ -111,10 +111,10 @@ u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
 	return data;
 }
 
-void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
+void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off)
 {
 	struct hfs_btree *tree;
-	int key_len;
+	u32 key_len;
 
 	tree = node->tree;
 	if (node->type == HFS_NODE_LEAF ||
@@ -125,14 +125,14 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
 
 	if (key_len > sizeof(hfs_btree_key) || key_len < 1) {
 		memset(key, 0, sizeof(hfs_btree_key));
-		pr_err("hfs: Invalid key length: %d\n", key_len);
+		pr_err("hfs: Invalid key length: %u\n", key_len);
 		return;
 	}
 
 	hfs_bnode_read(node, key, off, key_len);
 }
 
-void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
+void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len)
 {
 	struct page *page;
@@ -142,7 +142,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 	if (len == 0) {
 		pr_err("requested zero length: "
 		       "NODE: id %u, type %#x, height %u, "
-		       "node_size %u, offset %d, len %d\n",
+		       "node_size %u, offset %u, len %u\n",
 		       node->this, node->type, node->height,
 		       node->tree->node_size, off, len);
 		return;
@@ -157,20 +157,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 	set_page_dirty(page);
 }
 
-void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
+void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data)
 {
 	__be16 v = cpu_to_be16(data);
 	// optimize later...
 	hfs_bnode_write(node, &v, off, 2);
 }
 
-void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data)
+void hfs_bnode_write_u8(struct hfs_bnode *node, u32 off, u8 data)
 {
 	// optimize later...
hfs_bnode_write(node, &data, off, 1); } -void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len) { struct page *page; @@ -180,7 +180,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, len %d\n", + "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; @@ -195,8 +195,8 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) set_page_dirty(page); } -void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, - struct hfs_bnode *src_node, int src, int len) +void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, + struct hfs_bnode *src_node, u32 src, u32 len) { struct page *src_page, *dst_page; @@ -216,7 +216,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, set_page_dirty(dst_page); } -void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len) { struct page *page; void *ptr; diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index e49a141c87e5..5a2f740ddefd 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -62,7 +62,7 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) return retval; } -int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, u32 entry_len) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 22e62fe7448b..7bc425283d49 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -42,7 +42,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->inode = iget_locked(sb, id); if (!tree->inode) goto free_tree; - BUG_ON(!(tree->inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(tree->inode) & I_NEW)); { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; @@ -259,7 +259,7 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) } /* Make sure @tree has enough space for the @rsvd_nodes */ -int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) +int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes) { struct inode *inode = tree->inode; u32 count; diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index 0e6baee93245..99be858b2446 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -86,87 +86,46 @@ struct hfs_find_data { /* btree.c */ -extern struct hfs_btree *hfs_btree_open(struct super_block *, u32, btree_keycmp); -extern void hfs_btree_close(struct hfs_btree *); -extern void hfs_btree_write(struct hfs_btree *); -extern int hfs_bmap_reserve(struct hfs_btree *, int); -extern struct hfs_bnode * hfs_bmap_alloc(struct hfs_btree *); +extern struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, + btree_keycmp keycmp); +extern void hfs_btree_close(struct hfs_btree *tree); +extern void hfs_btree_write(struct hfs_btree *tree); +extern int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes); +extern struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); extern void hfs_bmap_free(struct hfs_bnode *node); /* bnode.c */ -extern void hfs_bnode_read(struct hfs_bnode *, void *, int, int); -extern u16 hfs_bnode_read_u16(struct hfs_bnode *, int); -extern u8 hfs_bnode_read_u8(struct hfs_bnode *, int); -extern void hfs_bnode_read_key(struct hfs_bnode *, void *, int); -extern void hfs_bnode_write(struct hfs_bnode *, 
void *, int, int); -extern void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); -extern void hfs_bnode_write_u8(struct hfs_bnode *, int, u8); -extern void hfs_bnode_clear(struct hfs_bnode *, int, int); -extern void hfs_bnode_copy(struct hfs_bnode *, int, - struct hfs_bnode *, int, int); -extern void hfs_bnode_move(struct hfs_bnode *, int, int, int); -extern void hfs_bnode_dump(struct hfs_bnode *); -extern void hfs_bnode_unlink(struct hfs_bnode *); -extern struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); -extern struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); -extern void hfs_bnode_unhash(struct hfs_bnode *); -extern void hfs_bnode_free(struct hfs_bnode *); -extern struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32); -extern void hfs_bnode_get(struct hfs_bnode *); -extern void hfs_bnode_put(struct hfs_bnode *); +extern void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len); +extern u16 hfs_bnode_read_u16(struct hfs_bnode *node, u32 off); +extern u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off); +extern void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off); +extern void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len); +extern void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data); +extern void hfs_bnode_write_u8(struct hfs_bnode *node, u32 off, u8 data); +extern void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len); +extern void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, + struct hfs_bnode *src_node, u32 src, u32 len); +extern void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len); +extern void hfs_bnode_dump(struct hfs_bnode *node); +extern void hfs_bnode_unlink(struct hfs_bnode *node); +extern struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid); +extern struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num); +extern void hfs_bnode_unhash(struct hfs_bnode *node); +extern void hfs_bnode_free(struct hfs_bnode *node); +extern struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num); +extern void hfs_bnode_get(struct hfs_bnode *node); +extern void hfs_bnode_put(struct hfs_bnode *node); /* brec.c */ -extern u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); -extern u16 hfs_brec_keylen(struct hfs_bnode *, u16); -extern int hfs_brec_insert(struct hfs_find_data *, void *, int); -extern int hfs_brec_remove(struct hfs_find_data *); +extern u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off); +extern u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec); +extern int hfs_brec_insert(struct hfs_find_data *fd, void *entry, u32 entry_len); +extern int hfs_brec_remove(struct hfs_find_data *fd); /* bfind.c */ -extern int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); -extern void hfs_find_exit(struct hfs_find_data *); -extern int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *); -extern int hfs_brec_find(struct hfs_find_data *); -extern int hfs_brec_read(struct hfs_find_data *, void *, int); -extern int hfs_brec_goto(struct hfs_find_data *, int); - - -struct hfs_bnode_desc { - __be32 next; /* (V) Number of the next node at this level */ - __be32 prev; /* (V) Number of the prev node at this level */ - u8 type; /* (F) The type of node */ - u8 height; /* (F) The level of this node (leaves=1) */ - __be16 num_recs; /* (V) The number of records in this node */ - u16 reserved; -} __packed; - -#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ -#define HFS_NODE_HEADER 0x01 /* The tree header 
node (node 0) */ -#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ -#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ - -struct hfs_btree_header_rec { - __be16 depth; /* (V) The number of levels in this B-tree */ - __be32 root; /* (V) The node number of the root node */ - __be32 leaf_count; /* (V) The number of leaf records */ - __be32 leaf_head; /* (V) The number of the first leaf node */ - __be32 leaf_tail; /* (V) The number of the last leaf node */ - __be16 node_size; /* (F) The number of bytes in a node (=512) */ - __be16 max_key_len; /* (F) The length of a key in an index node */ - __be32 node_count; /* (V) The total number of nodes */ - __be32 free_nodes; /* (V) The number of unused nodes */ - u16 reserved1; - __be32 clump_size; /* (F) clump size. not usually used. */ - u8 btree_type; /* (F) BTree type */ - u8 reserved2; - __be32 attributes; /* (F) attributes */ - u32 reserved3[16]; -} __packed; - -#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not - used by hfsplus. */ -#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. - used by hfsplus. */ -#define HFS_TREE_VARIDXKEYS 0x00000004 /* variable key length instead of - max key length. use din catalog - b-tree but not in extents - b-tree (hfsplus). */ +extern int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd); +extern void hfs_find_exit(struct hfs_find_data *fd); +extern int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd); +extern int hfs_brec_find(struct hfs_find_data *fd); +extern int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len); +extern int hfs_brec_goto(struct hfs_find_data *fd, int cnt); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index caebabb6642f..b80ba40e3877 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -322,9 +322,9 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) } } + node_id = node->prev; hfs_bnode_put(node); - node_id = node->prev; } while (node_id >= leaf_head); return -ENOENT; diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h index 6f194d0768b6..3f2293ff6fdd 100644 --- a/fs/hfs/hfs.h +++ b/fs/hfs/hfs.h @@ -9,274 +9,7 @@ #ifndef _HFS_H #define _HFS_H -/* offsets to various blocks */ -#define HFS_DD_BLK 0 /* Driver Descriptor block */ -#define HFS_PMAP_BLK 1 /* First block of partition map */ -#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ - -/* magic numbers for various disk blocks */ -#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ -#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ -#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ -#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ -#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ - -/* various FIXED size parameters */ -#define HFS_SECTOR_SIZE 512 /* size of an HFS sector */ -#define HFS_SECTOR_SIZE_BITS 9 /* log_2(HFS_SECTOR_SIZE) */ -#define HFS_NAMELEN 31 /* maximum length of an HFS filename */ -#define HFS_MAX_NAMELEN 128 -#define HFS_MAX_VALENCE 32767U - -/* Meanings of the drAtrb field of the MDB, - * Reference: _Inside Macintosh: Files_ p. 
2-61 - */ -#define HFS_SB_ATTRIB_HLOCK (1 << 7) -#define HFS_SB_ATTRIB_UNMNT (1 << 8) -#define HFS_SB_ATTRIB_SPARED (1 << 9) -#define HFS_SB_ATTRIB_INCNSTNT (1 << 11) -#define HFS_SB_ATTRIB_SLOCK (1 << 15) - -/* Some special File ID numbers */ -#define HFS_POR_CNID 1 /* Parent Of the Root */ -#define HFS_ROOT_CNID 2 /* ROOT directory */ -#define HFS_EXT_CNID 3 /* EXTents B-tree */ -#define HFS_CAT_CNID 4 /* CATalog B-tree */ -#define HFS_BAD_CNID 5 /* BAD blocks file */ -#define HFS_ALLOC_CNID 6 /* ALLOCation file (HFS+) */ -#define HFS_START_CNID 7 /* STARTup file (HFS+) */ -#define HFS_ATTR_CNID 8 /* ATTRibutes file (HFS+) */ -#define HFS_EXCH_CNID 15 /* ExchangeFiles temp id */ -#define HFS_FIRSTUSER_CNID 16 - -/* values for hfs_cat_rec.cdrType */ -#define HFS_CDR_DIR 0x01 /* folder (directory) */ -#define HFS_CDR_FIL 0x02 /* file */ -#define HFS_CDR_THD 0x03 /* folder (directory) thread */ -#define HFS_CDR_FTH 0x04 /* file thread */ - -/* legal values for hfs_ext_key.FkType and hfs_file.fork */ -#define HFS_FK_DATA 0x00 -#define HFS_FK_RSRC 0xFF - -/* bits in hfs_fil_entry.Flags */ -#define HFS_FIL_LOCK 0x01 /* locked */ -#define HFS_FIL_THD 0x02 /* file thread */ -#define HFS_FIL_DOPEN 0x04 /* data fork open */ -#define HFS_FIL_ROPEN 0x08 /* resource fork open */ -#define HFS_FIL_DIR 0x10 /* directory (always clear) */ -#define HFS_FIL_NOCOPY 0x40 /* copy-protected file */ -#define HFS_FIL_USED 0x80 /* open */ - -/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */ -#define HFS_DIR_LOCK 0x01 /* locked */ -#define HFS_DIR_THD 0x02 /* directory thread */ -#define HFS_DIR_INEXPFOLDER 0x04 /* in a shared area */ -#define HFS_DIR_MOUNTED 0x08 /* mounted */ -#define HFS_DIR_DIR 0x10 /* directory (always set) */ -#define HFS_DIR_EXPFOLDER 0x20 /* share point */ - -/* bits hfs_finfo.fdFlags */ -#define HFS_FLG_INITED 0x0100 -#define HFS_FLG_LOCKED 0x1000 -#define HFS_FLG_INVISIBLE 0x4000 - -/*======== HFS structures as they appear on the disk ========*/ - -/* Pascal-style string of up to 31 characters */ -struct hfs_name { - u8 len; - u8 name[HFS_NAMELEN]; -} __packed; - -struct hfs_point { - __be16 v; - __be16 h; -} __packed; - -struct hfs_rect { - __be16 top; - __be16 left; - __be16 bottom; - __be16 right; -} __packed; - -struct hfs_finfo { - __be32 fdType; - __be32 fdCreator; - __be16 fdFlags; - struct hfs_point fdLocation; - __be16 fdFldr; -} __packed; - -struct hfs_fxinfo { - __be16 fdIconID; - u8 fdUnused[8]; - __be16 fdComment; - __be32 fdPutAway; -} __packed; - -struct hfs_dinfo { - struct hfs_rect frRect; - __be16 frFlags; - struct hfs_point frLocation; - __be16 frView; -} __packed; - -struct hfs_dxinfo { - struct hfs_point frScroll; - __be32 frOpenChain; - __be16 frUnused; - __be16 frComment; - __be32 frPutAway; -} __packed; - -union hfs_finder_info { - struct { - struct hfs_finfo finfo; - struct hfs_fxinfo fxinfo; - } file; - struct { - struct hfs_dinfo dinfo; - struct hfs_dxinfo dxinfo; - } dir; -} __packed; - -/* Cast to a pointer to a generic bkey */ -#define HFS_BKEY(X) (((void)((X)->KeyLen)), ((struct hfs_bkey *)(X))) - -/* The key used in the catalog b-tree: */ -struct hfs_cat_key { - u8 key_len; /* number of bytes in the key */ - u8 reserved; /* padding */ - __be32 ParID; /* CNID of the parent dir */ - struct hfs_name CName; /* The filename of the entry */ -} __packed; - -/* The key used in the extents b-tree: */ -struct hfs_ext_key { - u8 key_len; /* number of bytes in the key */ - u8 FkType; /* HFS_FK_{DATA,RSRC} */ - __be32 FNum; /* The File ID of the file */ - __be16 
FABN; /* allocation blocks number*/ -} __packed; - -typedef union hfs_btree_key { - u8 key_len; /* number of bytes in the key */ - struct hfs_cat_key cat; - struct hfs_ext_key ext; -} hfs_btree_key; - -#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8)) -#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8)) - -typedef union hfs_btree_key btree_key; - -struct hfs_extent { - __be16 block; - __be16 count; -}; -typedef struct hfs_extent hfs_extent_rec[3]; - -/* The catalog record for a file */ -struct hfs_cat_file { - s8 type; /* The type of entry */ - u8 reserved; - u8 Flags; /* Flags such as read-only */ - s8 Typ; /* file version number = 0 */ - struct hfs_finfo UsrWds; /* data used by the Finder */ - __be32 FlNum; /* The CNID */ - __be16 StBlk; /* obsolete */ - __be32 LgLen; /* The logical EOF of the data fork*/ - __be32 PyLen; /* The physical EOF of the data fork */ - __be16 RStBlk; /* obsolete */ - __be32 RLgLen; /* The logical EOF of the rsrc fork */ - __be32 RPyLen; /* The physical EOF of the rsrc fork */ - __be32 CrDat; /* The creation date */ - __be32 MdDat; /* The modified date */ - __be32 BkDat; /* The last backup date */ - struct hfs_fxinfo FndrInfo; /* more data for the Finder */ - __be16 ClpSize; /* number of bytes to allocate - when extending files */ - hfs_extent_rec ExtRec; /* first extent record - for the data fork */ - hfs_extent_rec RExtRec; /* first extent record - for the resource fork */ - u32 Resrv; /* reserved by Apple */ -} __packed; - -/* the catalog record for a directory */ -struct hfs_cat_dir { - s8 type; /* The type of entry */ - u8 reserved; - __be16 Flags; /* flags */ - __be16 Val; /* Valence: number of files and - dirs in the directory */ - __be32 DirID; /* The CNID */ - __be32 CrDat; /* The creation date */ - __be32 MdDat; /* The modification date */ - __be32 BkDat; /* The last backup date */ - struct hfs_dinfo UsrInfo; /* data used by the Finder */ - struct hfs_dxinfo FndrInfo; /* more data used by Finder */ - u8 Resrv[16]; /* reserved by Apple */ -} __packed; - -/* the catalog record for a thread */ -struct hfs_cat_thread { - s8 type; /* The type of entry */ - u8 reserved[9]; /* reserved by Apple */ - __be32 ParID; /* CNID of parent directory */ - struct hfs_name CName; /* The name of this entry */ -} __packed; - -/* A catalog tree record */ -typedef union hfs_cat_rec { - s8 type; /* The type of entry */ - struct hfs_cat_file file; - struct hfs_cat_dir dir; - struct hfs_cat_thread thread; -} hfs_cat_rec; - -struct hfs_mdb { - __be16 drSigWord; /* Signature word indicating fs type */ - __be32 drCrDate; /* fs creation date/time */ - __be32 drLsMod; /* fs modification date/time */ - __be16 drAtrb; /* fs attributes */ - __be16 drNmFls; /* number of files in root directory */ - __be16 drVBMSt; /* location (in 512-byte blocks) - of the volume bitmap */ - __be16 drAllocPtr; /* location (in allocation blocks) - to begin next allocation search */ - __be16 drNmAlBlks; /* number of allocation blocks */ - __be32 drAlBlkSiz; /* bytes in an allocation block */ - __be32 drClpSiz; /* clumpsize, the number of bytes to - allocate when extending a file */ - __be16 drAlBlSt; /* location (in 512-byte blocks) - of the first allocation block */ - __be32 drNxtCNID; /* CNID to assign to the next - file or directory created */ - __be16 drFreeBks; /* number of free allocation blocks */ - u8 drVN[28]; /* the volume label */ - __be32 drVolBkUp; /* fs backup date/time */ - __be16 drVSeqNum; /* backup sequence number */ - __be32 drWrCnt; /* fs write count 
*/ - __be32 drXTClpSiz; /* clumpsize for the extents B-tree */ - __be32 drCTClpSiz; /* clumpsize for the catalog B-tree */ - __be16 drNmRtDirs; /* number of directories in - the root directory */ - __be32 drFilCnt; /* number of files in the fs */ - __be32 drDirCnt; /* number of directories in the fs */ - u8 drFndrInfo[32]; /* data used by the Finder */ - __be16 drEmbedSigWord; /* embedded volume signature */ - __be32 drEmbedExtent; /* starting block number (xdrStABN) - and number of allocation blocks - (xdrNumABlks) occupied by embedded - volume */ - __be32 drXTFlSize; /* bytes in the extents B-tree */ - hfs_extent_rec drXTExtRec; /* extents B-tree's first 3 extents */ - __be32 drCTFlSize; /* bytes in the catalog B-tree */ - hfs_extent_rec drCTExtRec; /* catalog B-tree's first 3 extents */ -} __packed; +#include <linux/hfs_common.h> /*======== Data structures kept in memory ========*/ diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index fff149af89da..e94dbc04a1e4 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -18,7 +18,6 @@ #include <asm/byteorder.h> #include <linux/uaccess.h> -#include <linux/hfs_common.h> #include "hfs.h" @@ -140,74 +139,90 @@ struct hfs_sb_info { #define HFS_FLG_ALT_MDB_DIRTY 2 /* bitmap.c */ -extern u32 hfs_vbm_search_free(struct super_block *, u32, u32 *); -extern int hfs_clear_vbm_bits(struct super_block *, u16, u16); +extern u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits); +extern int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count); /* catalog.c */ -extern int hfs_cat_keycmp(const btree_key *, const btree_key *); +extern int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2); struct hfs_find_data; -extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *); -extern int hfs_cat_create(u32, struct inode *, const struct qstr *, struct inode *); -extern int hfs_cat_delete(u32, struct inode *, const struct qstr *); -extern int hfs_cat_move(u32, struct inode *, const struct qstr *, - struct inode *, const struct qstr *); -extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, const struct qstr *); +extern int hfs_cat_find_brec(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd); +extern int hfs_cat_create(u32 cnid, struct inode *dir, + const struct qstr *str, struct inode *inode); +extern int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str); +extern int hfs_cat_move(u32 cnid, struct inode *src_dir, + const struct qstr *src_name, + struct inode *dst_dir, + const struct qstr *dst_name); +extern void hfs_cat_build_key(struct super_block *sb, btree_key *key, + u32 parent, const struct qstr *name); /* dir.c */ extern const struct file_operations hfs_dir_operations; extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ -extern int hfs_ext_keycmp(const btree_key *, const btree_key *); +extern int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2); extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off); -extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); -extern int hfs_ext_write_extent(struct inode *); -extern int hfs_extend_file(struct inode *); -extern void hfs_file_truncate(struct inode *); +extern int hfs_free_fork(struct super_block *sb, + struct hfs_cat_file *file, int type); +extern int hfs_ext_write_extent(struct inode *inode); +extern int hfs_extend_file(struct inode *inode); +extern void hfs_file_truncate(struct inode *inode); -extern int hfs_get_block(struct inode *, sector_t, 
struct buffer_head *, int); +extern int hfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create); /* inode.c */ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata); -extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); -extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); -extern int hfs_write_inode(struct inode *, struct writeback_control *); -extern int hfs_inode_setattr(struct mnt_idmap *, struct dentry *, - struct iattr *); + loff_t pos, unsigned int len, struct folio **foliop, + void **fsdata); +extern struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, + umode_t mode); +extern void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, + __be32 *log_size, __be32 *phys_size); +extern int hfs_write_inode(struct inode *inode, struct writeback_control *wbc); +extern int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, - __be32 log_size, __be32 phys_size, u32 clump_size); -extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); -extern void hfs_evict_inode(struct inode *); -extern void hfs_delete_inode(struct inode *); + __be32 __log_size, __be32 phys_size, + u32 clump_size); +extern struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, + hfs_cat_rec *rec); +extern void hfs_evict_inode(struct inode *inode); +extern void hfs_delete_inode(struct inode *inode); /* attr.c */ extern const struct xattr_handler * const hfs_xattr_handlers[]; /* mdb.c */ -extern int hfs_mdb_get(struct super_block *); -extern void hfs_mdb_commit(struct super_block *); -extern void hfs_mdb_close(struct super_block *); -extern void hfs_mdb_put(struct super_block *); +extern int hfs_mdb_get(struct super_block *sb); +extern void hfs_mdb_commit(struct super_block *sb); +extern void hfs_mdb_close(struct super_block *sb); +extern void hfs_mdb_put(struct super_block *sb); /* part_tbl.c */ -extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); +extern int hfs_part_find(struct super_block *sb, + sector_t *part_start, sector_t *part_size); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; -extern int hfs_hash_dentry(const struct dentry *, struct qstr *); -extern int hfs_strcmp(const unsigned char *, unsigned int, - const unsigned char *, unsigned int); +extern int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this); +extern int hfs_strcmp(const unsigned char *s1, unsigned int len1, + const unsigned char *s2, unsigned int len2); extern int hfs_compare_dentry(const struct dentry *dentry, - unsigned int len, const char *str, const struct qstr *name); + unsigned int len, const char *str, + const struct qstr *name); /* trans.c */ -extern void hfs_asc2mac(struct super_block *, struct hfs_name *, const struct qstr *); -extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); +extern void hfs_asc2mac(struct super_block *sb, + struct hfs_name *out, const struct qstr *in); +extern int hfs_mac2asc(struct super_block *sb, + char *out, const struct hfs_name *in); /* super.c */ extern void hfs_mark_mdb_dirty(struct super_block *sb); diff --git a/fs/hfs/inode.c 
b/fs/hfs/inode.c index 9cd449913dc8..524db1389737 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -45,7 +45,8 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) + loff_t pos, unsigned int len, struct folio **foliop, + void **fsdata) { int ret; @@ -412,7 +413,7 @@ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_ return NULL; } inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); - if (inode && (inode->i_state & I_NEW)) + if (inode && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; } diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 3912209153a8..0cfa35e82abc 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -16,6 +16,8 @@ #include "hfs_fs.h" #include <linux/dcache.h> +#include <kunit/visibility.h> + /*================ File-local variables ================*/ /* @@ -65,6 +67,7 @@ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) this->hash = end_name_hash(hash); return 0; } +EXPORT_SYMBOL_IF_KUNIT(hfs_hash_dentry); /* * Compare two strings in the HFS filename character ordering @@ -87,6 +90,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, } return len1 - len2; } +EXPORT_SYMBOL_IF_KUNIT(hfs_strcmp); /* * Test for equality of two strings in the HFS filename character ordering. @@ -112,3 +116,4 @@ int hfs_compare_dentry(const struct dentry *dentry, } return 0; } +EXPORT_SYMBOL_IF_KUNIT(hfs_compare_dentry); diff --git a/fs/hfs/string_test.c b/fs/hfs/string_test.c new file mode 100644 index 000000000000..e1bf6f954312 --- /dev/null +++ b/fs/hfs/string_test.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for HFS string operations + * + * Copyright (C) 2025 Viacheslav Dubeyko <slava@dubeyko.com> + */ + +#include <kunit/test.h> +#include <linux/dcache.h> +#include "hfs_fs.h" + +/* Test hfs_strcmp function */ +static void hfs_strcmp_test(struct kunit *test) +{ + /* Test equal strings */ + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("hello", 5, "hello", 5)); + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("test", 4, "test", 4)); + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("", 0, "", 0)); + + /* Test unequal strings */ + KUNIT_EXPECT_NE(test, 0, hfs_strcmp("hello", 5, "world", 5)); + KUNIT_EXPECT_NE(test, 0, hfs_strcmp("test", 4, "testing", 7)); + + /* Test different lengths */ + KUNIT_EXPECT_LT(test, hfs_strcmp("test", 4, "testing", 7), 0); + KUNIT_EXPECT_GT(test, hfs_strcmp("testing", 7, "test", 4), 0); + + /* Test case insensitive comparison (HFS should handle case) */ + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("Test", 4, "TEST", 4)); + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("hello", 5, "HELLO", 5)); + + /* Test with special characters */ + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("file.txt", 8, "file.txt", 8)); + KUNIT_EXPECT_NE(test, 0, hfs_strcmp("file.txt", 8, "file.dat", 8)); + + /* Test boundary cases */ + KUNIT_EXPECT_EQ(test, 0, hfs_strcmp("a", 1, "a", 1)); + KUNIT_EXPECT_NE(test, 0, hfs_strcmp("a", 1, "b", 1)); +} + +/* Test hfs_hash_dentry function */ +static void hfs_hash_dentry_test(struct kunit *test) +{ + struct qstr test_name1, test_name2, test_name3; + struct dentry dentry = {}; + char name1[] = "testfile"; + char name2[] = "TestFile"; + char name3[] = "different"; + + /* Initialize test strings */ + test_name1.name = name1; + test_name1.len = strlen(name1); + test_name1.hash = 0; + + test_name2.name = 
name2; + test_name2.len = strlen(name2); + test_name2.hash = 0; + + test_name3.name = name3; + test_name3.len = strlen(name3); + test_name3.hash = 0; + + /* Test hashing */ + KUNIT_EXPECT_EQ(test, 0, hfs_hash_dentry(&dentry, &test_name1)); + KUNIT_EXPECT_EQ(test, 0, hfs_hash_dentry(&dentry, &test_name2)); + KUNIT_EXPECT_EQ(test, 0, hfs_hash_dentry(&dentry, &test_name3)); + + /* Case insensitive names should hash the same */ + KUNIT_EXPECT_EQ(test, test_name1.hash, test_name2.hash); + + /* Different names should have different hashes */ + KUNIT_EXPECT_NE(test, test_name1.hash, test_name3.hash); +} + +/* Test hfs_compare_dentry function */ +static void hfs_compare_dentry_test(struct kunit *test) +{ + struct qstr test_name; + struct dentry dentry = {}; + char name[] = "TestFile"; + + test_name.name = name; + test_name.len = strlen(name); + + /* Test exact match */ + KUNIT_EXPECT_EQ(test, 0, hfs_compare_dentry(&dentry, 8, + "TestFile", &test_name)); + + /* Test case insensitive match */ + KUNIT_EXPECT_EQ(test, 0, hfs_compare_dentry(&dentry, 8, + "testfile", &test_name)); + KUNIT_EXPECT_EQ(test, 0, hfs_compare_dentry(&dentry, 8, + "TESTFILE", &test_name)); + + /* Test different names */ + KUNIT_EXPECT_EQ(test, 1, hfs_compare_dentry(&dentry, 8, + "DiffFile", &test_name)); + + /* Test different lengths */ + KUNIT_EXPECT_EQ(test, 1, hfs_compare_dentry(&dentry, 7, + "TestFil", &test_name)); + KUNIT_EXPECT_EQ(test, 1, hfs_compare_dentry(&dentry, 9, + "TestFiles", &test_name)); + + /* Test empty string */ + test_name.name = ""; + test_name.len = 0; + KUNIT_EXPECT_EQ(test, 0, hfs_compare_dentry(&dentry, 0, "", &test_name)); + + /* Test HFS_NAMELEN boundary */ + test_name.name = "This_is_a_very_long_filename_that_exceeds_normal_limits"; + test_name.len = strlen(test_name.name); + KUNIT_EXPECT_EQ(test, 0, hfs_compare_dentry(&dentry, HFS_NAMELEN, + "This_is_a_very_long_filename_th", &test_name)); +} + +static struct kunit_case hfs_string_test_cases[] = { + KUNIT_CASE(hfs_strcmp_test), + KUNIT_CASE(hfs_hash_dentry_test), + KUNIT_CASE(hfs_compare_dentry_test), + {} +}; + +static struct kunit_suite hfs_string_test_suite = { + .name = "hfs_string", + .test_cases = hfs_string_test_cases, +}; + +kunit_test_suite(hfs_string_test_suite); + +MODULE_DESCRIPTION("KUnit tests for HFS string operations"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); diff --git a/fs/hfsplus/.kunitconfig b/fs/hfsplus/.kunitconfig new file mode 100644 index 000000000000..6c96dc7e872c --- /dev/null +++ b/fs/hfsplus/.kunitconfig @@ -0,0 +1,8 @@ +CONFIG_KUNIT=y +CONFIG_HFSPLUS_FS=y +CONFIG_HFSPLUS_KUNIT_TEST=y +CONFIG_BLOCK=y +CONFIG_BUFFER_HEAD=y +CONFIG_NLS=y +CONFIG_NLS_UTF8=y +CONFIG_LEGACY_DIRECT_IO=y diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8ce4a33a9ac7..ca8401cb6954 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -14,3 +14,18 @@ config HFSPLUS_FS MacOS 8. It includes all Mac specific filesystem data such as data forks and creator codes, but it also has several UNIX style features such as file ownership and permissions. + +config HFSPLUS_KUNIT_TEST + tristate "KUnit tests for HFS+ filesystem" if !KUNIT_ALL_TESTS + depends on HFSPLUS_FS && KUNIT + default KUNIT_ALL_TESTS + help + This builds KUnit tests for the HFS+ filesystem. + + KUnit tests run during boot and output the results to the debug + log in TAP format (https://testanything.org/). Only useful for + kernel devs running KUnit test harness and are not for inclusion + into a production build. 
+ + For more information on KUnit and unit tests in general please + refer to the KUnit documentation in Documentation/dev-tools/kunit/. diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 9ed20e64b983..f2a9ae697e81 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -8,3 +8,6 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o + +# KUnit tests +obj-$(CONFIG_HFSPLUS_KUNIT_TEST) += unicode_test.o diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index afc9c89e8c6a..336d654861c5 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -210,7 +210,7 @@ release: return res; } -int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) +int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len) { int res; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 63e652ad1e0d..191661af9677 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -20,10 +20,10 @@ /* Copy a specified range of bytes from the raw data of a node */ -void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) +void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len) { struct page **pagep; - int l; + u32 l; if (!is_bnode_offset_valid(node, off)) return; @@ -31,7 +31,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, len %d\n", + "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; @@ -43,17 +43,17 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; - l = min_t(int, len, PAGE_SIZE - off); + l = min_t(u32, len, PAGE_SIZE - off); memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; - l = min_t(int, len, PAGE_SIZE); + l = min_t(u32, len, PAGE_SIZE); memcpy_from_page(buf, *++pagep, 0, l); } } -u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) +u16 hfs_bnode_read_u16(struct hfs_bnode *node, u32 off) { __be16 data; /* TODO: optimize later... */ @@ -61,7 +61,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) return be16_to_cpu(data); } -u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) +u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off) { u8 data; /* TODO: optimize later... 
*/ @@ -69,10 +69,10 @@ u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) return data; } -void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off) { struct hfs_btree *tree; - int key_len; + u32 key_len; tree = node->tree; if (node->type == HFS_NODE_LEAF || @@ -84,17 +84,17 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { memset(key, 0, sizeof(hfsplus_btree_key)); - pr_err("hfsplus: Invalid key length: %d\n", key_len); + pr_err("hfsplus: Invalid key length: %u\n", key_len); return; } hfs_bnode_read(node, key, off, key_len); } -void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) +void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len) { struct page **pagep; - int l; + u32 l; if (!is_bnode_offset_valid(node, off)) return; @@ -102,7 +102,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, len %d\n", + "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; @@ -114,29 +114,29 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; - l = min_t(int, len, PAGE_SIZE - off); + l = min_t(u32, len, PAGE_SIZE - off); memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); while ((len -= l) != 0) { buf += l; - l = min_t(int, len, PAGE_SIZE); + l = min_t(u32, len, PAGE_SIZE); memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); } } -void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) +void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data) { __be16 v = cpu_to_be16(data); /* TODO: optimize later... 
*/ hfs_bnode_write(node, &v, off, 2); } -void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) +void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len) { struct page **pagep; - int l; + u32 l; if (!is_bnode_offset_valid(node, off)) return; @@ -144,7 +144,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) if (len == 0) { pr_err("requested zero length: " "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, len %d\n", + "node_size %u, offset %u, len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len); return; @@ -156,22 +156,22 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; - l = min_t(int, len, PAGE_SIZE - off); + l = min_t(u32, len, PAGE_SIZE - off); memzero_page(*pagep, off, l); set_page_dirty(*pagep); while ((len -= l) != 0) { - l = min_t(int, len, PAGE_SIZE); + l = min_t(u32, len, PAGE_SIZE); memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); } } -void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, - struct hfs_bnode *src_node, int src, int len) +void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, + struct hfs_bnode *src_node, u32 src, u32 len) { struct page **src_page, **dst_page; - int l; + u32 l; hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) @@ -188,12 +188,12 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, dst &= ~PAGE_MASK; if (src == dst) { - l = min_t(int, len, PAGE_SIZE - src); + l = min_t(u32, len, PAGE_SIZE - src); memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); while ((len -= l) != 0) { - l = min_t(int, len, PAGE_SIZE); + l = min_t(u32, len, PAGE_SIZE); memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); } @@ -225,11 +225,11 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } } -void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) +void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len) { struct page **src_page, **dst_page; void *src_ptr, *dst_ptr; - int l; + u32 l; hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) @@ -299,7 +299,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) dst &= ~PAGE_MASK; if (src == dst) { - l = min_t(int, len, PAGE_SIZE - src); + l = min_t(u32, len, PAGE_SIZE - src); dst_ptr = kmap_local_page(*dst_page) + src; src_ptr = kmap_local_page(*src_page) + src; @@ -309,7 +309,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) kunmap_local(dst_ptr); while ((len -= l) != 0) { - l = min_t(int, len, PAGE_SIZE); + l = min_t(u32, len, PAGE_SIZE); dst_ptr = kmap_local_page(*++dst_page); src_ptr = kmap_local_page(*++src_page); memmove(dst_ptr, src_ptr, l); @@ -481,6 +481,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) tree->node_hash[hash] = node; tree->node_hash_cnt++; } else { + hfs_bnode_get(node2); spin_unlock(&tree->hash_lock); kfree(node); wait_event(node2->lock_wq, @@ -704,6 +705,5 @@ bool hfs_bnode_need_zeroout(struct hfs_btree *tree) struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes); - return tree->cnid == HFSPLUS_CAT_CNID && - volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX; + return volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX; } diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index b4645102feec..6796c1a80e99 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -60,7 +60,7 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 
rec) return retval; } -int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, u32 entry_len) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node; diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 7cc5aea14572..229f25dc7c49 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -344,7 +344,7 @@ static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) } /* Make sure @tree has enough space for the @rsvd_nodes */ -int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) +int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes) { struct inode *inode = tree->inode; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 1b3e27a0d5e0..cadf0b5f9342 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -552,8 +552,13 @@ static int hfsplus_rename(struct mnt_idmap *idmap, res = hfsplus_rename_cat((u32)(unsigned long)old_dentry->d_fsdata, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); - if (!res) + if (!res) { new_dentry->d_fsdata = old_dentry->d_fsdata; + + res = hfsplus_cat_write_inode(old_dir); + if (!res) + res = hfsplus_cat_write_inode(new_dir); + } return res; } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 89e8b19c127b..45fe3a12ecba 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -16,7 +16,6 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/fs_context.h> -#include <linux/hfs_common.h> #include "hfsplus_raw.h" /* Runtime config options */ @@ -357,21 +356,21 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); void hfs_btree_close(struct hfs_btree *tree); int hfs_btree_write(struct hfs_btree *tree); -int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes); +int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes); struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); void hfs_bmap_free(struct hfs_bnode *node); /* bnode.c */ -void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len); -u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off); -u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off); -void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off); -void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len); -void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data); -void hfs_bnode_clear(struct hfs_bnode *node, int off, int len); -void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, - struct hfs_bnode *src_node, int src, int len); -void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len); +void hfs_bnode_read(struct hfs_bnode *node, void *buf, u32 off, u32 len); +u16 hfs_bnode_read_u16(struct hfs_bnode *node, u32 off); +u8 hfs_bnode_read_u8(struct hfs_bnode *node, u32 off); +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, u32 off); +void hfs_bnode_write(struct hfs_bnode *node, void *buf, u32 off, u32 len); +void hfs_bnode_write_u16(struct hfs_bnode *node, u32 off, u16 data); +void hfs_bnode_clear(struct hfs_bnode *node, u32 off, u32 len); +void hfs_bnode_copy(struct hfs_bnode *dst_node, u32 dst, + struct hfs_bnode *src_node, u32 src, u32 len); +void hfs_bnode_move(struct hfs_bnode *node, u32 dst, u32 src, u32 len); void hfs_bnode_dump(struct hfs_bnode *node); void hfs_bnode_unlink(struct hfs_bnode *node); struct hfs_bnode *hfs_bnode_findhash(struct 
hfs_btree *tree, u32 cnid); @@ -386,7 +385,7 @@ bool hfs_bnode_need_zeroout(struct hfs_btree *tree); /* brec.c */ u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off); u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec); -int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len); +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, u32 entry_len); int hfs_brec_remove(struct hfs_find_data *fd); /* bfind.c */ @@ -399,7 +398,7 @@ int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found); int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare); -int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len); +int hfs_brec_read(struct hfs_find_data *fd, void *rec, u32 rec_len); int hfs_brec_goto(struct hfs_find_data *fd, int cnt); /* catalog.c */ @@ -477,6 +476,8 @@ int hfs_part_find(struct super_block *sb, sector_t *part_start, /* super.c */ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino); void hfsplus_mark_mdb_dirty(struct super_block *sb); +void hfsplus_prepare_volume_header_for_commit(struct hfsplus_vh *vhdr); +int hfsplus_commit_superblock(struct super_block *sb); /* tables.c */ extern u16 hfsplus_case_fold_table[]; @@ -549,14 +550,14 @@ hfsplus_btree_lock_class(struct hfs_btree *tree) } static inline -bool is_bnode_offset_valid(struct hfs_bnode *node, int off) +bool is_bnode_offset_valid(struct hfs_bnode *node, u32 off) { bool is_valid = off < node->tree->node_size; if (!is_valid) { pr_err("requested invalid offset: " "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d\n", + "node_size %u, offset %u\n", node->this, node->type, node->height, node->tree->node_size, off); } @@ -565,7 +566,7 @@ bool is_bnode_offset_valid(struct hfs_bnode *node, int off) } static inline -int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) +u32 check_and_correct_requested_length(struct hfs_bnode *node, u32 off, u32 len) { unsigned int node_size; @@ -575,12 +576,12 @@ int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) node_size = node->tree->node_size; if ((off + len) > node_size) { - int new_len = (int)node_size - off; + u32 new_len = node_size - off; pr_err("requested length has been corrected: " "NODE: id %u, type %#x, height %u, " - "node_size %u, offset %d, " - "requested_len %d, corrected_len %d\n", + "node_size %u, offset %u, " + "requested_len %u, corrected_len %u\n", node->this, node->type, node->height, node->tree->node_size, off, len, new_len); diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 68b4240c6191..83b5dbde924b 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -15,398 +15,6 @@ #define _LINUX_HFSPLUS_RAW_H #include <linux/types.h> - -/* Some constants */ -#define HFSPLUS_SECTOR_SIZE 512 -#define HFSPLUS_SECTOR_SHIFT 9 -#define HFSPLUS_VOLHEAD_SECTOR 2 -#define HFSPLUS_VOLHEAD_SIG 0x482b -#define HFSPLUS_VOLHEAD_SIGX 0x4858 -#define HFSPLUS_SUPER_MAGIC 0x482b -#define HFSPLUS_MIN_VERSION 4 -#define HFSPLUS_CURRENT_VERSION 5 - -#define HFSP_WRAP_MAGIC 0x4244 -#define HFSP_WRAP_ATTRIB_SLOCK 0x8000 -#define HFSP_WRAP_ATTRIB_SPARED 0x0200 - -#define HFSP_WRAPOFF_SIG 0x00 -#define HFSP_WRAPOFF_ATTRIB 0x0A -#define HFSP_WRAPOFF_ABLKSIZE 0x14 -#define HFSP_WRAPOFF_ABLKSTART 0x1C -#define HFSP_WRAPOFF_EMBEDSIG 0x7C -#define HFSP_WRAPOFF_EMBEDEXT 0x7E - -#define HFSP_HIDDENDIR_NAME \ - 
"\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" - -#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ -#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ - -#define HFSP_SYMLINK_TYPE 0x736c6e6b /* 'slnk' */ -#define HFSP_SYMLINK_CREATOR 0x72686170 /* 'rhap' */ - -#define HFSP_MOUNT_VERSION 0x482b4c78 /* 'H+Lx' */ - -/* Structures used on disk */ - -typedef __be32 hfsplus_cnid; -typedef __be16 hfsplus_unichr; - -#define HFSPLUS_MAX_STRLEN 255 -#define HFSPLUS_ATTR_MAX_STRLEN 127 - -/* A "string" as used in filenames, etc. */ -struct hfsplus_unistr { - __be16 length; - hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN]; -} __packed; - -/* - * A "string" is used in attributes file - * for name of extended attribute - */ -struct hfsplus_attr_unistr { - __be16 length; - hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; -} __packed; - -/* POSIX permissions */ -struct hfsplus_perm { - __be32 owner; - __be32 group; - u8 rootflags; - u8 userflags; - __be16 mode; - __be32 dev; -} __packed; - -#define HFSPLUS_FLG_NODUMP 0x01 -#define HFSPLUS_FLG_IMMUTABLE 0x02 -#define HFSPLUS_FLG_APPEND 0x04 - -/* A single contiguous area of a file */ -struct hfsplus_extent { - __be32 start_block; - __be32 block_count; -} __packed; -typedef struct hfsplus_extent hfsplus_extent_rec[8]; - -/* Information for a "Fork" in a file */ -struct hfsplus_fork_raw { - __be64 total_size; - __be32 clump_size; - __be32 total_blocks; - hfsplus_extent_rec extents; -} __packed; - -/* HFS+ Volume Header */ -struct hfsplus_vh { - __be16 signature; - __be16 version; - __be32 attributes; - __be32 last_mount_vers; - u32 reserved; - - __be32 create_date; - __be32 modify_date; - __be32 backup_date; - __be32 checked_date; - - __be32 file_count; - __be32 folder_count; - - __be32 blocksize; - __be32 total_blocks; - __be32 free_blocks; - - __be32 next_alloc; - __be32 rsrc_clump_sz; - __be32 data_clump_sz; - hfsplus_cnid next_cnid; - - __be32 write_count; - __be64 encodings_bmp; - - u32 finder_info[8]; - - struct hfsplus_fork_raw alloc_file; - struct hfsplus_fork_raw ext_file; - struct hfsplus_fork_raw cat_file; - struct hfsplus_fork_raw attr_file; - struct hfsplus_fork_raw start_file; -} __packed; - -/* HFS+ volume attributes */ -#define HFSPLUS_VOL_UNMNT (1 << 8) -#define HFSPLUS_VOL_SPARE_BLK (1 << 9) -#define HFSPLUS_VOL_NOCACHE (1 << 10) -#define HFSPLUS_VOL_INCNSTNT (1 << 11) -#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) -#define HFSPLUS_VOL_JOURNALED (1 << 13) -#define HFSPLUS_VOL_SOFTLOCK (1 << 15) -#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31) - -/* HFS+ BTree node descriptor */ -struct hfs_bnode_desc { - __be32 next; - __be32 prev; - s8 type; - u8 height; - __be16 num_recs; - u16 reserved; -} __packed; - -/* HFS+ BTree node types */ -#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ -#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ -#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ -#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ - -/* HFS+ BTree header */ -struct hfs_btree_header_rec { - __be16 depth; - __be32 root; - __be32 leaf_count; - __be32 leaf_head; - __be32 leaf_tail; - __be16 node_size; - __be16 max_key_len; - __be32 node_count; - __be32 free_nodes; - u16 reserved1; - __be32 clump_size; - u8 btree_type; - u8 key_type; - __be32 attributes; - u32 reserved3[16]; -} __packed; - -/* BTree attributes */ -#define HFS_TREE_BIGKEYS 2 -#define HFS_TREE_VARIDXKEYS 4 - -/* HFS+ BTree misc info */ -#define HFSPLUS_TREE_HEAD 0 -#define HFSPLUS_NODE_MXSZ 32768 -#define 
HFSPLUS_ATTR_TREE_NODE_SIZE 8192 -#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 -#define HFSPLUS_BTREE_HDR_USER_BYTES 128 - -/* Some special File ID numbers (stolen from hfs.h) */ -#define HFSPLUS_POR_CNID 1 /* Parent Of the Root */ -#define HFSPLUS_ROOT_CNID 2 /* ROOT directory */ -#define HFSPLUS_EXT_CNID 3 /* EXTents B-tree */ -#define HFSPLUS_CAT_CNID 4 /* CATalog B-tree */ -#define HFSPLUS_BAD_CNID 5 /* BAD blocks file */ -#define HFSPLUS_ALLOC_CNID 6 /* ALLOCation file */ -#define HFSPLUS_START_CNID 7 /* STARTup file */ -#define HFSPLUS_ATTR_CNID 8 /* ATTRibutes file */ -#define HFSPLUS_EXCH_CNID 15 /* ExchangeFiles temp id */ -#define HFSPLUS_FIRSTUSER_CNID 16 /* first available user id */ - -/* btree key type */ -#define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ -#define HFSPLUS_KEY_BINARY 0xBC /* case-sensitive */ - -/* HFS+ catalog entry key */ -struct hfsplus_cat_key { - __be16 key_len; - hfsplus_cnid parent; - struct hfsplus_unistr name; -} __packed; - -#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) - -/* Structs from hfs.h */ -struct hfsp_point { - __be16 v; - __be16 h; -} __packed; - -struct hfsp_rect { - __be16 top; - __be16 left; - __be16 bottom; - __be16 right; -} __packed; - - -/* HFS directory info (stolen from hfs.h */ -struct DInfo { - struct hfsp_rect frRect; - __be16 frFlags; - struct hfsp_point frLocation; - __be16 frView; -} __packed; - -struct DXInfo { - struct hfsp_point frScroll; - __be32 frOpenChain; - __be16 frUnused; - __be16 frComment; - __be32 frPutAway; -} __packed; - -/* HFS+ folder data (part of an hfsplus_cat_entry) */ -struct hfsplus_cat_folder { - __be16 type; - __be16 flags; - __be32 valence; - hfsplus_cnid id; - __be32 create_date; - __be32 content_mod_date; - __be32 attribute_mod_date; - __be32 access_date; - __be32 backup_date; - struct hfsplus_perm permissions; - struct_group_attr(info, __packed, - struct DInfo user_info; - struct DXInfo finder_info; - ); - __be32 text_encoding; - __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. 
*/ -} __packed; - -/* HFS file info (stolen from hfs.h) */ -struct FInfo { - __be32 fdType; - __be32 fdCreator; - __be16 fdFlags; - struct hfsp_point fdLocation; - __be16 fdFldr; -} __packed; - -struct FXInfo { - __be16 fdIconID; - u8 fdUnused[8]; - __be16 fdComment; - __be32 fdPutAway; -} __packed; - -/* HFS+ file data (part of a cat_entry) */ -struct hfsplus_cat_file { - __be16 type; - __be16 flags; - u32 reserved1; - hfsplus_cnid id; - __be32 create_date; - __be32 content_mod_date; - __be32 attribute_mod_date; - __be32 access_date; - __be32 backup_date; - struct hfsplus_perm permissions; - struct_group_attr(info, __packed, - struct FInfo user_info; - struct FXInfo finder_info; - ); - __be32 text_encoding; - u32 reserved2; - - struct hfsplus_fork_raw data_fork; - struct hfsplus_fork_raw rsrc_fork; -} __packed; - -/* File and folder flag bits */ -#define HFSPLUS_FILE_LOCKED 0x0001 -#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 -#define HFSPLUS_XATTR_EXISTS 0x0004 -#define HFSPLUS_ACL_EXISTS 0x0008 -#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count - * (HFSX only) */ - -/* HFS+ catalog thread (part of a cat_entry) */ -struct hfsplus_cat_thread { - __be16 type; - s16 reserved; - hfsplus_cnid parentID; - struct hfsplus_unistr nodeName; -} __packed; - -#define HFSPLUS_MIN_THREAD_SZ 10 - -/* A data record in the catalog tree */ -typedef union { - __be16 type; - struct hfsplus_cat_folder folder; - struct hfsplus_cat_file file; - struct hfsplus_cat_thread thread; -} __packed hfsplus_cat_entry; - -/* HFS+ catalog entry type */ -#define HFSPLUS_FOLDER 0x0001 -#define HFSPLUS_FILE 0x0002 -#define HFSPLUS_FOLDER_THREAD 0x0003 -#define HFSPLUS_FILE_THREAD 0x0004 - -/* HFS+ extents tree key */ -struct hfsplus_ext_key { - __be16 key_len; - u8 fork_type; - u8 pad; - hfsplus_cnid cnid; - __be32 start_block; -} __packed; - -#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) - -#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" -#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" - -#define HFSPLUS_ATTR_INLINE_DATA 0x10 -#define HFSPLUS_ATTR_FORK_DATA 0x20 -#define HFSPLUS_ATTR_EXTENTS 0x30 - -/* HFS+ attributes tree key */ -struct hfsplus_attr_key { - __be16 key_len; - __be16 pad; - hfsplus_cnid cnid; - __be32 start_block; - struct hfsplus_attr_unistr key_name; -} __packed; - -#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key) - -/* HFS+ fork data attribute */ -struct hfsplus_attr_fork_data { - __be32 record_type; - __be32 reserved; - struct hfsplus_fork_raw the_fork; -} __packed; - -/* HFS+ extension attribute */ -struct hfsplus_attr_extents { - __be32 record_type; - __be32 reserved; - struct hfsplus_extent extents; -} __packed; - -#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 - -/* HFS+ attribute inline data */ -struct hfsplus_attr_inline_data { - __be32 record_type; - __be32 reserved1; - u8 reserved2[6]; - __be16 length; - u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; -} __packed; - -/* A data record in the attributes tree */ -typedef union { - __be32 record_type; - struct hfsplus_attr_fork_data fork_data; - struct hfsplus_attr_extents extents; - struct hfsplus_attr_inline_data inline_data; -} __packed hfsplus_attr_entry; - -/* HFS+ generic BTree key */ -typedef union { - __be16 key_len; - struct hfsplus_cat_key cat; - struct hfsplus_ext_key ext; - struct hfsplus_attr_key attr; -} __packed hfsplus_btree_key; +#include <linux/hfs_common.h> #endif diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b51a411ecd23..7ae6745ca7ae 100644 --- 
a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -180,13 +180,29 @@ const struct dentry_operations hfsplus_dentry_operations = { .d_compare = hfsplus_compare_dentry, }; -static void hfsplus_get_perms(struct inode *inode, - struct hfsplus_perm *perms, int dir) +static int hfsplus_get_perms(struct inode *inode, + struct hfsplus_perm *perms, int dir) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); u16 mode; mode = be16_to_cpu(perms->mode); + if (dir) { + if (mode && !S_ISDIR(mode)) + goto bad_type; + } else if (mode) { + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFLNK: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + break; + default: + goto bad_type; + } + } i_uid_write(inode, be32_to_cpu(perms->owner)); if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) @@ -212,6 +228,10 @@ static void hfsplus_get_perms(struct inode *inode, inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; + return 0; +bad_type: + pr_err("invalid file type 0%04o for inode %lu\n", mode, inode->i_ino); + return -EIO; } static int hfsplus_file_open(struct inode *inode, struct file *file) @@ -305,6 +325,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file->f_mapping->host; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; int error = 0, error2; error = file_write_and_wait_range(file, start, end); @@ -348,6 +369,14 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = error2; } + mutex_lock(&sbi->vh_mutex); + hfsplus_prepare_volume_header_for_commit(vhdr); + mutex_unlock(&sbi->vh_mutex); + + error2 = hfsplus_commit_superblock(inode->i_sb); + if (!error) + error = error2; + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev); @@ -516,7 +545,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); - hfsplus_get_perms(inode, &folder->permissions, 1); + res = hfsplus_get_perms(inode, &folder->permissions, 1); + if (res) + goto out; set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date)); @@ -545,7 +576,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? 
&file->rsrc_fork : &file->data_fork); - hfsplus_get_perms(inode, &file->permissions, 0); + res = hfsplus_get_perms(inode, &file->permissions, 0); + if (res) + goto out; set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index a66a09a56bf7..9b377481f397 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nls.h> diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 16bc4abc67e0..aaffa9e060a0 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -65,7 +65,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; atomic_set(&HFSPLUS_I(inode)->opencnt, 0); @@ -187,40 +187,15 @@ static void hfsplus_evict_inode(struct inode *inode) } } -static int hfsplus_sync_fs(struct super_block *sb, int wait) +int hfsplus_commit_superblock(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_vh *vhdr = sbi->s_vhdr; int write_backup = 0; - int error, error2; - - if (!wait) - return 0; + int error = 0, error2; hfs_dbg("starting...\n"); - /* - * Explicitly write out the special metadata inodes. - * - * While these special inodes are marked as hashed and written - * out peridocically by the flusher threads we redirty them - * during writeout of normal inodes, and thus the life lock - * prevents us from getting the latest state to disk. - */ - error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); - error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); - if (!error) - error = error2; - if (sbi->attr_tree) { - error2 = - filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); - if (!error) - error = error2; - } - error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); - if (!error) - error = error2; - mutex_lock(&sbi->vh_mutex); mutex_lock(&sbi->alloc_mutex); vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); @@ -249,11 +224,52 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) - error2 = error; + error = error2; out: mutex_unlock(&sbi->alloc_mutex); mutex_unlock(&sbi->vh_mutex); + hfs_dbg("finished: err %d\n", error); + + return error; +} + +static int hfsplus_sync_fs(struct super_block *sb, int wait) +{ + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + int error, error2; + + if (!wait) + return 0; + + hfs_dbg("starting...\n"); + + /* + * Explicitly write out the special metadata inodes. + * + * While these special inodes are marked as hashed and written + * out periodically by the flusher threads we redirty them + * during writeout of normal inodes, and thus the livelock + * prevents us from getting the latest state to disk.
+ */ + error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); + error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); + if (!error) + error = error2; + if (sbi->attr_tree) { + error2 = + filemap_write_and_wait(sbi->attr_tree->inode->i_mapping); + if (!error) + error = error2; + } + error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); + if (!error) + error = error2; + + error2 = hfsplus_commit_superblock(sb); + if (!error) + error = error2; + if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); @@ -395,6 +411,15 @@ static const struct super_operations hfsplus_sops = { .show_options = hfsplus_show_options, }; +void hfsplus_prepare_volume_header_for_commit(struct hfsplus_vh *vhdr) +{ + vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); + vhdr->modify_date = hfsp_now2mt(); + be32_add_cpu(&vhdr->write_count, 1); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); +} + static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfsplus_vh *vhdr; @@ -562,11 +587,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused * all three are registered with Apple for our use */ - vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); - vhdr->modify_date = hfsp_now2mt(); - be32_add_cpu(&vhdr->write_count, 1); - vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); - vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); + hfsplus_prepare_volume_header_for_commit(vhdr); hfsplus_sync_fs(sb, 1); if (!sbi->hidden_dir) { diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 11e08a4a18b2..d3a142f4518b 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -11,6 +11,9 @@ #include <linux/types.h> #include <linux/nls.h> + +#include <kunit/visibility.h> + #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -72,6 +75,7 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, return 0; } } +EXPORT_SYMBOL_IF_KUNIT(hfsplus_strcasecmp); /* Compare names as a sequence of 16-bit unsigned integers */ int hfsplus_strcmp(const struct hfsplus_unistr *s1, @@ -110,7 +114,7 @@ int hfsplus_strcmp(const struct hfsplus_unistr *s1, return len1 < len2 ? -1 : len1 > len2 ? 1 : 0; } - +EXPORT_SYMBOL_IF_KUNIT(hfsplus_strcmp); #define Hangul_SBase 0xac00 #define Hangul_LBase 0x1100 @@ -143,8 +147,9 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) return NULL; } -static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, - int max_len, char *astr, int *len_p) +static int hfsplus_uni2asc(struct super_block *sb, + const struct hfsplus_unistr *ustr, + int max_len, char *astr, int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls = HFSPLUS_SB(sb)->nls; @@ -285,6 +290,7 @@ inline int hfsplus_uni2asc_str(struct super_block *sb, { return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p); } +EXPORT_SYMBOL_IF_KUNIT(hfsplus_uni2asc_str); inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, const struct hfsplus_attr_unistr *ustr, @@ -293,6 +299,7 @@ inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr, HFSPLUS_ATTR_MAX_STRLEN, astr, len_p); } +EXPORT_SYMBOL_IF_KUNIT(hfsplus_uni2asc_xattr_str); /* * Convert one or more ASCII characters into a single unicode character. 
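The EXPORT_SYMBOL_IF_KUNIT() markers added throughout this file make these helpers linkable from the separately built KUnit test module without exporting them in production kernels. A rough sketch of the pattern, paraphrasing <kunit/visibility.h> rather than quoting it (the exact expansion may differ across kernel versions):

#if IS_ENABLED(CONFIG_KUNIT)
#define EXPORT_SYMBOL_IF_KUNIT(symbol) \
	EXPORT_SYMBOL_NS(symbol, "EXPORTED_FOR_KUNIT_TESTING")
#else
#define EXPORT_SYMBOL_IF_KUNIT(symbol)
#endif

With CONFIG_KUNIT disabled the macro expands to nothing, so the symbols stay internal; the test module below gains access by declaring MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING").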
@@ -420,6 +427,7 @@ int hfsplus_asc2uni(struct super_block *sb, return -ENAMETOOLONG; return 0; } +EXPORT_SYMBOL_IF_KUNIT(hfsplus_asc2uni); /* * Hash a string to an integer as appropriate for the HFS+ filesystem. @@ -472,6 +480,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) return 0; } +EXPORT_SYMBOL_IF_KUNIT(hfsplus_hash_dentry); /* * Compare strings with HFS+ filename ordering. @@ -563,3 +572,4 @@ int hfsplus_compare_dentry(const struct dentry *dentry, return 1; return 0; } +EXPORT_SYMBOL_IF_KUNIT(hfsplus_compare_dentry); diff --git a/fs/hfsplus/unicode_test.c b/fs/hfsplus/unicode_test.c new file mode 100644 index 000000000000..5a7a6859efe3 --- /dev/null +++ b/fs/hfsplus/unicode_test.c @@ -0,0 +1,1579 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for HFS+ Unicode string operations + * + * Copyright (C) 2025 Viacheslav Dubeyko <slava@dubeyko.com> + */ + +#include <kunit/test.h> +#include <linux/nls.h> +#include <linux/dcache.h> +#include <linux/stringhash.h> +#include "hfsplus_fs.h" + +struct test_mock_string_env { + struct hfsplus_unistr str1; + struct hfsplus_unistr str2; + char *buf; + u32 buf_size; +}; + +static struct test_mock_string_env *setup_mock_str_env(u32 buf_size) +{ + struct test_mock_string_env *env; + + env = kzalloc(sizeof(struct test_mock_string_env), GFP_KERNEL); + if (!env) + return NULL; + + env->buf = kzalloc(buf_size, GFP_KERNEL); + if (!env->buf) { + kfree(env); + return NULL; + } + + env->buf_size = buf_size; + + return env; +} + +static void free_mock_str_env(struct test_mock_string_env *env) +{ + if (env->buf) + kfree(env->buf); + kfree(env); +} + +/* Helper function to create hfsplus_unistr */ +static void create_unistr(struct hfsplus_unistr *ustr, const char *ascii_str) +{ + int len = strlen(ascii_str); + int i; + + memset(ustr->unicode, 0, sizeof(ustr->unicode)); + + ustr->length = cpu_to_be16(len); + for (i = 0; i < len && i < HFSPLUS_MAX_STRLEN; i++) + ustr->unicode[i] = cpu_to_be16((u16)ascii_str[i]); +} + +static void corrupt_unistr(struct hfsplus_unistr *ustr) +{ + ustr->length = cpu_to_be16(U16_MAX); +} + +/* Test hfsplus_strcasecmp function */ +static void hfsplus_strcasecmp_test(struct kunit *test) +{ + struct test_mock_string_env *mock_env; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + /* Test identical strings */ + create_unistr(&mock_env->str1, "hello"); + create_unistr(&mock_env->str2, "hello"); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + /* Test case insensitive comparison */ + create_unistr(&mock_env->str1, "Hello"); + create_unistr(&mock_env->str2, "hello"); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, "HELLO"); + create_unistr(&mock_env->str2, "hello"); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + /* Test different strings */ + create_unistr(&mock_env->str1, "apple"); + create_unistr(&mock_env->str2, "banana"); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "zebra"); + create_unistr(&mock_env->str2, "apple"); + KUNIT_EXPECT_GT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test different lengths */ + create_unistr(&mock_env->str1, "test"); + create_unistr(&mock_env->str2, "testing"); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + 
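+ /* Swapping the operands must flip the sign of the result. */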
create_unistr(&mock_env->str1, "testing"); + create_unistr(&mock_env->str2, "test"); + KUNIT_EXPECT_GT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test empty strings */ + create_unistr(&mock_env->str1, ""); + create_unistr(&mock_env->str2, ""); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, ""); + create_unistr(&mock_env->str2, "test"); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test single characters */ + create_unistr(&mock_env->str1, "A"); + create_unistr(&mock_env->str2, "a"); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, "A"); + create_unistr(&mock_env->str2, "B"); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test maximum length strings */ + memset(mock_env->buf, 'a', HFSPLUS_MAX_STRLEN); + mock_env->buf[HFSPLUS_MAX_STRLEN] = '\0'; + create_unistr(&mock_env->str1, mock_env->buf); + create_unistr(&mock_env->str2, mock_env->buf); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + /* Change one character in the middle */ + mock_env->buf[HFSPLUS_MAX_STRLEN / 2] = 'b'; + create_unistr(&mock_env->str2, mock_env->buf); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test corrupted strings */ + create_unistr(&mock_env->str1, ""); + corrupt_unistr(&mock_env->str1); + create_unistr(&mock_env->str2, ""); + KUNIT_EXPECT_NE(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, ""); + create_unistr(&mock_env->str2, ""); + corrupt_unistr(&mock_env->str2); + KUNIT_EXPECT_NE(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, "test"); + corrupt_unistr(&mock_env->str1); + create_unistr(&mock_env->str2, "testing"); + KUNIT_EXPECT_GT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "test"); + create_unistr(&mock_env->str2, "testing"); + corrupt_unistr(&mock_env->str2); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "testing"); + corrupt_unistr(&mock_env->str1); + create_unistr(&mock_env->str2, "test"); + KUNIT_EXPECT_GT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "testing"); + create_unistr(&mock_env->str2, "test"); + corrupt_unistr(&mock_env->str2); + KUNIT_EXPECT_LT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + free_mock_str_env(mock_env); +} + +/* Test hfsplus_strcmp function (case-sensitive) */ +static void hfsplus_strcmp_test(struct kunit *test) +{ + struct test_mock_string_env *mock_env; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + /* Test identical strings */ + create_unistr(&mock_env->str1, "hello"); + create_unistr(&mock_env->str2, "hello"); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + /* Test case sensitive comparison - should NOT be equal */ + create_unistr(&mock_env->str1, "Hello"); + create_unistr(&mock_env->str2, "hello"); + KUNIT_EXPECT_NE(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + /* 'H' < 'h' in Unicode */ + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test lexicographic ordering */ + 
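+ /* Ordering is by raw 16-bit code unit value, with string length as the tiebreaker. */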
create_unistr(&mock_env->str1, "apple"); + create_unistr(&mock_env->str2, "banana"); + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "zebra"); + create_unistr(&mock_env->str2, "apple"); + KUNIT_EXPECT_GT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test different lengths with common prefix */ + create_unistr(&mock_env->str1, "test"); + create_unistr(&mock_env->str2, "testing"); + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "testing"); + create_unistr(&mock_env->str2, "test"); + KUNIT_EXPECT_GT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test empty strings */ + create_unistr(&mock_env->str1, ""); + create_unistr(&mock_env->str2, ""); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + /* Test maximum length strings */ + memset(mock_env->buf, 'a', HFSPLUS_MAX_STRLEN); + mock_env->buf[HFSPLUS_MAX_STRLEN] = '\0'; + create_unistr(&mock_env->str1, mock_env->buf); + create_unistr(&mock_env->str2, mock_env->buf); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + /* Change one character in the middle */ + mock_env->buf[HFSPLUS_MAX_STRLEN / 2] = 'b'; + create_unistr(&mock_env->str2, mock_env->buf); + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test corrupted strings */ + create_unistr(&mock_env->str1, ""); + corrupt_unistr(&mock_env->str1); + create_unistr(&mock_env->str2, ""); + KUNIT_EXPECT_NE(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, ""); + create_unistr(&mock_env->str2, ""); + corrupt_unistr(&mock_env->str2); + KUNIT_EXPECT_NE(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + create_unistr(&mock_env->str1, "test"); + corrupt_unistr(&mock_env->str1); + create_unistr(&mock_env->str2, "testing"); + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "test"); + create_unistr(&mock_env->str2, "testing"); + corrupt_unistr(&mock_env->str2); + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "testing"); + corrupt_unistr(&mock_env->str1); + create_unistr(&mock_env->str2, "test"); + KUNIT_EXPECT_GT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + create_unistr(&mock_env->str1, "testing"); + create_unistr(&mock_env->str2, "test"); + corrupt_unistr(&mock_env->str2); + KUNIT_EXPECT_GT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + free_mock_str_env(mock_env); +} + +/* Test Unicode edge cases */ +static void hfsplus_unicode_edge_cases_test(struct kunit *test) +{ + struct test_mock_string_env *mock_env; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + /* Test with special characters */ + mock_env->str1.length = cpu_to_be16(3); + mock_env->str1.unicode[0] = cpu_to_be16(0x00E9); /* é */ + mock_env->str1.unicode[1] = cpu_to_be16(0x00F1); /* ñ */ + mock_env->str1.unicode[2] = cpu_to_be16(0x00FC); /* ü */ + + mock_env->str2.length = cpu_to_be16(3); + mock_env->str2.unicode[0] = cpu_to_be16(0x00E9); /* é */ + mock_env->str2.unicode[1] = cpu_to_be16(0x00F1); /* ñ */ + mock_env->str2.unicode[2] = cpu_to_be16(0x00FC); /* ü */ + + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + KUNIT_EXPECT_EQ(test, 0, 
hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + /* Test with different special characters */ + mock_env->str2.unicode[1] = cpu_to_be16(0x00F2); /* ò */ + KUNIT_EXPECT_NE(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + /* Test null characters within string (should be handled correctly) */ + mock_env->str1.length = cpu_to_be16(3); + mock_env->str1.unicode[0] = cpu_to_be16('a'); + mock_env->str1.unicode[1] = cpu_to_be16(0x0000); /* null */ + mock_env->str1.unicode[2] = cpu_to_be16('b'); + + mock_env->str2.length = cpu_to_be16(3); + mock_env->str2.unicode[0] = cpu_to_be16('a'); + mock_env->str2.unicode[1] = cpu_to_be16(0x0000); /* null */ + mock_env->str2.unicode[2] = cpu_to_be16('b'); + + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + free_mock_str_env(mock_env); +} + +/* Test boundary conditions */ +static void hfsplus_unicode_boundary_test(struct kunit *test) +{ + struct test_mock_string_env *mock_env; + int i; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + /* Test maximum length boundary */ + mock_env->str1.length = cpu_to_be16(HFSPLUS_MAX_STRLEN); + mock_env->str2.length = cpu_to_be16(HFSPLUS_MAX_STRLEN); + + for (i = 0; i < HFSPLUS_MAX_STRLEN; i++) { + mock_env->str1.unicode[i] = cpu_to_be16('A'); + mock_env->str2.unicode[i] = cpu_to_be16('A'); + } + + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + + /* Change last character */ + mock_env->str2.unicode[HFSPLUS_MAX_STRLEN - 1] = cpu_to_be16('B'); + KUNIT_EXPECT_LT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + + /* Test zero length strings */ + mock_env->str1.length = cpu_to_be16(0); + mock_env->str2.length = cpu_to_be16(0); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2)); + KUNIT_EXPECT_EQ(test, 0, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2)); + + /* Test one character vs empty */ + mock_env->str1.length = cpu_to_be16(1); + mock_env->str1.unicode[0] = cpu_to_be16('A'); + mock_env->str2.length = cpu_to_be16(0); + KUNIT_EXPECT_GT(test, hfsplus_strcmp(&mock_env->str1, + &mock_env->str2), 0); + KUNIT_EXPECT_GT(test, hfsplus_strcasecmp(&mock_env->str1, + &mock_env->str2), 0); + + free_mock_str_env(mock_env); +} + +/* Mock superblock and NLS table for testing hfsplus_uni2asc */ +struct test_mock_sb { + struct nls_table nls; + struct hfsplus_sb_info sb_info; + struct super_block sb; +}; + +static struct test_mock_sb *setup_mock_sb(void) +{ + struct test_mock_sb *ptr; + + ptr = kzalloc(sizeof(struct test_mock_sb), GFP_KERNEL); + if (!ptr) + return NULL; + + ptr->nls.charset = "utf8"; + ptr->nls.uni2char = NULL; /* Will use default behavior */ + ptr->sb_info.nls = &ptr->nls; + ptr->sb.s_fs_info = &ptr->sb_info; + + /* Set default flags - no decomposition, no case folding */ + clear_bit(HFSPLUS_SB_NODECOMPOSE, &ptr->sb_info.flags); + clear_bit(HFSPLUS_SB_CASEFOLD, &ptr->sb_info.flags); + + return ptr; +} + +static void free_mock_sb(struct test_mock_sb *ptr) +{ + kfree(ptr); +} + +/* Simple uni2char implementation for testing */ +static int test_uni2char(wchar_t uni, unsigned char *out, int boundlen) +{ + if (boundlen <= 0) + return -ENAMETOOLONG; + + if (uni < 0x80) { + *out = (unsigned char)uni; + return 1; + } + + /* For non-ASCII, just use '?' 
as fallback */ + *out = '?'; + return 1; +} + +/* Test hfsplus_uni2asc basic functionality */ +static void hfsplus_uni2asc_basic_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int len, result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.uni2char = test_uni2char; + + /* Test simple ASCII string conversion */ + create_unistr(&mock_env->str1, "hello"); + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 5, len); + KUNIT_EXPECT_STREQ(test, "hello", mock_env->buf); + + /* Test empty string */ + create_unistr(&mock_env->str1, ""); + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 0, len); + + /* Test single character */ + create_unistr(&mock_env->str1, "A"); + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 1, len); + KUNIT_EXPECT_EQ(test, 'A', mock_env->buf[0]); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test special character handling */ +static void hfsplus_uni2asc_special_chars_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int len, result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.uni2char = test_uni2char; + + /* Test null character conversion (should become 0x2400) */ + mock_env->str1.length = cpu_to_be16(1); + mock_env->str1.unicode[0] = cpu_to_be16(0x0000); + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 1, len); + /* Our test implementation returns '?' 
for non-ASCII */ + KUNIT_EXPECT_EQ(test, '?', mock_env->buf[0]); + + /* Test forward slash conversion (should become colon) */ + mock_env->str1.length = cpu_to_be16(1); + mock_env->str1.unicode[0] = cpu_to_be16('/'); + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 1, len); + KUNIT_EXPECT_EQ(test, ':', mock_env->buf[0]); + + /* Test string with mixed special characters */ + mock_env->str1.length = cpu_to_be16(3); + mock_env->str1.unicode[0] = cpu_to_be16('a'); + mock_env->str1.unicode[1] = cpu_to_be16('/'); + mock_env->str1.unicode[2] = cpu_to_be16('b'); + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 3, len); + KUNIT_EXPECT_EQ(test, 'a', mock_env->buf[0]); + KUNIT_EXPECT_EQ(test, ':', mock_env->buf[1]); + KUNIT_EXPECT_EQ(test, 'b', mock_env->buf[2]); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test buffer length handling */ +static void hfsplus_uni2asc_buffer_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int len, result; + + mock_env = setup_mock_str_env(10); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.uni2char = test_uni2char; + + /* Test insufficient buffer space */ + create_unistr(&mock_env->str1, "toolongstring"); + len = 5; /* Buffer too small */ + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, -ENAMETOOLONG, result); + KUNIT_EXPECT_EQ(test, 5, len); /* Should be set to consumed length */ + + /* Test exact buffer size */ + create_unistr(&mock_env->str1, "exact"); + len = 5; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 5, len); + + /* Test zero length buffer */ + create_unistr(&mock_env->str1, "test"); + len = 0; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, -ENAMETOOLONG, result); + KUNIT_EXPECT_EQ(test, 0, len); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test corrupted unicode string handling */ +static void hfsplus_uni2asc_corrupted_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int len, result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.uni2char = test_uni2char; + + /* Test corrupted length (too large) */ + create_unistr(&mock_env->str1, "test"); + corrupt_unistr(&mock_env->str1); /* Sets length to U16_MAX */ + len = mock_env->buf_size; + + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + /* Should still work but with corrected length */ + KUNIT_EXPECT_EQ(test, 0, result); + /* + * Length should be corrected to HFSPLUS_MAX_STRLEN + * and processed accordingly + */ + KUNIT_EXPECT_GT(test, len, 0); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test edge cases and boundary conditions */ +static void hfsplus_uni2asc_edge_cases_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int len, result; + int i; + + mock_env 
= setup_mock_str_env(HFSPLUS_MAX_STRLEN * 2); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.uni2char = test_uni2char; + + /* Test maximum length string */ + mock_env->str1.length = cpu_to_be16(HFSPLUS_MAX_STRLEN); + for (i = 0; i < HFSPLUS_MAX_STRLEN; i++) + mock_env->str1.unicode[i] = cpu_to_be16('a'); + + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, HFSPLUS_MAX_STRLEN, len); + + /* Verify all characters are 'a' */ + for (i = 0; i < HFSPLUS_MAX_STRLEN; i++) + KUNIT_EXPECT_EQ(test, 'a', mock_env->buf[i]); + + /* Test string with high Unicode values (non-ASCII) */ + mock_env->str1.length = cpu_to_be16(3); + mock_env->str1.unicode[0] = cpu_to_be16(0x00E9); /* é */ + mock_env->str1.unicode[1] = cpu_to_be16(0x00F1); /* ñ */ + mock_env->str1.unicode[2] = cpu_to_be16(0x00FC); /* ü */ + len = mock_env->buf_size; + result = hfsplus_uni2asc_str(&mock_sb->sb, &mock_env->str1, + mock_env->buf, &len); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 3, len); + /* Our test implementation converts non-ASCII to '?' */ + KUNIT_EXPECT_EQ(test, '?', mock_env->buf[0]); + KUNIT_EXPECT_EQ(test, '?', mock_env->buf[1]); + KUNIT_EXPECT_EQ(test, '?', mock_env->buf[2]); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Simple char2uni implementation for testing */ +static int test_char2uni(const unsigned char *rawstring, + int boundlen, wchar_t *uni) +{ + if (boundlen <= 0) + return -EINVAL; + + *uni = (wchar_t)*rawstring; + return 1; +} + +/* Helper function to check unicode string contents */ +static void check_unistr_content(struct kunit *test, + struct hfsplus_unistr *ustr, + const char *expected_ascii) +{ + int expected_len = strlen(expected_ascii); + int actual_len = be16_to_cpu(ustr->length); + int i; + + KUNIT_EXPECT_EQ(test, expected_len, actual_len); + + for (i = 0; i < expected_len && i < actual_len; i++) { + u16 expected_char = (u16)expected_ascii[i]; + u16 actual_char = be16_to_cpu(ustr->unicode[i]); + + KUNIT_EXPECT_EQ(test, expected_char, actual_char); + } +} + +/* Test hfsplus_asc2uni basic functionality */ +static void hfsplus_asc2uni_basic_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.char2uni = test_char2uni; + + /* Test simple ASCII string conversion */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, "hello", 5); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &mock_env->str1, "hello"); + + /* Test empty string */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, "", 0); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 0, be16_to_cpu(mock_env->str1.length)); + + /* Test single character */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, "A", 1); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &mock_env->str1, "A"); + + /* Test null-terminated string with explicit length */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, "test\0extra", 4); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &mock_env->str1, "test"); + 
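+ /* The explicit length argument bounds the conversion: with len 4, only "test" is consumed and the embedded NUL in "test\0extra" is never reached. */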
+ free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test special character handling in asc2uni */ +static void hfsplus_asc2uni_special_chars_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.char2uni = test_char2uni; + + /* Test colon conversion (should become forward slash) */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, ":", 1); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 1, be16_to_cpu(mock_env->str1.length)); + KUNIT_EXPECT_EQ(test, '/', be16_to_cpu(mock_env->str1.unicode[0])); + + /* Test string with mixed special characters */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, "a:b", 3); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 3, be16_to_cpu(mock_env->str1.length)); + KUNIT_EXPECT_EQ(test, 'a', be16_to_cpu(mock_env->str1.unicode[0])); + KUNIT_EXPECT_EQ(test, '/', be16_to_cpu(mock_env->str1.unicode[1])); + KUNIT_EXPECT_EQ(test, 'b', be16_to_cpu(mock_env->str1.unicode[2])); + + /* Test multiple special characters */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, ":::", 3); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 3, be16_to_cpu(mock_env->str1.length)); + KUNIT_EXPECT_EQ(test, '/', be16_to_cpu(mock_env->str1.unicode[0])); + KUNIT_EXPECT_EQ(test, '/', be16_to_cpu(mock_env->str1.unicode[1])); + KUNIT_EXPECT_EQ(test, '/', be16_to_cpu(mock_env->str1.unicode[2])); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test buffer length limits */ +static void hfsplus_asc2uni_buffer_limits_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 10); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.char2uni = test_char2uni; + + /* Test exact maximum length */ + memset(mock_env->buf, 'a', HFSPLUS_MAX_STRLEN); + result = hfsplus_asc2uni(&mock_sb->sb, + &mock_env->str1, HFSPLUS_MAX_STRLEN, + mock_env->buf, HFSPLUS_MAX_STRLEN); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, HFSPLUS_MAX_STRLEN, + be16_to_cpu(mock_env->str1.length)); + + /* Test exceeding maximum length */ + memset(mock_env->buf, 'a', HFSPLUS_MAX_STRLEN + 5); + result = hfsplus_asc2uni(&mock_sb->sb, + &mock_env->str1, HFSPLUS_MAX_STRLEN, + mock_env->buf, HFSPLUS_MAX_STRLEN + 5); + + KUNIT_EXPECT_EQ(test, -ENAMETOOLONG, result); + KUNIT_EXPECT_EQ(test, HFSPLUS_MAX_STRLEN, + be16_to_cpu(mock_env->str1.length)); + + /* Test with smaller max_unistr_len */ + result = hfsplus_asc2uni(&mock_sb->sb, + &mock_env->str1, 5, "toolongstring", 13); + + KUNIT_EXPECT_EQ(test, -ENAMETOOLONG, result); + KUNIT_EXPECT_EQ(test, 5, be16_to_cpu(mock_env->str1.length)); + + /* Test zero max length */ + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, 0, "test", 4); + + KUNIT_EXPECT_EQ(test, -ENAMETOOLONG, result); + KUNIT_EXPECT_EQ(test, 0, be16_to_cpu(mock_env->str1.length)); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test error handling and edge cases */ +static void hfsplus_asc2uni_edge_cases_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct hfsplus_unistr ustr; + char 
test_str[] = {'a', '\0', 'b'}; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.char2uni = test_char2uni; + + /* Test zero length input */ + result = hfsplus_asc2uni(&mock_sb->sb, + &ustr, HFSPLUS_MAX_STRLEN, "test", 0); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 0, be16_to_cpu(ustr.length)); + + /* Test input with length mismatch */ + result = hfsplus_asc2uni(&mock_sb->sb, + &ustr, HFSPLUS_MAX_STRLEN, "hello", 3); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &ustr, "hel"); + + /* Test with various printable ASCII characters */ + result = hfsplus_asc2uni(&mock_sb->sb, + &ustr, HFSPLUS_MAX_STRLEN, "ABC123!@#", 9); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &ustr, "ABC123!@#"); + + /* Test null character in the middle */ + result = hfsplus_asc2uni(&mock_sb->sb, + &ustr, HFSPLUS_MAX_STRLEN, test_str, 3); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, 3, be16_to_cpu(ustr.length)); + KUNIT_EXPECT_EQ(test, 'a', be16_to_cpu(ustr.unicode[0])); + KUNIT_EXPECT_EQ(test, 0, be16_to_cpu(ustr.unicode[1])); + KUNIT_EXPECT_EQ(test, 'b', be16_to_cpu(ustr.unicode[2])); + + free_mock_sb(mock_sb); +} + +/* Test decomposition flag behavior */ +static void hfsplus_asc2uni_decompose_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + int result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + mock_sb->nls.char2uni = test_char2uni; + + /* Test with decomposition disabled (default) */ + clear_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str1, + HFSPLUS_MAX_STRLEN, "test", 4); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &mock_env->str1, "test"); + + /* Test with decomposition enabled */ + set_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + result = hfsplus_asc2uni(&mock_sb->sb, &mock_env->str2, + HFSPLUS_MAX_STRLEN, "test", 4); + + KUNIT_EXPECT_EQ(test, 0, result); + check_unistr_content(test, &mock_env->str2, "test"); + + /* For simple ASCII, both should produce the same result */ + KUNIT_EXPECT_EQ(test, + be16_to_cpu(mock_env->str1.length), + be16_to_cpu(mock_env->str2.length)); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Mock dentry for testing hfsplus_hash_dentry */ +static struct dentry test_dentry; + +static void setup_mock_dentry(struct super_block *sb) +{ + memset(&test_dentry, 0, sizeof(test_dentry)); + test_dentry.d_sb = sb; +} + +/* Helper function to create qstr */ +static void create_qstr(struct qstr *str, const char *name) +{ + str->name = name; + str->len = strlen(name); + str->hash = 0; /* Will be set by hash function */ +} + +/* Test hfsplus_hash_dentry basic functionality */ +static void hfsplus_hash_dentry_basic_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr str1, str2; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test basic string hashing */ + create_qstr(&str1, "hello"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_NE(test, 0, str1.hash); + + /* Test that identical strings produce identical hashes */ + create_qstr(&str2, "hello"); + result = 
hfsplus_hash_dentry(&test_dentry, &str2); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, str1.hash, str2.hash); + + /* Test empty string */ + create_qstr(&str1, ""); + result = hfsplus_hash_dentry(&test_dentry, &str1); + + /* Empty string should still produce a hash */ + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test single character */ + create_qstr(&str1, "A"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_NE(test, 0, str1.hash); + + free_mock_sb(mock_sb); +} + +/* Test case folding behavior in hash */ +static void hfsplus_hash_dentry_casefold_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr str1, str2; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test with case folding disabled (default) */ + clear_bit(HFSPLUS_SB_CASEFOLD, &mock_sb->sb_info.flags); + + create_qstr(&str1, "Hello"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&str2, "hello"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + /* + * Without case folding, different cases + * should produce different hashes + */ + KUNIT_EXPECT_NE(test, str1.hash, str2.hash); + + /* Test with case folding enabled */ + set_bit(HFSPLUS_SB_CASEFOLD, &mock_sb->sb_info.flags); + + create_qstr(&str1, "Hello"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&str2, "hello"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + /* With case folding, different cases should produce same hash */ + KUNIT_EXPECT_EQ(test, str1.hash, str2.hash); + + /* Test mixed case */ + create_qstr(&str1, "HeLLo"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_EQ(test, str1.hash, str2.hash); + + free_mock_sb(mock_sb); +} + +/* Test special character handling in hash */ +static void hfsplus_hash_dentry_special_chars_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr str1, str2; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test colon conversion (: becomes /) */ + create_qstr(&str1, "file:name"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&str2, "file/name"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + /* After conversion, these should produce the same hash */ + KUNIT_EXPECT_EQ(test, str1.hash, str2.hash); + + /* Test multiple special characters */ + create_qstr(&str1, ":::"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&str2, "///"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + KUNIT_EXPECT_EQ(test, str1.hash, str2.hash); + + free_mock_sb(mock_sb); +} + +/* Test decomposition flag behavior in hash */ +static void hfsplus_hash_dentry_decompose_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr str1, str2; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test with decomposition disabled (default) */ 
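+ /* With HFSPLUS_SB_NODECOMPOSE clear, names are decomposed before hashing; setting the bit skips that step. */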
+ clear_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + + create_qstr(&str1, "test"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test with decomposition enabled */ + set_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + + create_qstr(&str2, "test"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + /* + * For simple ASCII, decomposition shouldn't change + * the hash much but the function should still work correctly + */ + KUNIT_EXPECT_NE(test, 0, str2.hash); + + free_mock_sb(mock_sb); +} + +/* Test hash consistency and distribution */ +static void hfsplus_hash_dentry_consistency_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr str1, str2, str3; + unsigned long hash1; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test that same string always produces same hash */ + create_qstr(&str1, "consistent"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + hash1 = str1.hash; + + create_qstr(&str2, "consistent"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + KUNIT_EXPECT_EQ(test, hash1, str2.hash); + + /* Test that different strings produce different hashes */ + create_qstr(&str3, "different"); + result = hfsplus_hash_dentry(&test_dentry, &str3); + KUNIT_EXPECT_EQ(test, 0, result); + + KUNIT_EXPECT_NE(test, str1.hash, str3.hash); + + /* Test similar strings should have different hashes */ + create_qstr(&str1, "file1"); + result = hfsplus_hash_dentry(&test_dentry, &str1); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&str2, "file2"); + result = hfsplus_hash_dentry(&test_dentry, &str2); + KUNIT_EXPECT_EQ(test, 0, result); + + KUNIT_EXPECT_NE(test, str1.hash, str2.hash); + + free_mock_sb(mock_sb); +} + +/* Test edge cases and boundary conditions */ +static void hfsplus_hash_dentry_edge_cases_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct test_mock_string_env *mock_env; + struct qstr str; + int result; + + mock_env = setup_mock_str_env(HFSPLUS_MAX_STRLEN + 1); + KUNIT_ASSERT_NOT_NULL(test, mock_env); + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test very long filename */ + memset(mock_env->buf, 'a', mock_env->buf_size - 1); + mock_env->buf[mock_env->buf_size - 1] = '\0'; + + create_qstr(&str, mock_env->buf); + result = hfsplus_hash_dentry(&test_dentry, &str); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_NE(test, 0, str.hash); + + /* Test filename with all printable ASCII characters */ + create_qstr(&str, "!@#$%^&*()_+-=[]{}|;':\",./<>?"); + result = hfsplus_hash_dentry(&test_dentry, &str); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_NE(test, 0, str.hash); + + /* Test with embedded null (though not typical for filenames) */ + str.name = "file\0hidden"; + str.len = 11; /* Include the null and text after it */ + str.hash = 0; + result = hfsplus_hash_dentry(&test_dentry, &str); + + KUNIT_EXPECT_EQ(test, 0, result); + KUNIT_EXPECT_NE(test, 0, str.hash); + + free_mock_str_env(mock_env); + free_mock_sb(mock_sb); +} + +/* Test hfsplus_compare_dentry basic functionality */ +static void hfsplus_compare_dentry_basic_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + int result; + + mock_sb = 
setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test identical strings */ + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "hello", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test different strings - lexicographic order */ + create_qstr(&name, "world"); + result = hfsplus_compare_dentry(&test_dentry, 5, "hello", &name); + KUNIT_EXPECT_LT(test, result, 0); /* "hello" < "world" */ + + result = hfsplus_compare_dentry(&test_dentry, 5, "world", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "world", &name); + KUNIT_EXPECT_GT(test, result, 0); /* "world" > "hello" */ + + /* Test empty strings */ + create_qstr(&name, ""); + result = hfsplus_compare_dentry(&test_dentry, 0, "", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test one empty, one non-empty */ + create_qstr(&name, "test"); + result = hfsplus_compare_dentry(&test_dentry, 0, "", &name); + KUNIT_EXPECT_LT(test, result, 0); /* "" < "test" */ + + create_qstr(&name, ""); + result = hfsplus_compare_dentry(&test_dentry, 4, "test", &name); + KUNIT_EXPECT_GT(test, result, 0); /* "test" > "" */ + + free_mock_sb(mock_sb); +} + +/* Test case folding behavior in comparison */ +static void hfsplus_compare_dentry_casefold_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test with case folding disabled (default) */ + clear_bit(HFSPLUS_SB_CASEFOLD, &mock_sb->sb_info.flags); + + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "Hello", &name); + /* Case sensitive: "Hello" != "hello" */ + KUNIT_EXPECT_NE(test, 0, result); + + create_qstr(&name, "Hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "hello", &name); + /* Case sensitive: "hello" != "Hello" */ + KUNIT_EXPECT_NE(test, 0, result); + + /* Test with case folding enabled */ + set_bit(HFSPLUS_SB_CASEFOLD, &mock_sb->sb_info.flags); + + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "Hello", &name); + /* Case insensitive: "Hello" == "hello" */ + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&name, "Hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "hello", &name); + /* Case insensitive: "hello" == "Hello" */ + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test mixed case */ + create_qstr(&name, "TeSt"); + result = hfsplus_compare_dentry(&test_dentry, 4, "test", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&name, "test"); + result = hfsplus_compare_dentry(&test_dentry, 4, "TEST", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + free_mock_sb(mock_sb); +} + +/* Test special character handling in comparison */ +static void hfsplus_compare_dentry_special_chars_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test colon conversion (: becomes /) */ + create_qstr(&name, "file/name"); + result = hfsplus_compare_dentry(&test_dentry, 9, "file:name", &name); + /* "file:name" == "file/name" after conversion */ + KUNIT_EXPECT_EQ(test, 0, result); + + create_qstr(&name, "file:name"); + result = 
hfsplus_compare_dentry(&test_dentry, 9, "file/name", &name); + /* "file/name" == "file:name" after conversion */ + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test multiple special characters */ + create_qstr(&name, "///"); + result = hfsplus_compare_dentry(&test_dentry, 3, ":::", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test mixed special and regular characters */ + create_qstr(&name, "a/b:c"); + result = hfsplus_compare_dentry(&test_dentry, 5, "a:b/c", &name); + /* Both become "a/b/c" after conversion */ + KUNIT_EXPECT_EQ(test, 0, result); + + free_mock_sb(mock_sb); +} + +/* Test length differences */ +static void hfsplus_compare_dentry_length_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test different lengths with common prefix */ + create_qstr(&name, "testing"); + result = hfsplus_compare_dentry(&test_dentry, 4, "test", &name); + KUNIT_EXPECT_LT(test, result, 0); /* "test" < "testing" */ + + create_qstr(&name, "test"); + result = hfsplus_compare_dentry(&test_dentry, 7, "testing", &name); + KUNIT_EXPECT_GT(test, result, 0); /* "testing" > "test" */ + + /* Test exact length match */ + create_qstr(&name, "exact"); + result = hfsplus_compare_dentry(&test_dentry, 5, "exact", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test length parameter vs actual string content */ + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 3, "hel", &name); + KUNIT_EXPECT_LT(test, result, 0); /* "hel" < "hello" */ + + /* Test longer first string but shorter length parameter */ + create_qstr(&name, "hi"); + result = hfsplus_compare_dentry(&test_dentry, 2, "hello", &name); + /* "he" < "hi" (only first 2 chars compared) */ + KUNIT_EXPECT_LT(test, result, 0); + + free_mock_sb(mock_sb); +} + +/* Test decomposition flag behavior */ +static void hfsplus_compare_dentry_decompose_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test with decomposition disabled (default) */ + clear_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + + create_qstr(&name, "test"); + result = hfsplus_compare_dentry(&test_dentry, 4, "test", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test with decomposition enabled */ + set_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + + create_qstr(&name, "test"); + result = hfsplus_compare_dentry(&test_dentry, 4, "test", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* For simple ASCII, decomposition shouldn't affect the result */ + create_qstr(&name, "different"); + result = hfsplus_compare_dentry(&test_dentry, 4, "test", &name); + KUNIT_EXPECT_NE(test, 0, result); + + free_mock_sb(mock_sb); +} + +/* Test edge cases and boundary conditions */ +static void hfsplus_compare_dentry_edge_cases_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + char *long_str; + char *long_str2; + u32 str_size = HFSPLUS_MAX_STRLEN + 1; + struct qstr null_name = { + .name = "a\0b", + .len = 3, + .hash = 0 + }; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + long_str = kzalloc(str_size, GFP_KERNEL); + 
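+ /* str_size is HFSPLUS_MAX_STRLEN + 1, leaving room for a terminating NUL. */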
KUNIT_ASSERT_NOT_NULL(test, long_str); + + long_str2 = kzalloc(str_size, GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, long_str2); + + /* Test very long strings */ + memset(long_str, 'a', str_size - 1); + long_str[str_size - 1] = '\0'; + + create_qstr(&name, long_str); + result = hfsplus_compare_dentry(&test_dentry, str_size - 1, + long_str, &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test with difference at the end of long strings */ + memset(long_str2, 'a', str_size - 1); + long_str2[str_size - 1] = '\0'; + long_str2[str_size - 2] = 'b'; + create_qstr(&name, long_str2); + result = hfsplus_compare_dentry(&test_dentry, str_size - 1, + long_str, &name); + KUNIT_EXPECT_LT(test, result, 0); /* 'a' < 'b' */ + + /* Test single character differences */ + create_qstr(&name, "b"); + result = hfsplus_compare_dentry(&test_dentry, 1, "a", &name); + KUNIT_EXPECT_LT(test, result, 0); /* 'a' < 'b' */ + + create_qstr(&name, "a"); + result = hfsplus_compare_dentry(&test_dentry, 1, "b", &name); + KUNIT_EXPECT_GT(test, result, 0); /* 'b' > 'a' */ + + /* Test with null characters in the middle */ + result = hfsplus_compare_dentry(&test_dentry, 3, "a\0b", &null_name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test all printable ASCII characters */ + create_qstr(&name, "!@#$%^&*()"); + result = hfsplus_compare_dentry(&test_dentry, 10, "!@#$%^&*()", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + kfree(long_str); + kfree(long_str2); + free_mock_sb(mock_sb); +} + +/* Test combined flag behaviors */ +static void hfsplus_compare_dentry_combined_flags_test(struct kunit *test) +{ + struct test_mock_sb *mock_sb; + struct qstr name; + int result; + + mock_sb = setup_mock_sb(); + KUNIT_ASSERT_NOT_NULL(test, mock_sb); + + setup_mock_dentry(&mock_sb->sb); + mock_sb->nls.char2uni = test_char2uni; + + /* Test with both casefold and decompose enabled */ + set_bit(HFSPLUS_SB_CASEFOLD, &mock_sb->sb_info.flags); + set_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "HELLO", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test special chars with case folding */ + create_qstr(&name, "File/Name"); + result = hfsplus_compare_dentry(&test_dentry, 9, "file:name", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + /* Test with both flags disabled */ + clear_bit(HFSPLUS_SB_CASEFOLD, &mock_sb->sb_info.flags); + clear_bit(HFSPLUS_SB_NODECOMPOSE, &mock_sb->sb_info.flags); + + create_qstr(&name, "hello"); + result = hfsplus_compare_dentry(&test_dentry, 5, "HELLO", &name); + KUNIT_EXPECT_NE(test, 0, result); /* Case sensitive */ + + /* But special chars should still be converted */ + create_qstr(&name, "file/name"); + result = hfsplus_compare_dentry(&test_dentry, 9, "file:name", &name); + KUNIT_EXPECT_EQ(test, 0, result); + + free_mock_sb(mock_sb); +} + +static struct kunit_case hfsplus_unicode_test_cases[] = { + KUNIT_CASE(hfsplus_strcasecmp_test), + KUNIT_CASE(hfsplus_strcmp_test), + KUNIT_CASE(hfsplus_unicode_edge_cases_test), + KUNIT_CASE(hfsplus_unicode_boundary_test), + KUNIT_CASE(hfsplus_uni2asc_basic_test), + KUNIT_CASE(hfsplus_uni2asc_special_chars_test), + KUNIT_CASE(hfsplus_uni2asc_buffer_test), + KUNIT_CASE(hfsplus_uni2asc_corrupted_test), + KUNIT_CASE(hfsplus_uni2asc_edge_cases_test), + KUNIT_CASE(hfsplus_asc2uni_basic_test), + KUNIT_CASE(hfsplus_asc2uni_special_chars_test), + KUNIT_CASE(hfsplus_asc2uni_buffer_limits_test), + KUNIT_CASE(hfsplus_asc2uni_edge_cases_test), + KUNIT_CASE(hfsplus_asc2uni_decompose_test), + 
KUNIT_CASE(hfsplus_hash_dentry_basic_test), + KUNIT_CASE(hfsplus_hash_dentry_casefold_test), + KUNIT_CASE(hfsplus_hash_dentry_special_chars_test), + KUNIT_CASE(hfsplus_hash_dentry_decompose_test), + KUNIT_CASE(hfsplus_hash_dentry_consistency_test), + KUNIT_CASE(hfsplus_hash_dentry_edge_cases_test), + KUNIT_CASE(hfsplus_compare_dentry_basic_test), + KUNIT_CASE(hfsplus_compare_dentry_casefold_test), + KUNIT_CASE(hfsplus_compare_dentry_special_chars_test), + KUNIT_CASE(hfsplus_compare_dentry_length_test), + KUNIT_CASE(hfsplus_compare_dentry_decompose_test), + KUNIT_CASE(hfsplus_compare_dentry_edge_cases_test), + KUNIT_CASE(hfsplus_compare_dentry_combined_flags_test), + {} +}; + +static struct kunit_suite hfsplus_unicode_test_suite = { + .name = "hfsplus_unicode", + .test_cases = hfsplus_unicode_test_cases, +}; + +kunit_test_suite(hfsplus_unicode_test_suite); + +MODULE_DESCRIPTION("KUnit tests for HFS+ Unicode string operations"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index ece4d29c0ab9..da95a9de9a65 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -265,10 +265,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, struct hfs_find_data cat_fd; hfsplus_cat_entry entry; u16 cat_entry_flags, cat_entry_type; - u16 folder_finderinfo_len = sizeof(struct DInfo) + - sizeof(struct DXInfo); - u16 file_finderinfo_len = sizeof(struct FInfo) + - sizeof(struct FXInfo); + u16 folder_finderinfo_len = sizeof(DInfo) + sizeof(DXInfo); + u16 file_finderinfo_len = sizeof(FInfo) + sizeof(FXInfo); if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || @@ -444,11 +442,11 @@ static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, ssize_t res = 0; struct hfs_find_data fd; u16 entry_type; - u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); - u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo); + u16 folder_rec_len = sizeof(DInfo) + sizeof(DXInfo); + u16 file_rec_len = sizeof(FInfo) + sizeof(FXInfo); u16 record_len = max(folder_rec_len, file_rec_len); - u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; - u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + u8 folder_finder_info[sizeof(DInfo) + sizeof(DXInfo)]; + u8 file_finder_info[sizeof(FInfo) + sizeof(FXInfo)]; if (size >= record_len) { res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); @@ -612,8 +610,8 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; - u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; - u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; + u8 folder_finder_info[sizeof(DInfo) + sizeof(DXInfo)]; + u8 file_finder_info[sizeof(FInfo) + sizeof(FXInfo)]; unsigned long len, found_bit; int xattr_name_len, symbols_count; @@ -629,14 +627,14 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); if (entry_type == HFSPLUS_FOLDER) { - len = sizeof(struct DInfo) + sizeof(struct DXInfo); + len = sizeof(DInfo) + sizeof(DXInfo); hfs_bnode_read(fd.bnode, folder_finder_info, fd.entryoffset + offsetof(struct hfsplus_cat_folder, user_info), len); found_bit = find_first_bit((void *)folder_finder_info, len*8); } else if (entry_type == HFSPLUS_FILE) { - len = sizeof(struct FInfo) + sizeof(struct FXInfo); + len = sizeof(FInfo) + sizeof(FXInfo); hfs_bnode_read(fd.bnode, 
file_finder_info, fd.entryoffset + offsetof(struct hfsplus_cat_file, user_info), diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1e1acf5775ab..51d26aa2b93e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -581,7 +581,7 @@ static struct inode *hostfs_iget(struct super_block *sb, char *name) if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { unlock_new_inode(inode); } else { spin_lock(&inode->i_lock); @@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hostfs_fs_info *fsi = fc->s_fs_info; struct fs_parse_result result; - char *host_root; + char *host_root, *tmp_root; int opt; opt = fs_parse(fc, hostfs_param_specs, param, &result); @@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_hostfs: host_root = param->string; if (!*host_root) - host_root = ""; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + break; + tmp_root = kasprintf(GFP_KERNEL, "%s%s", + fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; break; } @@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) static int hostfs_parse_monolithic(struct fs_context *fc, void *data) { struct hostfs_fs_info *fsi = fc->s_fs_info; - char *host_root = (char *)data; + char *tmp_root, *host_root = (char *)data; /* NULL is printed as '(null)' by printf(): avoid that. */ if (host_root == NULL) - host_root = ""; + return 0; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; - + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; return 0; } @@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc) if (!fsi) return -ENOMEM; + fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino); + if (!fsi->host_root_path) { + kfree(fsi); + return -ENOMEM; + } fc->s_fs_info = fsi; fc->ops = &hostfs_context_ops; return 0; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 49dd585c2b17..ceb50b2dc91a 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in result = ERR_PTR(-ENOMEM); goto bail1; } - if (result->i_state & I_NEW) { + if (inode_state_read_once(result) & I_NEW) { hpfs_init_inode(result); if (de->directory) hpfs_read_inode(result); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 34008442ee26..93d528f4f4f2 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -196,7 +196,7 @@ void hpfs_write_inode(struct inode *i) parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); if (parent) { hpfs_inode->i_dirty = 0; - if (parent->i_state & I_NEW) { + if (inode_state_read_once(parent) & I_NEW) { hpfs_init_inode(parent); hpfs_read_inode(parent); unlock_new_inode(parent); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 8ab85e7ac91e..371aa6de8075 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include <linux/module.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/init.h> diff --git a/fs/init.c b/fs/init.c index 07f592ccdba8..e0f5429c0a49 100644 --- a/fs/init.c 
+++ b/fs/init.c @@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + dentry, mode, new_decode_dev(dev), NULL); end_creating_path(&path, dentry); return error; } @@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname) error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, oldname); + dentry, oldname, NULL); end_creating_path(&path, dentry); return error; } @@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) error = security_path_mkdir(&path, dentry, mode); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode); + dentry, mode, NULL); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..cc8265cfe80e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -233,7 +233,7 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; - inode->i_state = 0; + inode_state_assign_raw(inode, 0); atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; @@ -471,7 +471,7 @@ EXPORT_SYMBOL(set_nlink); void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { - WARN_ON(!(inode->i_state & I_LINKABLE)); + WARN_ON(!(inode_state_read_once(inode) & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } @@ -530,9 +530,48 @@ void ihold(struct inode *inode) } EXPORT_SYMBOL(ihold); -static void __inode_add_lru(struct inode *inode, bool rotate) +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit) +{ + void *bit_address; + + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); +} +EXPORT_SYMBOL(inode_bit_waitqueue); + +void wait_on_new_inode(struct inode *inode) +{ + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + spin_lock(&inode->i_lock); + if (!(inode_state_read(inode) & I_NEW)) { + spin_unlock(&inode->i_lock); + return; + } + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + if (!(inode_state_read(inode) & I_NEW)) + break; + spin_unlock(&inode->i_lock); + schedule(); + spin_lock(&inode->i_lock); + } + finish_wait(wq_head, &wqe.wq_entry); + WARN_ON(inode_state_read(inode) & I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(wait_on_new_inode); + +static void __inode_lru_list_add(struct inode *inode, bool rotate) { - if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + lockdep_assert_held(&inode->i_lock); + + if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (icount_read(inode)) return; @@ -544,32 +583,22 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) - inode->i_state |= I_REFERENCED; -} - -struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, - struct inode *inode, u32 bit) -{ - void *bit_address; - - bit_address = inode_state_wait_address(inode, bit); - 
init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + inode_state_set(inode, I_REFERENCED); } -EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). - * - * Needs inode->i_lock held. */ -void inode_add_lru(struct inode *inode) +void inode_lru_list_add(struct inode *inode) { - __inode_add_lru(inode, false); + __inode_lru_list_add(inode, false); } static void inode_lru_list_del(struct inode *inode) { + if (list_empty(&inode->i_lru)) + return; + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -577,15 +606,15 @@ static void inode_lru_list_del(struct inode *inode) static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); - WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); - inode->i_state |= I_LRU_ISOLATING; + WARN_ON(inode_state_read(inode) & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode_state_set(inode, I_LRU_ISOLATING); } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); - inode->i_state &= ~I_LRU_ISOLATING; + WARN_ON(!(inode_state_read(inode) & I_LRU_ISOLATING)); + inode_state_clear(inode, I_LRU_ISOLATING); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); @@ -597,7 +626,7 @@ static void inode_wait_for_lru_isolating(struct inode *inode) struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); - if (!(inode->i_state & I_LRU_ISOLATING)) + if (!(inode_state_read(inode) & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); @@ -607,14 +636,14 @@ static void inode_wait_for_lru_isolating(struct inode *inode) * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. */ - if (!(inode->i_state & I_LRU_ISOLATING)) + if (!(inode_state_read(inode) & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); - WARN_ON(inode->i_state & I_LRU_ISOLATING); + WARN_ON(inode_state_read(inode) & I_LRU_ISOLATING); } /** @@ -761,11 +790,11 @@ void clear_inode(struct inode *inode) */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); - BUG_ON(!(inode->i_state & I_FREEING)); - BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); + BUG_ON(inode_state_read_once(inode) & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ - inode->i_state = I_FREEING | I_CLEAR; + inode_state_assign_raw(inode, I_FREEING | I_CLEAR); } EXPORT_SYMBOL(clear_inode); @@ -786,12 +815,10 @@ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; - BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_io_list)) - inode_io_list_del(inode); - + inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); @@ -829,7 +856,7 @@ static void evict(struct inode *inode) * This also means we don't need any fences for the call below. 
*/ inode_wake_up_bit(inode, __I_NEW); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR)); destroy_inode(inode); } @@ -879,12 +906,12 @@ again: spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } - inode->i_state |= I_FREEING; + inode_state_set(inode, I_FREEING); inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); @@ -938,7 +965,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * sync, or the last page cache deletion will requeue them. */ if (icount_read(inode) || - (inode->i_state & ~I_REFERENCED) || + (inode_state_read(inode) & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); @@ -947,8 +974,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, } /* Recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; + if (inode_state_read(inode) & I_REFERENCED) { + inode_state_clear(inode, I_REFERENCED); spin_unlock(&inode->i_lock); return LRU_ROTATE; } @@ -975,8 +1002,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_RETRY; } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_set(inode, I_FREEING); list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); @@ -1008,7 +1035,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, bool is_inode_hash_locked) + void *data, bool is_inode_hash_locked, + bool *isnew) { struct inode *inode = NULL; @@ -1025,16 +1053,17 @@ repeat: if (!test(inode, data)) continue; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } - if (unlikely(inode->i_state & I_CREATING)) { + if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); + *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; @@ -1049,7 +1078,7 @@ repeat: */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, - bool is_inode_hash_locked) + bool is_inode_hash_locked, bool *isnew) { struct inode *inode = NULL; @@ -1066,16 +1095,17 @@ repeat: if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } - if (unlikely(inode->i_state & I_CREATING)) { + if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); + *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; @@ -1180,14 +1210,8 @@ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_NEW)); - 
inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } @@ -1197,14 +1221,8 @@ void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); @@ -1260,6 +1278,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set + * @isnew: pointer to a bool which will indicate whether I_NEW is set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a @@ -1278,12 +1297,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + bool isnew; might_sleep(); again: spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data, true); + old = find_inode(inode->i_sb, head, test, data, true, &isnew); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. @@ -1292,7 +1312,8 @@ again: spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; - wait_on_inode(old); + if (unlikely(isnew)) + wait_on_new_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; @@ -1310,7 +1331,7 @@ again: * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; + inode_state_set(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); @@ -1383,15 +1404,17 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + bool isnew; might_sleep(); again: - inode = find_inode(sb, head, test, data, false); + inode = find_inode(sb, head, test, data, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1426,15 +1449,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + bool isnew; might_sleep(); again: - inode = find_inode_fast(sb, head, ino, false); + inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1448,11 +1473,11 @@ again: spin_lock(&inode_hash_lock); /* We released the lock, so.. 
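 * somebody else might have created and hashed an inode for this ino in
 * the meantime; repeat the fast lookup under the lock before inserting
 * ours.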
*/ - old = find_inode_fast(sb, head, ino, true); + old = find_inode_fast(sb, head, ino, true, &isnew); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); - inode->i_state = I_NEW; + inode_state_assign(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); @@ -1474,7 +1499,8 @@ again: if (IS_ERR(old)) return NULL; inode = old; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1545,7 +1571,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { @@ -1578,13 +1604,13 @@ EXPORT_SYMBOL(igrab); * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) + int (*test)(struct inode *, void *), void *data, bool *isnew) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data, true); + inode = find_inode(sb, head, test, data, true, isnew); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; @@ -1612,13 +1638,15 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + bool isnew; might_sleep(); again: - inode = ilookup5_nowait(sb, hashval, test, data); + inode = ilookup5_nowait(sb, hashval, test, data, &isnew); if (inode) { - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1640,16 +1668,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + bool isnew; might_sleep(); again: - inode = find_inode_fast(sb, head, ino, false); + inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1741,7 +1771,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } @@ -1780,7 +1810,7 @@ struct inode *find_inode_by_ino_rcu(struct super_block *sb, hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) + !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; @@ -1792,6 +1822,7 @@ int insert_inode_locked(struct inode *inode) struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + bool isnew; might_sleep(); @@ -1804,7 +1835,7 @@ int insert_inode_locked(struct inode *inode) if (old->i_sb != sb) continue; spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } @@ -1812,21 
+1843,23 @@ int insert_inode_locked(struct inode *inode) } if (likely(!old)) { spin_lock(&inode->i_lock); - inode->i_state |= I_NEW | I_CREATING; + inode_state_set(inode, I_NEW | I_CREATING); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } - if (unlikely(old->i_state & I_CREATING)) { + if (unlikely(inode_state_read(old) & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); + isnew = !!(inode_state_read(old) & I_NEW); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); - wait_on_inode(old); + if (isnew) + wait_on_new_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; @@ -1843,7 +1876,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, might_sleep(); - inode->i_state |= I_CREATING; + inode_state_set_raw(inode, I_CREATING); old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { @@ -1875,10 +1908,10 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; - unsigned long state; int drop; - WARN_ON(inode->i_state & I_NEW); + WARN_ON(inode_state_read(inode) & I_NEW); + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); if (op->drop_inode) drop = op->drop_inode(inode); @@ -1886,29 +1919,33 @@ static void iput_final(struct inode *inode) drop = inode_generic_drop(inode); if (!drop && - !(inode->i_state & I_DONTCACHE) && + !(inode_state_read(inode) & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { - __inode_add_lru(inode, true); + __inode_lru_list_add(inode, true); spin_unlock(&inode->i_lock); return; } - state = inode->i_state; - if (!drop) { - WRITE_ONCE(inode->i_state, state | I_WILL_FREE); + /* + * Re-check ->i_count in case the ->drop_inode() hooks played games. + * Note we only execute this if the verdict was to drop the inode. 
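+	 * (We also get here when the verdict was to keep the inode but the
+	 * superblock is no longer active, or the inode is I_DONTCACHE; in
+	 * those cases it is about to be freed all the same.)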
+ */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + + if (drop) { + inode_state_set(inode, I_FREEING); + } else { + inode_state_set(inode, I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); - state = inode->i_state; - WARN_ON(state & I_NEW); - state &= ~I_WILL_FREE; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_replace(inode, I_WILL_FREE, I_FREEING); } - WRITE_ONCE(inode->i_state, state | I_FREEING); - if (!list_empty(&inode->i_lru)) - inode_lru_list_del(inode); + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); @@ -1931,7 +1968,7 @@ void iput(struct inode *inode) retry: lockdep_assert_not_held(&inode->i_lock); - VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + VFS_BUG_ON_INODE(inode_state_read_once(inode) & I_CLEAR, inode); /* * Note this assert is technically racy as if the count is bogusly * equal to one, then two CPUs racing to further drop it can both @@ -1942,14 +1979,14 @@ retry: if (atomic_add_unless(&inode->i_count, -1, 1)) return; - if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + if ((inode_state_read_once(inode) & I_DIRTY_TIME) && inode->i_nlink) { trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } spin_lock(&inode->i_lock); - if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + if (unlikely((inode_state_read(inode) & I_DIRTY_TIME) && inode->i_nlink)) { spin_unlock(&inode->i_lock); goto retry; } @@ -1967,6 +2004,18 @@ retry: } EXPORT_SYMBOL(iput); +/** + * iput_not_last - put an inode assuming this is not the last reference + * @inode: inode to put + */ +void iput_not_last(struct inode *inode) +{ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + + WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); +} +EXPORT_SYMBOL(iput_not_last); + #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file @@ -2310,42 +2359,40 @@ out: } EXPORT_SYMBOL(current_time); -static int inode_needs_update_time(struct inode *inode) +static int file_update_time_flags(struct file *file, unsigned int flags) { + struct inode *inode = file_inode(file); struct timespec64 now, ts; - int sync_it = 0; + int sync_mode = 0; + int ret = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + if (unlikely(file->f_mode & FMODE_NOCMTIME)) + return 0; now = current_time(inode); ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_MTIME; - + sync_mode |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_CTIME; - + sync_mode |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) - sync_it |= S_VERSION; + sync_mode |= S_VERSION; - return sync_it; -} - -static int __file_update_time(struct file *file, int sync_mode) -{ - int ret = 0; - struct inode *inode = file_inode(file); + if (!sync_mode) + return 0; - /* try to update time settings */ - if (!mnt_get_write_access_file(file)) { - ret = inode_update_time(inode, sync_mode); - mnt_put_write_access_file(file); - } + if (flags & IOCB_NOWAIT) + return -EAGAIN; + if (mnt_get_write_access_file(file)) + return 0; + ret = inode_update_time(inode, sync_mode); + mnt_put_write_access_file(file); return ret; } @@ -2365,14 +2412,7 @@ static int __file_update_time(struct file *file, int sync_mode) */ int file_update_time(struct file *file) { - int ret; - struct inode *inode = file_inode(file); - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - - return __file_update_time(file, 
ret); + return file_update_time_flags(file, 0); } EXPORT_SYMBOL(file_update_time); @@ -2394,7 +2434,6 @@ EXPORT_SYMBOL(file_update_time); static int file_modified_flags(struct file *file, int flags) { int ret; - struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. @@ -2403,17 +2442,7 @@ static int file_modified_flags(struct file *file, int flags) ret = file_remove_privs_flags(file, flags); if (ret) return ret; - - if (unlikely(file->f_mode & FMODE_NOCMTIME)) - return 0; - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - if (flags & IOCB_NOWAIT) - return -EAGAIN; - - return __file_update_time(file, ret); + return file_update_time_flags(file, flags); } /** @@ -2970,7 +2999,7 @@ void dump_inode(struct inode *inode, const char *reason) pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); + inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/internal.h b/fs/internal.h index 9b2b4d116880..d08d5e2235e9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -67,6 +67,9 @@ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags); +int lookup_noperm_common(struct qstr *qname, struct dentry *base); /* * namespace.c diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index f7e1c8534c46..a572b8808524 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -14,5 +14,6 @@ iomap-y += trace.o \ iomap-$(CONFIG_BLOCK) += direct-io.o \ ioend.o \ fiemap.o \ - seek.o + seek.o \ + bio.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c new file mode 100644 index 000000000000..fc045f2e4c45 --- /dev/null +++ b/fs/iomap/bio.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2016-2023 Christoph Hellwig. 
+ */ +#include <linux/iomap.h> +#include <linux/pagemap.h> +#include "internal.h" +#include "trace.h" + +static void iomap_read_end_io(struct bio *bio) +{ + int error = blk_status_to_errno(bio->bi_status); + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); + bio_put(bio); +} + +static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +{ + struct bio *bio = ctx->read_ctx; + + if (bio) + submit_bio(bio); +} + +static int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen) +{ + struct folio *folio = ctx->cur_folio; + const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; + size_t poff = offset_in_folio(folio, pos); + loff_t length = iomap_length(iter); + sector_t sector; + struct bio *bio = ctx->read_ctx; + + sector = iomap_sector(iomap, pos); + if (!bio || bio_end_sector(bio) != sector || + !bio_add_folio(bio, folio, plen, poff)) { + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); + gfp_t orig_gfp = gfp; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); + + if (bio) + submit_bio(bio); + + if (ctx->rac) /* same as readahead_gfp_mask */ + gfp |= __GFP_NORETRY | __GFP_NOWARN; + bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp); + /* + * If the bio_alloc fails, try it again for a single page to + * avoid having to deal with partial page reads. This emulates + * what do_mpage_read_folio does. + */ + if (!bio) + bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); + if (ctx->rac) + bio->bi_opf |= REQ_RAHEAD; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = iomap_read_end_io; + bio_add_folio_nofail(bio, folio, plen, poff); + ctx->read_ctx = bio; + } + return 0; +} + +const struct iomap_read_ops iomap_bio_read_ops = { + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, +}; +EXPORT_SYMBOL_GPL(iomap_bio_read_ops); + +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct bio_vec bvec; + struct bio bio; + + bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); + bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); + return submit_bio_wait(&bio); +} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8b847a1e27f1..e5c1ca440d93 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -8,6 +8,7 @@ #include <linux/writeback.h> #include <linux/swap.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -37,10 +38,28 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio, return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, - unsigned int block) +/* + * Find the next uptodate block in the folio. end_blk is inclusive. + * If no uptodate block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_uptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + + return find_next_bit(ifs->state, end_blk + 1, start_blk); +} + +/* + * Find the next non-uptodate block in the folio. end_blk is inclusive. + * If no non-uptodate block is found, this will return end_blk + 1. 
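+ *
+ * Note that ifs->state keeps the per-block uptodate bits in its first
+ * i_blocks_per_folio() bits, with the per-block dirty bits following
+ * at that offset; that is why ifs_next_dirty_block() and
+ * ifs_next_clean_block() below bias their indices by 'blks'.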
+ */
+static unsigned ifs_next_nonuptodate_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
 {
-	return test_bit(block, ifs->state);
+	struct iomap_folio_state *ifs = folio->private;
+
+	return find_next_zero_bit(ifs->state, end_blk + 1, start_blk);
 }
 
 static bool ifs_set_range_uptodate(struct folio *folio,
@@ -75,13 +94,34 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 		folio_mark_uptodate(folio);
 }
 
-static inline bool ifs_block_is_dirty(struct folio *folio,
-		struct iomap_folio_state *ifs, int block)
+/*
+ * Find the next dirty block in the folio. end_blk is inclusive.
+ * If no dirty block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_dirty_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
 {
+	struct iomap_folio_state *ifs = folio->private;
 	struct inode *inode = folio->mapping->host;
-	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int blks = i_blocks_per_folio(inode, folio);
+
+	return find_next_bit(ifs->state, blks + end_blk + 1,
+			blks + start_blk) - blks;
+}
+
+/*
+ * Find the next clean block in the folio. end_blk is inclusive.
+ * If no clean block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_clean_block(struct folio *folio,
+		unsigned start_blk, unsigned end_blk)
+{
+	struct iomap_folio_state *ifs = folio->private;
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks = i_blocks_per_folio(inode, folio);
 
-	return test_bit(block + blks_per_folio, ifs->state);
+	return find_next_zero_bit(ifs->state, blks + end_blk + 1,
+			blks + start_blk) - blks;
 }
 
 static unsigned ifs_find_dirty_range(struct folio *folio,
@@ -92,18 +132,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio,
 			offset_in_folio(folio, *range_start) >> inode->i_blkbits;
 	unsigned end_blk = min_not_zero(
 			offset_in_folio(folio, range_end) >> inode->i_blkbits,
-			i_blocks_per_folio(inode, folio));
-	unsigned nblks = 1;
+			i_blocks_per_folio(inode, folio)) - 1;
+	unsigned nblks;
 
-	while (!ifs_block_is_dirty(folio, ifs, start_blk))
-		if (++start_blk == end_blk)
-			return 0;
-
-	while (start_blk + nblks < end_blk) {
-		if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
-			break;
-		nblks++;
-	}
+	start_blk = ifs_next_dirty_block(folio, start_blk, end_blk);
+	if (start_blk > end_blk)
+		return 0;
+	if (start_blk == end_blk)
+		nblks = 1;
+	else
+		nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) -
+			start_blk;
 
 	*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
 	return nblks << inode->i_blkbits;
@@ -218,6 +257,22 @@ static void ifs_free(struct folio *folio)
 }
 
 /*
+ * Calculate how many bytes to truncate based on the number of blocks to
+ * truncate and the end position truncation starts from.
+ */
+static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits,
+		unsigned blocks_truncated)
+{
+	unsigned block_size = 1 << block_bits;
+	unsigned block_offset = end_pos & (block_size - 1);
+
+	if (!block_offset)
+		return blocks_truncated << block_bits;
+
+	return ((blocks_truncated - 1) << block_bits) + block_offset;
+}
+
+/*
  * Calculate the range inside the folio that we actually need to read.
  */
 static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
@@ -240,24 +295,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
	 * to avoid reading in already uptodate ranges.
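	 *
	 * For example, with 4k blocks in a 16k folio where only blocks 0 and
	 * 3 are already uptodate, a full-folio read is trimmed to bytes
	 * 4096..12287.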
*/ if (ifs) { - unsigned int i; - - /* move forward for each leading block marked uptodate */ - for (i = first; i <= last; i++) { - if (!ifs_block_is_uptodate(ifs, i)) - break; - *pos += block_size; - poff += block_size; - plen -= block_size; - first++; + unsigned int next, blocks_skipped; + + next = ifs_next_nonuptodate_block(folio, first, last); + blocks_skipped = next - first; + + if (blocks_skipped) { + unsigned long block_offset = *pos & (block_size - 1); + unsigned bytes_skipped = + (blocks_skipped << block_bits) - block_offset; + + *pos += bytes_skipped; + poff += bytes_skipped; + plen -= bytes_skipped; } + first = next; /* truncate len if we find any trailing uptodate block(s) */ - while (++i <= last) { - if (ifs_block_is_uptodate(ifs, i)) { - plen -= (last - i + 1) * block_size; - last = i - 1; - break; + if (++next <= last) { + next = ifs_next_uptodate_block(folio, next, last); + if (next <= last) { + plen -= iomap_bytes_to_truncate(*pos + plen, + block_bits, last - next + 1); + last = next - 1; } } } @@ -271,7 +331,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) - plen -= (last - end) * block_size; + plen -= iomap_bytes_to_truncate(*pos + plen, block_bits, + last - end); } *offp = poff; @@ -320,9 +381,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, return 0; } -#ifdef CONFIG_BLOCK -static void iomap_finish_folio_read(struct folio *folio, size_t off, - size_t len, int error) +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; @@ -342,169 +402,201 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (finished) folio_end_read(folio, uptodate); } +EXPORT_SYMBOL_GPL(iomap_finish_folio_read); -static void iomap_read_end_io(struct bio *bio) +static void iomap_read_init(struct folio *folio) { - int error = blk_status_to_errno(bio->bi_status); - struct folio_iter fi; + struct iomap_folio_state *ifs = folio->private; - bio_for_each_folio_all(fi, bio) - iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); - bio_put(bio); + if (ifs) { + size_t len = folio_size(folio); + + /* + * ifs->read_bytes_pending is used to track how many bytes are + * read in asynchronously by the IO helper. We need to track + * this so that we can know when the IO helper has finished + * reading in all the necessary ranges of the folio and can end + * the read. + * + * Increase ->read_bytes_pending by the folio size to start, and + * add a +1 bias. We'll subtract the bias and any uptodate / + * zeroed ranges that did not require IO in iomap_read_end() + * after we're done processing the folio. + * + * We do this because otherwise, we would have to increment + * ifs->read_bytes_pending every time a range in the folio needs + * to be read in, which can get expensive since the spinlock + * needs to be held whenever modifying ifs->read_bytes_pending. + * + * We add the bias to ensure the read has not been ended on the + * folio when iomap_read_end() is called, even if the IO helper + * has already finished reading in the entire folio. 
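+	 *
+	 * As a worked example: on a 64k folio, read_bytes_pending starts
+	 * out at 65537. If 16k of the folio had to be submitted for IO,
+	 * iomap_read_end() subtracts 65537 - 16384 = 49153, leaving 16384
+	 * to be completed (and the read ended) by the IO helper.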
+ */ + spin_lock_irq(&ifs->state_lock); + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + ifs->read_bytes_pending = len + 1; + spin_unlock_irq(&ifs->state_lock); + } } -struct iomap_readpage_ctx { - struct folio *cur_folio; - bool cur_folio_in_bio; - struct bio *bio; - struct readahead_control *rac; -}; +/* + * This ends IO if no bytes were submitted to an IO helper. + * + * Otherwise, this calibrates ifs->read_bytes_pending to represent only the + * submitted bytes (see comment in iomap_read_init()). If all bytes submitted + * have already been completed by the IO helper, then this will end the read. + * Else the IO helper will end the read after all submitted ranges have been + * read. + */ +static void iomap_read_end(struct folio *folio, size_t bytes_submitted) +{ + struct iomap_folio_state *ifs = folio->private; -static int iomap_readpage_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) + if (ifs) { + bool end_read, uptodate; + + spin_lock_irq(&ifs->state_lock); + if (!ifs->read_bytes_pending) { + WARN_ON_ONCE(bytes_submitted); + spin_unlock_irq(&ifs->state_lock); + folio_unlock(folio); + return; + } + + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. The +1 accounts for + * the bias we added in iomap_read_init(). + */ + ifs->read_bytes_pending -= + (folio_size(folio) + 1 - bytes_submitted); + + /* + * If !ifs->read_bytes_pending, this means all pending reads by + * the IO helper have already completed, which means we need to + * end the folio read here. If ifs->read_bytes_pending != 0, + * the IO helper will end the folio read. + */ + end_read = !ifs->read_bytes_pending; + if (end_read) + uptodate = ifs_is_fully_uptodate(folio, ifs); + spin_unlock_irq(&ifs->state_lock); + if (end_read) + folio_end_read(folio, uptodate); + } else if (!bytes_submitted) { + /* + * If there were no bytes submitted, this means we are + * responsible for unlocking the folio here, since no IO helper + * has taken ownership of it. If there were bytes submitted, + * then the IO helper will end the read via + * iomap_finish_folio_read(). 
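+	 * (iomap_finish_folio_read() in turn calls folio_end_read() once
+	 * every submitted range has completed.)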
+ */ + folio_unlock(folio); + } +} + +static int iomap_read_folio_iter(struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted) { const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos; loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; - struct iomap_folio_state *ifs; size_t poff, plen; - sector_t sector; + loff_t pos_diff; int ret; if (iomap->type == IOMAP_INLINE) { ret = iomap_read_inline_data(iter, folio); if (ret) return ret; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } - /* zero post-eof blocks as the page may be mapped */ - ifs = ifs_alloc(iter->inode, folio, iter->flags); - iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); - if (plen == 0) - goto done; + ifs_alloc(iter->inode, folio, iter->flags); - if (iomap_block_needs_zeroing(iter, pos)) { - folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, poff, plen); - goto done; - } + length = min_t(loff_t, length, + folio_size(folio) - offset_in_folio(folio, pos)); + while (length) { + iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, + &plen); - ctx->cur_folio_in_bio = true; - if (ifs) { - spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += plen; - spin_unlock_irq(&ifs->state_lock); - } + pos_diff = pos - iter->pos; + if (WARN_ON_ONCE(pos_diff + plen > length)) + return -EIO; - sector = iomap_sector(iomap, pos); - if (!ctx->bio || - bio_end_sector(ctx->bio) != sector || - !bio_add_folio(ctx->bio, folio, plen, poff)) { - gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - gfp_t orig_gfp = gfp; - unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - - if (ctx->bio) - submit_bio(ctx->bio); - - if (ctx->rac) /* same as readahead_gfp_mask */ - gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), - REQ_OP_READ, gfp); - /* - * If the bio_alloc fails, try it again for a single page to - * avoid having to deal with partial page reads. This emulates - * what do_mpage_read_folio does. - */ - if (!ctx->bio) { - ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, - orig_gfp); - } - if (ctx->rac) - ctx->bio->bi_opf |= REQ_RAHEAD; - ctx->bio->bi_iter.bi_sector = sector; - ctx->bio->bi_end_io = iomap_read_end_io; - bio_add_folio_nofail(ctx->bio, folio, plen, poff); - } + ret = iomap_iter_advance(iter, pos_diff); + if (ret) + return ret; -done: - /* - * Move the caller beyond our range so that it keeps making progress. - * For that, we have to include any leading non-uptodate ranges, but - * we can skip trailing ones as they will be handled in the next - * iteration. 
- */ - length = pos - iter->pos + plen; - return iomap_iter_advance(iter, &length); -} + if (plen == 0) + return 0; -static int iomap_read_folio_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) -{ - int ret; + /* zero post-eof blocks as the page may be mapped */ + if (iomap_block_needs_zeroing(iter, pos)) { + folio_zero_range(folio, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); + } else { + if (!*bytes_submitted) + iomap_read_init(folio); + ret = ctx->ops->read_folio_range(iter, ctx, plen); + if (ret) + return ret; + *bytes_submitted += plen; + } - while (iomap_length(iter)) { - ret = iomap_readpage_iter(iter, ctx); + ret = iomap_iter_advance(iter, plen); if (ret) return ret; + length -= pos_diff + plen; + pos = iter->pos; } - return 0; } -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) +void iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct folio *folio = ctx->cur_folio; struct iomap_iter iter = { .inode = folio->mapping->host, .pos = folio_pos(folio), .len = folio_size(folio), }; - struct iomap_readpage_ctx ctx = { - .cur_folio = folio, - }; + size_t bytes_submitted = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, ctx, + &bytes_submitted); - if (ctx.bio) { - submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_folio_in_bio); - } else { - WARN_ON_ONCE(ctx.cur_folio_in_bio); - folio_unlock(folio); - } + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - /* - * Just like mpage_readahead and block_read_full_folio, we always - * return 0 and just set the folio error flag on errors. This - * should be cleaned up throughout the stack eventually. - */ - return 0; + iomap_read_end(folio, bytes_submitted); } EXPORT_SYMBOL_GPL(iomap_read_folio); static int iomap_readahead_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) + struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted) { int ret; while (iomap_length(iter)) { if (ctx->cur_folio && offset_in_folio(ctx->cur_folio, iter->pos) == 0) { - if (!ctx->cur_folio_in_bio) - folio_unlock(ctx->cur_folio); + iomap_read_end(ctx->cur_folio, *cur_bytes_submitted); ctx->cur_folio = NULL; } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); - ctx->cur_folio_in_bio = false; + if (WARN_ON_ONCE(!ctx->cur_folio)) + return -EINVAL; + *cur_bytes_submitted = 0; } - ret = iomap_readpage_iter(iter, ctx); + ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted); if (ret) return ret; } @@ -514,8 +606,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter, /** * iomap_readahead - Attempt to read pages from a file. - * @rac: Describes the pages to be read. * @ops: The operations vector for the filesystem. + * @ctx: The ctx used for issuing readahead. * * This function is for filesystems to call to implement their readahead * address_space operation. @@ -527,51 +619,30 @@ static int iomap_readahead_iter(struct iomap_iter *iter, * function is called with memalloc_nofs set, so allocations will not cause * the filesystem to be reentered. 
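 *
 * A minimal caller sketch (the foo_* names here are placeholders,
 * assuming the bio-based read ops exported from fs/iomap/bio.c):
 *
 *	static void foo_readahead(struct readahead_control *rac)
 *	{
 *		struct iomap_read_folio_ctx ctx = {
 *			.ops	= &iomap_bio_read_ops,
 *			.rac	= rac,
 *		};
 *
 *		iomap_readahead(&foo_iomap_ops, &ctx);
 *	}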
*/ -void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct readahead_control *rac = ctx->rac; struct iomap_iter iter = { .inode = rac->mapping->host, .pos = readahead_pos(rac), .len = readahead_length(rac), }; - struct iomap_readpage_ctx ctx = { - .rac = rac, - }; + size_t cur_bytes_submitted; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.status = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, ctx, + &cur_bytes_submitted); - if (ctx.bio) - submit_bio(ctx.bio); - if (ctx.cur_folio) { - if (!ctx.cur_folio_in_bio) - folio_unlock(ctx.cur_folio); - } -} -EXPORT_SYMBOL_GPL(iomap_readahead); - -static int iomap_read_folio_range(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct bio_vec bvec; - struct bio bio; + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); - bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); - return submit_bio_wait(&bio); + if (ctx->cur_folio) + iomap_read_end(ctx->cur_folio, cur_bytes_submitted); } -#else -static int iomap_read_folio_range(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - WARN_ON_ONCE(1); - return -EIO; -} -#endif /* CONFIG_BLOCK */ +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a folio are @@ -584,7 +655,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned first, last, i; + unsigned first, last; if (!ifs) return false; @@ -596,10 +667,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) first = from >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits; - for (i = first; i <= last; i++) - if (!ifs_block_is_uptodate(ifs, i)) - return false; - return true; + return ifs_next_nonuptodate_block(folio, first, last) > last; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); @@ -707,7 +775,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, * are not changing pagecache contents. */ if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && - pos + len >= folio_pos(folio) + folio_size(folio)) + pos + len >= folio_next_pos(folio)) return 0; ifs = ifs_alloc(iter->inode, folio, iter->flags); @@ -723,9 +791,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, if (plen == 0) break; - if (!(iter->flags & IOMAP_UNSHARE) && - (from <= poff || from >= poff + plen) && - (to <= poff || to >= poff + plen)) + /* + * If the read range will be entirely overwritten by the write, + * we can skip having to zero/read it in. 
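+	 * With IOMAP_UNSHARE we never skip: the range has to be read in
+	 * regardless so that the write can actually unshare it.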
+ */ + if (!(iter->flags & IOMAP_UNSHARE) && from <= poff && + to >= poff + plen) continue; if (iomap_block_needs_zeroing(iter, block_start)) { @@ -742,7 +813,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, status = write_ops->read_folio_range(iter, folio, block_start, plen); else - status = iomap_read_folio_range(iter, + status = iomap_bio_read_folio_range_sync(iter, folio, block_start, plen); if (status) return status; @@ -761,6 +832,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + if (iter->fbatch) { + struct folio *folio = folio_batch_next(iter->fbatch); + + if (!folio) + return NULL; + + /* + * The folio mapping generally shouldn't have changed based on + * fs locks, but be consistent with filemap lookup and retry + * the iter if it does. + */ + folio_lock(folio); + if (unlikely(folio->mapping != iter->inode->i_mapping)) { + iter->iomap.flags |= IOMAP_F_STALE; + folio_unlock(folio); + return NULL; + } + + folio_get(folio); + return folio; + } + if (write_ops && write_ops->get_folio) return write_ops->get_folio(iter, pos, len); return iomap_get_folio(iter, pos, len); @@ -815,15 +908,14 @@ static int iomap_write_begin(struct iomap_iter *iter, size_t *poffset, u64 *plen) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; + loff_t pos; u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); struct folio *folio; int status = 0; len = min_not_zero(len, *plen); - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); - if (srcmap != &iter->iomap) - BUG_ON(pos + len > srcmap->offset + srcmap->length); + *foliop = NULL; + *plen = 0; if (fatal_signal_pending(current)) return -EINTR; @@ -833,6 +925,15 @@ static int iomap_write_begin(struct iomap_iter *iter, return PTR_ERR(folio); /* + * No folio means we're done with a batch. We still have range to + * process so return and let the caller iterate and refill the batch. + */ + if (!folio) { + WARN_ON_ONCE(!iter->fbatch); + return 0; + } + + /* * Now we have a locked folio, before we do anything with it we need to * check that the iomap we have cached is not stale. The inode extent * mapping can change due to concurrent IO in flight (e.g. @@ -852,6 +953,22 @@ static int iomap_write_begin(struct iomap_iter *iter, } } + /* + * The folios in a batch may not be contiguous. If we've skipped + * forward, advance the iter to the pos of the current folio. If the + * folio starts beyond the end of the mapping, it may have been trimmed + * since the lookup for whatever reason. Return a NULL folio to + * terminate the op. 
+ */ + if (folio_pos(folio) > iter->pos) { + len = min_t(u64, folio_pos(folio) - iter->pos, + iomap_length(iter)); + status = iomap_iter_advance(iter, len); + len = iomap_length(iter); + if (status || !len) + goto out_unlock; + } + pos = iomap_trim_folio_range(iter, folio, poffset, &len); if (srcmap->type == IOMAP_INLINE) @@ -1041,7 +1158,7 @@ retry: } } else { total_written += written; - iomap_iter_advance(iter, &written); + iomap_iter_advance(iter, written); } } while (iov_iter_count(i) && iomap_length(iter)); @@ -1082,7 +1199,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { - unsigned int first_blk, last_blk, i; + unsigned int first_blk, last_blk; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; @@ -1097,14 +1214,14 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, if (!ifs) return; - last_byte = min_t(loff_t, end_byte - 1, - folio_pos(folio) + folio_size(folio) - 1); + last_byte = min_t(loff_t, end_byte - 1, folio_next_pos(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; - for (i = first_blk; i <= last_blk; i++) { - if (!ifs_block_is_dirty(folio, ifs, i)) - punch(inode, folio_pos(folio) + (i << blkbits), - 1 << blkbits, iomap); + while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk)) + <= last_blk) { + punch(inode, folio_pos(folio) + (first_blk << blkbits), + 1 << blkbits, iomap); + first_blk++; } } @@ -1129,8 +1246,7 @@ static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, * Make sure the next punch start is correctly bound to * the end of this data range, not the end of the folio. 
*/ - *punch_start_byte = min_t(loff_t, end_byte, - folio_pos(folio) + folio_size(folio)); + *punch_start_byte = min_t(loff_t, end_byte, folio_next_pos(folio)); } /* @@ -1170,7 +1286,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ - start_byte = folio_pos(folio) + folio_size(folio); + start_byte = folio_next_pos(folio); folio_unlock(folio); folio_put(folio); } @@ -1310,7 +1426,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter, int status; if (!iomap_want_unshare_iter(iter)) - return iomap_iter_advance(iter, &bytes); + return iomap_iter_advance(iter, bytes); do { struct folio *folio; @@ -1334,10 +1450,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter, balance_dirty_pages_ratelimited(iter->inode->i_mapping); - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); return status; } @@ -1398,6 +1514,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (iter->iomap.flags & IOMAP_F_STALE) break; + /* a NULL folio means we're done with a folio batch */ + if (!folio) { + status = iomap_iter_advance_full(iter); + break; + } + /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); @@ -1412,16 +1534,36 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (WARN_ON_ONCE(!ret)) return -EIO; - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; return status; } +loff_t +iomap_fill_dirty_folios( + struct iomap_iter *iter, + loff_t offset, + loff_t length) +{ + struct address_space *mapping = iter->inode->i_mapping; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + length - 1) >> PAGE_SHIFT; + + iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL); + if (!iter->fbatch) + return offset + length; + folio_batch_init(iter->fbatch); + + filemap_get_folios_dirty(mapping, &start, end, iter->fbatch); + return (start << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); + int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, @@ -1435,46 +1577,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, .private = private, }; struct address_space *mapping = inode->i_mapping; - unsigned int blocksize = i_blocksize(inode); - unsigned int off = pos & (blocksize - 1); - loff_t plen = min_t(loff_t, len, blocksize - off); int ret; bool range_dirty; /* - * Zero range can skip mappings that are zero on disk so long as - * pagecache is clean. If pagecache was dirty prior to zero range, the - * mapping converts on writeback completion and so must be zeroed. - * - * The simplest way to deal with this across a range is to flush - * pagecache and process the updated mappings. To avoid excessive - * flushing on partial eof zeroing, special case it to zero the - * unaligned start portion if already dirty in pagecache. 
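iomap_fill_dirty_folios() above lets the filesystem stage the dirty folios over a range in iter->fbatch before starting a zero-range pass. A hypothetical wrapper, only to show the calling convention (the interpretation of the return value is inferred from the helper body):

	static loff_t myfs_zero_range_prep(struct iomap_iter *iter, loff_t pos,
			loff_t len)
	{
		/*
		 * Stage the dirty folios over the range; the write_begin path
		 * then walks iter->fbatch instead of doing fresh pagecache
		 * lookups. The return value is where the dirty-folio scan
		 * stopped (pos + len if the batch could not be allocated), so
		 * callers may trim the zeroing work they do past it.
		 */
		return iomap_fill_dirty_folios(iter, pos, len);
	}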
- */ - if (off && - filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { - iter.len = plen; - while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_zero_iter(&iter, did_zero, - write_ops); - - iter.len = len - (iter.pos - pos); - if (ret || !iter.len) - return ret; - } - - /* * To avoid an unconditional flush, check pagecache state and only flush * if dirty and the fs returns a mapping that might convert on * writeback. */ - range_dirty = filemap_range_needs_writeback(inode->i_mapping, - iter.pos, iter.pos + iter.len - 1); + range_dirty = filemap_range_needs_writeback(mapping, iter.pos, + iter.pos + iter.len - 1); while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (srcmap->type == IOMAP_HOLE || - srcmap->type == IOMAP_UNWRITTEN) { + if (WARN_ON_ONCE(iter.fbatch && + srcmap->type != IOMAP_UNWRITTEN)) + return -EIO; + + if (!iter.fbatch && + (srcmap->type == IOMAP_HOLE || + srcmap->type == IOMAP_UNWRITTEN)) { s64 status; if (range_dirty) { @@ -1526,7 +1648,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, @@ -1559,16 +1681,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len) +static void iomap_writeback_init(struct inode *inode, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + if (ifs) { + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set this to the folio size. After processing the folio for + * writeback in iomap_writeback_folio(), we'll subtract any + * ranges not written back. + * + * We do this because otherwise, we would have to atomically + * increment ifs->write_bytes_pending every time a range in the + * folio needs to be written back. + */ + atomic_set(&ifs->write_bytes_pending, folio_size(folio)); + } } -EXPORT_SYMBOL_GPL(iomap_start_folio_write); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) @@ -1585,7 +1716,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write); static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, u32 rlen, u64 end_pos, - bool *wb_pending) + size_t *bytes_submitted) { do { ssize_t ret; @@ -1599,11 +1730,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, pos += ret; /* - * Holes are not be written back by ->writeback_range, so track + * Holes are not written back by ->writeback_range, so track * if we did handle anything that is not a hole here. 
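The write_bytes_pending switch above is easiest to see with numbers. A worked example (not from the patch), assuming a 16KiB folio with 4KiB blocks of which only the first two are dirty:

	iomap_writeback_init():        write_bytes_pending = 16384
	dirty-range loop submits 8192: bytes_submitted     = 8192
	end of iomap_writeback_folio():
	    iomap_finish_folio_write(inode, folio, 16384 - 8192)
	                               write_bytes_pending = 8192
	both block bios complete:      write_bytes_pending = 0
	                               -> folio_end_writeback()

The old scheme started at zero, held a +1 bias until submission finished, and had to atomic_add() every submitted range; the new one does a single atomic_set() plus one subtraction.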
*/ if (wpc->iomap.type != IOMAP_HOLE) - *wb_pending = true; + *bytes_submitted += ret; } while (rlen); return 0; @@ -1674,7 +1805,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - bool wb_pending = false; + size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1694,14 +1825,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) iomap_set_range_dirty(folio, 0, end_pos - pos); } - /* - * Keep the I/O completion handler from clearing the writeback - * bit until we have submitted all blocks by adding a bias to - * ifs->write_bytes_pending, which is dropped after submitting - * all blocks. - */ - WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - iomap_start_folio_write(inode, folio, 1); + iomap_writeback_init(inode, folio); } /* @@ -1716,13 +1840,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, - &wb_pending); + &bytes_submitted); if (error) break; pos += rlen; } - if (wb_pending) + if (bytes_submitted) wpc->nr_folios++; /* @@ -1740,12 +1864,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); - } else { - if (!wb_pending) - folio_end_writeback(folio); + /* + * Subtract any bytes that were initially accounted to + * write_bytes_pending but skipped for writeback. + */ + size_t bytes_not_submitted = folio_size(folio) - + bytes_submitted; + + if (bytes_not_submitted) + iomap_finish_folio_write(inode, folio, + bytes_not_submitted); + } else if (!bytes_submitted) { + folio_end_writeback(folio); } + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 5d5d63efbd57..8e273408453a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -16,21 +16,13 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_NO_INVALIDATE (1U << 25) -#define IOMAP_DIO_CALLER_COMP (1U << 26) -#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_NO_INVALIDATE (1U << 26) +#define IOMAP_DIO_COMP_WORK (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) #define IOMAP_DIO_NEED_SYNC (1U << 29) #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) -/* - * Used for sub block zeroing in iomap_dio_zero() - */ -#define IOMAP_ZERO_PAGE_SIZE (SZ_64K) -#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE)) -static struct page *zero_page; - struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; @@ -140,11 +132,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); -static ssize_t iomap_dio_deferred_complete(void *data) -{ - return iomap_dio_complete(data); -} - static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -179,33 +166,33 @@ static void iomap_dio_done(struct iomap_dio *dio) WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { - WRITE_ONCE(iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } else 
if (dio->flags & IOMAP_DIO_CALLER_COMP) { - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then - * schedule our completion that way to avoid an async punt to a - * workqueue. - */ - /* only polled IO cares about private cleared */ - iocb->private = dio; - iocb->dio_complete = iomap_dio_deferred_complete; + return; + } - /* - * Invoke ->ki_complete() directly. We've assigned our - * dio_complete callback handler, and since the issuer set - * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will - * notice ->dio_complete being set and will defer calling that - * handler until it can be done from a safe task context. - * - * Note that the 'res' being passed in here is not important - * for this case. The actual completion value of the request - * will be gotten from dio_complete when that is run by the - * issuer. - */ - iocb->ki_complete(iocb, 0); - } else { + /* + * Always run error completions in user context. These are not + * performance critical and some code relies on taking sleeping locks + * for error handling. + */ + if (dio->error) + dio->flags |= IOMAP_DIO_COMP_WORK; + + /* + * Never invalidate pages from this context to avoid deadlocks with + * buffered I/O completions when called from the ioend workqueue, + * or avoid sleeping when called directly from ->bi_end_io. + * Tough luck if you hit the tiny race with someone dirtying the range + * right between this check and the actual completion. + */ + if ((dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_COMP_WORK)) { + if (dio->iocb->ki_filp->f_mapping->nrpages) + dio->flags |= IOMAP_DIO_COMP_WORK; + else + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + + if (dio->flags & IOMAP_DIO_COMP_WORK) { struct inode *inode = file_inode(iocb->ki_filp); /* @@ -216,7 +203,11 @@ static void iomap_dio_done(struct iomap_dio *dio) */ INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + return; } + + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); } void iomap_dio_bio_end_io(struct bio *bio) @@ -252,16 +243,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) /* * Try to avoid another context switch for the completion given * that we are already called from the ioend completion - * workqueue, but never invalidate pages from this thread to - * avoid deadlocks with buffered I/O completions. Tough luck if - * you hit the tiny race with someone dirtying the range now - * between this check and the actual completion. + * workqueue. */ - if (!dio->iocb->ki_filp->f_mapping->nrpages) { - dio->flags |= IOMAP_DIO_INLINE_COMP; - dio->flags |= IOMAP_DIO_NO_INVALIDATE; - } - dio->flags &= ~IOMAP_DIO_CALLER_COMP; + dio->flags &= ~IOMAP_DIO_COMP_WORK; iomap_dio_done(dio); } @@ -285,42 +269,36 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, { struct inode *inode = file_inode(dio->iocb->ki_filp); struct bio *bio; + struct folio *zero_folio = largest_zero_folio(); + int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio)); if (!len) return 0; + /* - * Max block size supported is 64k + * This limit shall never be reached as most filesystems have a + * maximum blocksize of 64k. 
 */ - if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE)) + if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS)) return -EINVAL; - bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); + bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - __bio_add_page(bio, zero_page, len, 0); + while (len > 0) { + unsigned int io_len = min(len, folio_size(zero_folio)); + + bio_add_folio_nofail(bio, zero_folio, io_len, 0); + len -= io_len; + } iomap_dio_submit_bio(iter, dio, bio, pos); - return 0; -} -/* - * Use a FUA write if we need datasync semantics and this is a pure data I/O - * that doesn't require any metadata updates (including after I/O completion - * such as unwritten extent conversion) and the underlying device either - * doesn't have a volatile write cache or supports FUA. - * This allows us to avoid cache flushes on I/O completion. - */ -static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, - struct iomap_dio *dio) -{ - if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) - return false; - if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) - return false; - return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); + return 0; } static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) @@ -336,12 +314,39 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) int nr_pages, ret = 0; u64 copied = 0; size_t orig_count; + unsigned int alignment; + + /* + * File systems that write out of place and always allocate new blocks + * need each bio to be block aligned as that's the unit of allocation. + */ + if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED) + alignment = fs_block_size; + else + alignment = bdev_logical_block_size(iomap->bdev); - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) + if ((pos | length) & (alignment - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { - bio_opf |= REQ_OP_WRITE; + bool need_completion_work = true; + + switch (iomap->type) { + case IOMAP_MAPPED: + /* + * Directly mapped I/O does not inherently need to do + * work at I/O completion time. But there are various + * cases below where this will get set again. + */ + need_completion_work = false; + break; + case IOMAP_UNWRITTEN: + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + default: + break; + } if (iomap->flags & IOMAP_F_ATOMIC_BIO) { /* @@ -354,35 +359,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio_opf |= REQ_ATOMIC; } - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } - - if (iomap->flags & IOMAP_F_SHARED) + if (iomap->flags & IOMAP_F_SHARED) { + /* + * Unsharing needs to update metadata at I/O + * completion time. + */ + need_completion_work = true; dio->flags |= IOMAP_DIO_COW; + } - if (iomap->flags & IOMAP_F_NEW) + if (iomap->flags & IOMAP_F_NEW) { + /* + * Newly allocated blocks might need recording in + * metadata at I/O completion time. 
+ */ + need_completion_work = true; need_zeroout = true; - else if (iomap->type == IOMAP_MAPPED && - iomap_dio_can_use_fua(iomap, dio)) - bio_opf |= REQ_FUA; + } - if (!(bio_opf & REQ_FUA)) - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + /* + * Use a FUA write if we need datasync semantics and this is a + * pure overwrite that doesn't require any metadata updates. + * + * This allows us to avoid cache flushes on I/O completion. + */ + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) { + if (!need_completion_work && + !(iomap->flags & IOMAP_F_DIRTY) && + (!bdev_write_cache(iomap->bdev) || + bdev_fua(iomap->bdev))) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } /* - * We can only do deferred completion for pure overwrites that + * We can only do inline completion for pure overwrites that * don't require additional I/O at completion time. * - * This rules out writes that need zeroing or extent conversion, - * extend the file size, or issue metadata I/O or cache flushes - * during completion processing. + * This rules out writes that need zeroing or metadata updates to + * convert unwritten or shared extents. + * + * Writes that extend i_size are also not supported, but this is + * handled in __iomap_dio_rw(). */ - if (need_zeroout || (pos >= i_size_read(inode)) || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && - !(bio_opf & REQ_FUA))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; + if (need_completion_work) + dio->flags |= IOMAP_DIO_COMP_WORK; + + bio_opf |= REQ_OP_WRITE; } else { bio_opf |= REQ_OP_READ; } @@ -403,7 +427,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. */ - if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) + if (dio->flags & IOMAP_DIO_COMP_WORK) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -434,7 +458,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_end_io = iomap_dio_bio_end_io; ret = bio_iov_iter_get_pages(bio, dio->submit.iter, - bdev_logical_block_size(iomap->bdev) - 1); + alignment - 1); if (unlikely(ret)) { /* * We have to stop part way through an IO. 
We must fall @@ -496,7 +520,7 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return iomap_iter_advance(iter, &copied); + return iomap_iter_advance(iter, copied); return ret; } @@ -507,7 +531,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) dio->size += length; if (!length) return -EFAULT; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) @@ -542,7 +566,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) dio->size += copied; if (!copied) return -EFAULT; - return iomap_iter_advance(iomi, &copied); + return iomap_iter_advance(iomi, copied); } static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) @@ -639,10 +663,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iov_iter_rw(iter) == READ) { - /* reads can always complete inline */ - dio->flags |= IOMAP_DIO_INLINE_COMP; + if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) + dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -656,15 +680,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; - /* - * Flag as supporting deferred completions, if the issuer - * groks it. This can avoid a workqueue punt for writes. - * We may later clear this flag if we need to do other IO - * as part of this IO completion. - */ - if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) - dio->flags |= IOMAP_DIO_CALLER_COMP; - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -694,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* + * i_size updates must happen from process context. + */ + if (iomi.pos + iomi.len > dio->i_size) + dio->flags |= IOMAP_DIO_COMP_WORK; + + /* * Try to invalidate cache pages for the range we are writing. * If this invalidation fails, let the caller fall back to * buffered I/O. @@ -717,12 +738,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } goto out_free_dio; } + } - if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - goto out_free_dio; - } + if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; } inode_dio_begin(inode); @@ -765,9 +786,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * If all the writes we issued were already written through to the * media, we don't need to flush the cache on IO completion. Clear the * sync flag for this case. + * + * Otherwise clear the inline completion flag if any sync work is + * needed, as that needs to be performed from process context. 
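Taken together, the direct-io hunks concentrate the punt-to-workqueue decision in a single flag. A condensed, illustrative summary of when IOMAP_DIO_COMP_WORK ends up set (not a function from the patch; the real checks are spread across iomap_dio_bio_iter(), __iomap_dio_rw() and iomap_dio_done()):

	static bool iomap_dio_completes_in_workqueue(struct iomap_dio *dio)
	{
		if (dio->error)
			return true;	/* error completions run in user context */
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return false;	/* successful reads complete inline */
		if (dio->flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))
			return true;	/* extent conversion at completion time */
		if (dio->flags & IOMAP_DIO_NEED_SYNC)
			return true;	/* cache flush not covered by FUA */
		/* i_size-extending writes and pagecache invalidation also punt */
		return dio->flags & IOMAP_DIO_COMP_WORK;
	}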
*/ if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags |= IOMAP_DIO_COMP_WORK; /* * We are about to drop our additional submission reference, which @@ -825,15 +851,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); - -static int __init iomap_dio_init(void) -{ - zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - IOMAP_ZERO_PAGE_ORDER); - - if (!zero_page) - return -ENOMEM; - - return 0; -} -fs_initcall(iomap_dio_init); diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index d05cb3aed96e..3a4e4aad2bd1 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -6,4 +6,16 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); +#ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len); +#else +static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + WARN_ON_ONCE(1); + return -EIO; +} +#endif /* CONFIG_BLOCK */ + #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..86f44922ed3b 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -194,8 +194,6 @@ new_ioend: if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; - iomap_start_folio_write(wpc->inode, folio, map_len); - /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index cef77ca0c20b..8692e5e41c6d 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -8,22 +8,24 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { + if (iter->fbatch) { + folio_batch_release(iter->fbatch); + kfree(iter->fbatch); + iter->fbatch = NULL; + } + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); } -/* - * Advance the current iterator position and output the length remaining for the - * current mapping. 
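The iomap_iter_advance() conversion here (and at the many call sites already updated above) turns the count from an in/out pointer into a plain value: callers pass the byte count directly and re-read the remaining length themselves via iomap_length(). The resulting loop idiom, sketched as a hypothetical caller:

	static int myfs_walk_mapping(struct iomap_iter *iter)
	{
		u64 bytes = iomap_length(iter);
		int status = 0;

		do {
			/* ... process up to 'bytes' bytes of the mapping ... */
			status = iomap_iter_advance(iter, bytes);
			if (status)
				break;
		} while ((bytes = iomap_length(iter)) > 0);

		return status;
	}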
- */ -int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +/* Advance the current iterator position and decrement the remaining length */ +int iomap_iter_advance(struct iomap_iter *iter, u64 count) { - if (WARN_ON_ONCE(*count > iomap_length(iter))) + if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; - iter->pos += *count; - iter->len -= *count; - *count = iomap_length(iter); + iter->pos += count; + iter->len -= count; return 0; } diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 56db2dd4b10d..6cbc587c93da 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } } @@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter, switch (iter->iomap.type) { case IOMAP_HOLE: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; default: *hole_pos = iter->pos; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index a61c1dae4742..532787277b16 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_DIO_STRINGS \ - {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ - {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ - {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \ + {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6f0e6b19383c..b7cbe126faf3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); + if (!opt->blocksize) { + printk(KERN_ERR + "ISOFS: unable to set blocksize\n"); + goto out_freesbi; + } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; @@ -1515,7 +1520,7 @@ struct inode *__isofs_iget(struct super_block *sb, if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d175cccb7c55..764bba8ba999 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -265,7 +265,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; f = JFFS2_INODE_INFO(inode); @@ -373,7 +373,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) { struct iattr iattr; - if (!(inode->i_state & I_DIRTY_DATASYNC)) { + if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { jffs2_dbg(2, "%s(): not 
calling setattr() for ino #%lu\n", __func__, inode->i_ino); return; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2a4a288b821c..87ad042221e7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -26,8 +26,8 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; inode_lock(inode); - if (!(inode->i_state & I_DIRTY_ALL) || - (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + if (!(inode_state_read_once(inode) & I_DIRTY_ALL) || + (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); inode_unlock(inode); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 21f3d029da7d..4709762713ef 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -29,7 +29,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ret = diRead(inode); diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 10934f9a11be..5aaafedb8fbc 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -76,14 +76,14 @@ struct jfs_inode_info { struct { unchar _unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ - /* _inline may overflow into _inline_ea when needed */ + /* _inline_sym may overflow into _inline_ea when needed */ /* _inline_ea may overlay the last part of * file._xtroot if maxentry = XTROOTINITSLOT */ union { struct { /* 128: inline symlink */ - unchar _inline[128]; + unchar _inline_sym[128]; /* 128: inline extended attr */ unchar _inline_ea[128]; }; @@ -101,7 +101,7 @@ struct jfs_inode_info { #define i_imap u.file._imap #define i_dirtable u.dir._table #define i_dtroot u.dir._dtroot -#define i_inline u.link._inline +#define i_inline u.link._inline_sym #define i_inline_ea u.link._inline_ea #define i_inline_all u.link._inline_all diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 7840a03e5bcb..c16578af3a77 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1287,7 +1287,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. 
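The i_state conversions repeated across the filesystems in this diff (9p, affs, isofs, jffs2, jfs, kernfs, libfs, minix, ...) all follow one pattern: open-coded inode->i_state reads become inode_state_read_once(). The accessor itself is defined outside these hunks; presumably it is a READ_ONCE()-style annotation for lockless readers, roughly along these lines (exact type and signature assumed):

	static inline unsigned long inode_state_read_once(const struct inode *inode)
	{
		/* lockless snapshot; writers update i_state under the usual locks */
		return READ_ONCE(inode->i_state);
	}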
* Joern */ - if (tblk->u.ip->i_state & I_SYNC) + if (inode_state_read_once(tblk->u.ip) & I_SYNC) tblk->xflag &= ~COMMIT_LAZY; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 457f91c412d4..a36aaee98dce 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -251,7 +251,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); - if (inode && (inode->i_state & I_NEW)) + if (inode && (inode_state_read_once(inode) & I_NEW)) kernfs_init_inode(kn, inode); return inode; diff --git a/fs/libfs.c b/fs/libfs.c index ce8c496a6940..2d6657947abd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; + s->s_d_flags |= ctx->s_d_flags; root = new_inode(s); if (!root) return -ENOMEM; @@ -1542,9 +1543,9 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); @@ -1664,7 +1665,7 @@ struct inode *alloc_anon_inode(struct super_block *s) * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ - inode->i_state = I_DIRTY; + inode_state_assign_raw(inode, I_DIRTY); /* * Historically anonymous inodes don't have a type at all and * userspace has come to rely on this. @@ -2289,27 +2290,25 @@ void stashed_dentry_prune(struct dentry *dentry) cmpxchg(stashed, dentry, NULL); } -/* parent must be held exclusive */ +/** + * simple_start_creating - prepare to create a given name + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * The required lock is taken and a lookup is performed prior to creating an + * object in a directory. No permission checking is performed. + * + * Returns: a negative dentry on which vfs_create() or similar may + * be attempted, or an error. 
+ */ struct dentry *simple_start_creating(struct dentry *parent, const char *name) { - struct dentry *dentry; - struct inode *dir = d_inode(parent); + struct qstr qname = QSTR(name); + int err; - inode_lock(dir); - if (unlikely(IS_DEADDIR(dir))) { - inode_unlock(dir); - return ERR_PTR(-ENOENT); - } - dentry = lookup_noperm(&QSTR(name), parent); - if (IS_ERR(dentry)) { - inode_unlock(dir); - return dentry; - } - if (dentry->d_inode) { - dput(dentry); - inode_unlock(dir); - return ERR_PTR(-EEXIST); - } - return dentry; + err = lookup_noperm_common(&qname, parent); + if (err) + return ERR_PTR(err); + return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL); } EXPORT_SYMBOL(simple_start_creating); diff --git a/fs/lockd/netlink.c b/fs/lockd/netlink.c index 6e00b02cad90..880c42b4f8c3 100644 --- a/fs/lockd/netlink.c +++ b/fs/lockd/netlink.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/lockd.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/fs/lockd/netlink.h b/fs/lockd/netlink.h index 1920543a7955..d8408f077dd8 100644 --- a/fs/lockd/netlink.h +++ b/fs/lockd/netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/lockd.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_LOCKD_GEN_H #define _LINUX_LOCKD_GEN_H diff --git a/fs/locks.c b/fs/locks.c index 04a3f0e20724..9f565802a88c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -585,7 +585,7 @@ static const struct lease_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, int type, struct file_lease *fl) +static int lease_init(struct file *filp, unsigned int flags, int type, struct file_lease *fl) { if (assign_type(&fl->c, type) != 0) return -EINVAL; @@ -594,13 +594,13 @@ static int lease_init(struct file *filp, int type, struct file_lease *fl) fl->c.flc_pid = current->tgid; fl->c.flc_file = filp; - fl->c.flc_flags = FL_LEASE; + fl->c.flc_flags = flags; fl->fl_lmops = &lease_manager_ops; return 0; } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lease *lease_alloc(struct file *filp, int type) +static struct file_lease *lease_alloc(struct file *filp, unsigned int flags, int type) { struct file_lease *fl = locks_alloc_lease(); int error = -ENOMEM; @@ -608,7 +608,7 @@ static struct file_lease *lease_alloc(struct file *filp, int type) if (fl == NULL) return ERR_PTR(error); - error = lease_init(filp, type, fl); + error = lease_init(filp, flags, type, fl); if (error) { locks_free_lease(fl); return ERR_PTR(error); @@ -1529,29 +1529,35 @@ any_leases_conflict(struct inode *inode, struct file_lease *breaker) /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return - * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR: - * break all leases - * @type: FL_LEASE: break leases and delegations; FL_DELEG: break - * only delegations + * @flags: LEASE_BREAK_* flags * * break_lease (inlined for speed) has checked there already is at least * some kind of lock (maybe a lease) on this file. Leases are broken on - * a call to open() or truncate(). This function can sleep unless you - * specified %O_NONBLOCK to your open(). + * a call to open() or truncate(). 
This function can block waiting for the + * lease break unless you specify LEASE_BREAK_NONBLOCK. */ -int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +int __break_lease(struct inode *inode, unsigned int flags) { - int error = 0; - struct file_lock_context *ctx; struct file_lease *new_fl, *fl, *tmp; + struct file_lock_context *ctx; unsigned long break_time; - int want_write = (mode & O_ACCMODE) != O_RDONLY; + unsigned int type; LIST_HEAD(dispose); + bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY); + int error = 0; - new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (flags & LEASE_BREAK_LEASE) + type = FL_LEASE; + else if (flags & LEASE_BREAK_DELEG) + type = FL_DELEG; + else if (flags & LEASE_BREAK_LAYOUT) + type = FL_LAYOUT; + else + return -EINVAL; + + new_fl = lease_alloc(NULL, type, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - new_fl->c.flc_flags = type; /* typically we will check that ctx is non-NULL before calling */ ctx = locks_inode_context(inode); @@ -1596,7 +1602,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) if (list_empty(&ctx->flc_lease)) goto out; - if (mode & O_NONBLOCK) { + if (flags & LEASE_BREAK_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; @@ -1675,8 +1681,9 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) EXPORT_SYMBOL(lease_get_mtime); /** - * fcntl_getlease - Enquire what lease is currently active + * __fcntl_getlease - Enquire what lease is currently active * @filp: the file + * @flavor: type of lease flags to check * * The value returned by this function will be one of * (if no lease break is pending): @@ -1697,7 +1704,7 @@ EXPORT_SYMBOL(lease_get_mtime); * XXX: sfr & willy disagree over whether F_INPROGRESS * should be returned to userspace. 
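With the old mode/type pair folded into a single flags word, __break_lease() callers now spell out both what they are breaking and how. A sketch of how a call site translates (hypothetical caller; the old form is shown for contrast, and write-style breaking is the default unless LEASE_BREAK_OPEN_RDONLY is passed):

	static int myfs_break_deleg_nonblock(struct inode *inode)
	{
		/* was: __break_lease(inode, O_WRONLY | O_NONBLOCK, FL_DELEG) */
		return __break_lease(inode, LEASE_BREAK_DELEG | LEASE_BREAK_NONBLOCK);
	}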
*/ -int fcntl_getlease(struct file *filp) +static int __fcntl_getlease(struct file *filp, unsigned int flavor) { struct file_lease *fl; struct inode *inode = file_inode(filp); @@ -1713,7 +1720,8 @@ int fcntl_getlease(struct file *filp) list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file != filp) continue; - type = target_leasetype(fl); + if (fl->c.flc_flags & flavor) + type = target_leasetype(fl); break; } spin_unlock(&ctx->flc_lock); @@ -1724,6 +1732,19 @@ int fcntl_getlease(struct file *filp) return type; } +int fcntl_getlease(struct file *filp) +{ + return __fcntl_getlease(filp, FL_LEASE); +} + +int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + if (deleg->d_flags != 0 || deleg->__pad != 0) + return -EINVAL; + deleg->d_type = __fcntl_getlease(filp, FL_DELEG); + return 0; +} + /** * check_conflicting_open - see if the given file points to an inode that has * an existing open that would conflict with the @@ -1929,11 +1950,19 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { + struct inode *inode = file_inode(filp); + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -EINVAL; + switch (arg) { case F_UNLCK: return generic_delete_lease(filp, *priv); - case F_RDLCK: case F_WRLCK: + if (S_ISDIR(inode->i_mode)) + return -EINVAL; + fallthrough; + case F_RDLCK: if (!(*flp)->fl_lmops->lm_break) { WARN_ON_ONCE(1); return -ENOLCK; @@ -2018,8 +2047,6 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) return -EACCES; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; error = security_file_lock(filp, arg); if (error) return error; @@ -2027,13 +2054,13 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, unsigned int flavor, int arg) { struct file_lease *fl; struct fasync_struct *new; int error; - fl = lease_alloc(filp, arg); + fl = lease_alloc(filp, flavor, arg); if (IS_ERR(fl)) return PTR_ERR(fl); @@ -2064,9 +2091,33 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) */ int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { + if (S_ISDIR(file_inode(filp)->i_mode)) + return -EINVAL; + if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); - return do_fcntl_add_lease(fd, filp, arg); + return do_fcntl_add_lease(fd, filp, FL_LEASE, arg); +} + +/** + * fcntl_setdeleg - sets a delegation on an open file + * @fd: open file descriptor + * @filp: file pointer + * @deleg: delegation request from userland + * + * Call this fcntl to establish a delegation on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. 
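fcntl_getdeleg()/fcntl_setdeleg() imply a new userspace-visible struct delegation with d_type, d_flags and padding that must be zero. A presumed userspace sketch; the F_SETDELEG command constant and the header providing the struct are assumptions, since they are defined outside these hunks:

	#include <fcntl.h>
	#include <stdio.h>

	int request_read_delegation(int fd)
	{
		struct delegation d = {
			.d_type = F_RDLCK,	/* F_UNLCK would drop it again */
		};

		/* d_flags and the padding must be zero or the kernel returns -EINVAL */
		if (fcntl(fd, F_SETDELEG, &d) == -1) {
			perror("F_SETDELEG");
			return -1;
		}
		return 0;
	}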
+ */ +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + /* For now, no flags are supported */ + if (deleg->d_flags != 0 || deleg->__pad != 0) + return -EINVAL; + + if (deleg->d_type == F_UNLCK) + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); + return do_fcntl_add_lease(fd, filp, FL_DELEG, deleg->d_type); } /** diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 32db676127a9..51ea9bdc813f 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -26,6 +26,22 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc); static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); +void __minix_error_inode(struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "minix-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); +} + static void minix_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); @@ -589,7 +605,7 @@ struct inode *minix_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; if (INODE_VERSION(inode) == MINIX_V1) diff --git a/fs/minix/minix.h b/fs/minix/minix.h index d54273c3c9ff..2bfaf377f208 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -42,6 +42,9 @@ struct minix_sb_info { unsigned short s_version; }; +void __minix_error_inode(struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); + struct inode *minix_iget(struct super_block *, unsigned long); struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); @@ -168,4 +171,10 @@ static inline int minix_test_bit(int nr, const void *vaddr) #endif +#define minix_error_inode(inode, fmt, ...) 
\ + __minix_error_inode((inode), __func__, __LINE__, \ + (fmt), ##__VA_ARGS__) + +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + #endif /* FS_MINIX_H */ diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 8938536d8d3c..263e4ba8b1c8 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -145,6 +145,11 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) struct minix_dir_entry * de; int err; + if (inode->i_nlink == 0) { + minix_error_inode(inode, "inode has corrupted nlink"); + return -EFSCORRUPTED; + } + de = minix_find_entry(dentry, &folio); if (!de) return -ENOENT; @@ -161,15 +166,24 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) static int minix_rmdir(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); - int err = -ENOTEMPTY; + int err = -EFSCORRUPTED; - if (minix_empty_dir(inode)) { - err = minix_unlink(dir, dentry); - if (!err) { - inode_dec_link_count(dir); - inode_dec_link_count(inode); - } + if (dir->i_nlink <= 2) { + minix_error_inode(dir, "inode has corrupted nlink"); + goto out; + } + + err = -ENOTEMPTY; + if (!minix_empty_dir(inode)) + goto out; + + err = minix_unlink(dir, dentry); + if (!err) { + inode_dec_link_count(dir); + inode_dec_link_count(inode); } + +out: return err; } @@ -208,6 +222,17 @@ static int minix_rename(struct mnt_idmap *idmap, if (dir_de && !minix_empty_dir(new_inode)) goto out_dir; + err = -EFSCORRUPTED; + if (new_inode->i_nlink == 0 || (dir_de && new_inode->i_nlink != 2)) { + minix_error_inode(new_inode, "inode has corrupted nlink"); + goto out_dir; + } + + if (dir_de && old_dir->i_nlink <= 2) { + minix_error_inode(old_dir, "inode has corrupted nlink"); + goto out_dir; + } + err = -ENOENT; new_de = minix_find_entry(new_dentry, &new_folio); if (!new_de) diff --git a/fs/mount.h b/fs/mount.h index f13a28752d0b..2d28ef2a3aed 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -27,6 +27,7 @@ struct mnt_namespace { unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; refcount_t passive; /* number references not pinning @mounts */ + bool is_anon; } __randomize_layout; struct mnt_pcp { @@ -175,7 +176,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->ns.ns_id == 0; + return ns->is_anon; } static inline bool anon_ns_root(const struct mount *m) diff --git a/fs/namei.c b/fs/namei.c index 7377020a2cba..bf0f66f0e9b9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -282,7 +282,7 @@ void putname(struct filename *name) return; refcnt = atomic_read(&name->refcnt); - if (refcnt != 1) { + if (unlikely(refcnt != 1)) { if (WARN_ON_ONCE(!refcnt)) return; @@ -290,7 +290,7 @@ void putname(struct filename *name) return; } - if (name->name != name->iname) { + if (unlikely(name->name != name->iname)) { __putname(name->name); kfree(name); } else @@ -540,10 +540,13 @@ static inline int do_inode_permission(struct mnt_idmap *idmap, * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. 
*/ @@ -574,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap, if (unlikely(retval)) return retval; - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { /* * Nobody gets write access to an immutable file. */ @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap, } EXPORT_SYMBOL(inode_permission); +/* + * lookup_inode_permission_may_exec - Check traversal rights for a given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant that the inode is a directory + * + * Since the majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. + */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); +} + /** * path_get - get a reference to a path * @path: path to get the reference to @@ -746,7 +785,8 @@ static void leave_rcu(struct nameidata *nd) static void terminate_walk(struct nameidata *nd) { - drop_links(nd); + if (unlikely(nd->depth)) + drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); @@ -843,7 +883,7 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; @@ -878,7 +918,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { @@ -951,8 +991,8 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. 
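lookup_inode_permission_may_exec() above only stays on its cheap path when IOP_FASTPERM or IOP_FASTPERM_MAY_EXEC is set. A sketch of the opt-in it describes for filesystems that carry their own ->permission hook (hypothetical call site; valid only if that hook never denies plain MAY_EXEC on directories beyond the mode bits):

	static void myfs_init_dir_inode(struct inode *inode)
	{
		inode->i_op = &myfs_dir_inode_operations;
		/* safe because myfs_permission() adds no directory-exec checks */
		inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
	}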
*/ - if (!(nd->state & ND_ROOT_PRESET)) - if (!(nd->flags & LOOKUP_IS_SCOPED)) + if (likely(!(nd->state & ND_ROOT_PRESET))) + if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) @@ -1034,7 +1074,7 @@ static int nd_jump_root(struct nameidata *nd) } if (!nd->root.mnt) { int error = set_root(nd); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) { @@ -1632,13 +1672,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; + if (likely(!d_managed(dentry))) + return 0; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; - if (!try_to_unlazy_next(nd, dentry)) + if (unlikely(!try_to_unlazy_next(nd, dentry))) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); @@ -1823,7 +1865,7 @@ again: return dentry; } -static struct dentry *lookup_slow(const struct qstr *name, +static noinline struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { @@ -1855,7 +1897,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; - err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); if (likely(!err)) return 0; @@ -1870,7 +1912,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, if (err != -ECHILD) // hard error return err; - return inode_permission(idmap, nd->inode, MAY_EXEC); + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } static int reserve_stack(struct nameidata *nd, struct path *link) @@ -1901,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; -static const char *pick_link(struct nameidata *nd, struct path *link, +static noinline const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link); + int error; + if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + } else { + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + + error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); @@ -1981,14 +2033,15 @@ all_done: // pure jump * * NOTE: dentry must be what nd->next_seq had been sampled from. 
*/ -static const char *step_into(struct nameidata *nd, int flags, +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; - int err = handle_mounts(nd, dentry, &path); + int err; - if (err < 0) + err = handle_mounts(nd, dentry, &path); + if (unlikely(err < 0)) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || @@ -2010,15 +2063,32 @@ static const char *step_into(struct nameidata *nd, int flags, nd->seq = nd->next_seq; return NULL; } - if (nd->flags & LOOKUP_RCU) { - /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return pick_link(nd, &path, inode, flags); +} + +static __always_inline const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry) +{ + /* + * In the common case we are in rcu-walk and traversing over a non-mounted on + * directory (as opposed to e.g., a symlink). + * + * We can handle that and negative entries with the checks below. + */ + if (likely((nd->flags & LOOKUP_RCU) && + !d_managed(dentry) && !d_is_symlink(dentry))) { + struct inode *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); - } else { - if (path.mnt == nd->path.mnt) - mntget(path.mnt); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + nd->path.dentry = dentry; + /* nd->path.mnt is retained on purpose */ + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; } - return pick_link(nd, &path, inode, flags); + return step_into_slowpath(nd, flags, dentry); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) @@ -2101,7 +2171,7 @@ static const char *handle_dots(struct nameidata *nd, int type) if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) @@ -2131,7 +2201,7 @@ static const char *handle_dots(struct nameidata *nd, int type) return NULL; } -static const char *walk_component(struct nameidata *nd, int flags) +static __always_inline const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* @@ -2140,7 +2210,7 @@ static const char *walk_component(struct nameidata *nd, int flags) * parent relationships. 
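The step_into() fast path above hinges on d_managed(): anything mounted on, automountable or filesystem-managed bails to step_into_slowpath(), as do symlinks. For reference, d_managed() is essentially a single d_flags test (as defined in include/linux/dcache.h):

	static inline bool d_managed(const struct dentry *dentry)
	{
		/* DCACHE_MOUNTED | DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT */
		return dentry->d_flags & DCACHE_MANAGED_DENTRY;
	}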
*/ if (unlikely(nd->last_type != LAST_NORM)) { - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return handle_dots(nd, nd->last_type); } @@ -2152,7 +2222,7 @@ static const char *walk_component(struct nameidata *nd, int flags) if (IS_ERR(dentry)) return ERR_CAST(dentry); } - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return step_into(nd, flags, dentry); } @@ -2505,7 +2575,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ - if (!depth) { + if (likely(!depth)) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; @@ -2543,10 +2613,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ - if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) return ERR_PTR(-EAGAIN); - if (!*s) + if (unlikely(!*s)) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); @@ -2560,7 +2630,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (nd->state & ND_ROOT_PRESET) { + if (unlikely(nd->state & ND_ROOT_PRESET)) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2579,7 +2649,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ - if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); @@ -2632,7 +2702,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } /* For scoped-lookups we need to set the root to the dirfd as well. */ - if (flags & LOOKUP_IS_SCOPED) { + if (unlikely(flags & LOOKUP_IS_SCOPED)) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; @@ -2765,6 +2835,62 @@ static int filename_parentat(int dfd, struct filename *name, return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } +/** + * start_dirop - begin a create or remove dirop, performing locking and lookup + * @parent: the dentry of the parent in which the operation will occur + * @name: a qstr holding the name within that parent + * @lookup_flags: intent and other lookup flags. + * + * The lookup is performed and necessary locks are taken so that, on success, + * the returned dentry can be operated on safely. + * The qstr must already have the hash value calculated. + * + * Returns: a locked dentry, or an error. 
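+ *
+ * A minimal sketch of the intended calling pattern (my_dir_op() is a
+ * hypothetical stand-in for the actual create or remove operation;
+ * error handling trimmed):
+ *
+ *	struct dentry *de = start_dirop(parent, &name, LOOKUP_CREATE);
+ *	if (IS_ERR(de))
+ *		return PTR_ERR(de);
+ *	error = my_dir_op(d_inode(parent), de);
+ *	end_dirop(de);
+ *	return error;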
+ *
+ */
+static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name,
+				     unsigned int lookup_flags,
+				     unsigned int state)
+{
+	struct dentry *dentry;
+	struct inode *dir = d_inode(parent);
+
+	if (state == TASK_KILLABLE) {
+		int ret = down_write_killable_nested(&dir->i_rwsem,
+						     I_MUTEX_PARENT);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		inode_lock_nested(dir, I_MUTEX_PARENT);
+	}
+	dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
+	if (IS_ERR(dentry))
+		inode_unlock(dir);
+	return dentry;
+}
+
+struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
+			   unsigned int lookup_flags)
+{
+	return __start_dirop(parent, name, lookup_flags, TASK_NORMAL);
+}
+
+/**
+ * end_dirop - signal completion of a dirop
+ * @de: the dentry which was returned by start_dirop or similar.
+ *
+ * If @de is an error, nothing happens. Otherwise any lock taken to
+ * protect the dentry is dropped and the dentry itself is released (dput()).
+ */
+void end_dirop(struct dentry *de)
+{
+	if (!IS_ERR(de)) {
+		inode_unlock(de->d_parent->d_inode);
+		dput(de);
+	}
+}
+EXPORT_SYMBOL(end_dirop);
+
 /* does lookup, returns the object with parent locked */
 static struct dentry *__start_removing_path(int dfd, struct filename *name,
 					    struct path *path)
@@ -2781,10 +2907,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name,
 		return ERR_PTR(-EINVAL);
 	/* don't fail immediately if it's r/o, at least try to report other errors */
 	error = mnt_want_write(parent_path.mnt);
-	inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
-	d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+	d = start_dirop(parent_path.dentry, &last, 0);
 	if (IS_ERR(d))
-		goto unlock;
+		goto drop;
 	if (error)
 		goto fail;
 	path->dentry = no_free_ptr(parent_path.dentry);
@@ -2792,10 +2917,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name,
 	return d;
 
 fail:
-	dput(d);
+	end_dirop(d);
 	d = ERR_PTR(error);
-unlock:
-	inode_unlock(parent_path.dentry->d_inode);
+drop:
 	if (!error)
 		mnt_drop_write(parent_path.mnt);
 	return d;
@@ -2910,7 +3034,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
-static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
+int lookup_noperm_common(struct qstr *qname, struct dentry *base)
 {
 	const char *name = qname->name;
 	u32 len = qname->len;
@@ -3181,6 +3305,234 @@ struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
 }
 EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
 
+/**
+ * start_creating - prepare to create a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name already exists, a positive dentry is returned, so
+ * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail
+ * with -EEXIST.
+ *
+ * Returns: a negative or positive dentry, or an error.
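+ *
+ * A sketch of the expected use (assuming a caller-provided
+ * struct delegated_inode @di, as in do_mkdirat(); error handling
+ * trimmed):
+ *
+ *	dentry = start_creating(idmap, parent, &name);
+ *	if (IS_ERR(dentry))
+ *		return PTR_ERR(dentry);
+ *	dentry = vfs_mkdir(idmap, d_inode(parent), dentry, mode, &di);
+ *	end_creating(dentry);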
+ */
+struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
+			      struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, LOOKUP_CREATE);
+}
+EXPORT_SYMBOL(start_creating);
+
+/**
+ * start_removing - prepare to remove a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to find the name
+ * @name: the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
+			      struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, 0);
+}
+EXPORT_SYMBOL(start_removing);
+
+/**
+ * start_creating_killable - prepare to create a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name already exists, a positive dentry is returned.
+ *
+ * If a signal is received or was already pending, the function aborts
+ * with -EINTR.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating_killable(struct mnt_idmap *idmap,
+				       struct dentry *parent,
+				       struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(start_creating_killable);
+
+/**
+ * start_removing_killable - prepare to remove a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to find the name
+ * @name: the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * If a signal is received or was already pending, the function aborts
+ * with -EINTR.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing_killable(struct mnt_idmap *idmap,
+				       struct dentry *parent,
+				       struct qstr *name)
+{
+	int err = lookup_one_common(idmap, name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return __start_dirop(parent, name, 0, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(start_removing_killable);
+
+/**
+ * start_creating_noperm - prepare to create a given name without permission checking
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory.
+ *
+ * If the name already exists, a positive dentry is returned.
+ *
+ * Returns: a negative or positive dentry, or an error.
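+ *
+ * Sketch of a hypothetical in-kernel user that trusts its own names
+ * (error handling and delegation retry trimmed):
+ *
+ *	dentry = start_creating_noperm(parent, &name);
+ *	if (IS_ERR(dentry))
+ *		return PTR_ERR(dentry);
+ *	error = vfs_create(&nop_mnt_idmap, dentry, mode, &di);
+ *	end_creating(dentry);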
+ */
+struct dentry *start_creating_noperm(struct dentry *parent,
+				     struct qstr *name)
+{
+	int err = lookup_noperm_common(name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, LOOKUP_CREATE);
+}
+EXPORT_SYMBOL(start_creating_noperm);
+
+/**
+ * start_removing_noperm - prepare to remove a given name without permission checking
+ * @parent: directory in which to find the name
+ * @name: the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing_noperm(struct dentry *parent,
+				     struct qstr *name)
+{
+	int err = lookup_noperm_common(name, parent);
+
+	if (err)
+		return ERR_PTR(err);
+	return start_dirop(parent, name, 0);
+}
+EXPORT_SYMBOL(start_removing_noperm);
+
+/**
+ * start_creating_dentry - prepare to create a given dentry
+ * @parent: directory in which the dentry should be created
+ * @child: the dentry to be created
+ *
+ * A lock is taken to protect the dentry against other dirops and
+ * the validity of the dentry is checked: correct parent and still hashed.
+ *
+ * If the dentry is valid and negative, a reference is taken and
+ * returned. If not, an error is returned.
+ *
+ * end_creating() should be called when creation is complete, or aborted.
+ *
+ * Returns: the valid dentry, or an error.
+ */
+struct dentry *start_creating_dentry(struct dentry *parent,
+				     struct dentry *child)
+{
+	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+	if (unlikely(IS_DEADDIR(parent->d_inode) ||
+		     child->d_parent != parent ||
+		     d_unhashed(child))) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-EINVAL);
+	}
+	if (d_is_positive(child)) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-EEXIST);
+	}
+	return dget(child);
+}
+EXPORT_SYMBOL(start_creating_dentry);
+
+/**
+ * start_removing_dentry - prepare to remove a given dentry
+ * @parent: directory from which dentry should be removed
+ * @child: the dentry to be removed
+ *
+ * A lock is taken to protect the dentry against other dirops and
+ * the validity of the dentry is checked: correct parent and still hashed.
+ *
+ * If the dentry is valid and positive, a reference is taken and
+ * returned. If not, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: the valid dentry, or an error.
+ */
+struct dentry *start_removing_dentry(struct dentry *parent,
+				     struct dentry *child)
+{
+	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+	if (unlikely(IS_DEADDIR(parent->d_inode) ||
+		     child->d_parent != parent ||
+		     d_unhashed(child))) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-EINVAL);
+	}
+	if (d_is_negative(child)) {
+		inode_unlock(parent->d_inode);
+		return ERR_PTR(-ENOENT);
+	}
+	return dget(child);
+}
+EXPORT_SYMBOL(start_removing_dentry);
+
 #ifdef CONFIG_UNIX98_PTYS
 int path_pts(struct path *path)
 {
@@ -3419,6 +3771,290 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 EXPORT_SYMBOL(unlock_rename);
 
 /**
+ * __start_renaming - lookup and lock names for rename
+ * @rd: rename data containing parents and flags, and
+ *	for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *	LOOKUP_NO_SYMLINKS etc).
+ * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry and an extra ref is taken on @rd.old_parent. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs must have the hash calculated, and no permission + * checking is performed. + * + * Returns: zero or an error. + */ +static int +__start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d1, *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + trap = lock_rename(rd->old_parent, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + + d1 = lookup_one_qstr_excl(old_last, rd->old_parent, + lookup_flags); + err = PTR_ERR(d1); + if (IS_ERR(d1)) + goto out_unlock; + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_dput_d1; + + if (d1 == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = d1; + rd->new_dentry = d2; + dget(rd->old_parent); + return 0; + +out_dput_d2: + dput(d2); +out_dput_d1: + dput(d1); +out_unlock: + unlock_rename(rd->old_parent, rd->new_parent); + return err; +} + +/** + * start_renaming - lookup and lock names for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry. Also the refcount on @rd->old_parent is increased. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. 
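+ *
+ * Sketch modelled on do_renameat2() (delegation handling and error
+ * handling trimmed):
+ *
+ *	rd.old_parent = old_path.dentry;
+ *	rd.new_parent = new_path.dentry;
+ *	rd.mnt_idmap = mnt_idmap(old_path.mnt);
+ *	rd.flags = flags;
+ *	error = start_renaming(&rd, lookup_flags, &old_last, &new_last);
+ *	if (error)
+ *		return error;
+ *	error = vfs_rename(&rd);
+ *	end_renaming(&rd);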
+ */
+int start_renaming(struct renamedata *rd, int lookup_flags,
+		   struct qstr *old_last, struct qstr *new_last)
+{
+	int err;
+
+	err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent);
+	if (err)
+		return err;
+	err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
+	if (err)
+		return err;
+	return __start_renaming(rd, lookup_flags, old_last, new_last);
+}
+EXPORT_SYMBOL(start_renaming);
+
+static int
+__start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+			struct dentry *old_dentry, struct qstr *new_last)
+{
+	struct dentry *trap;
+	struct dentry *d2;
+	int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+	int err;
+
+	if (rd->flags & RENAME_EXCHANGE)
+		target_flags = 0;
+	if (rd->flags & RENAME_NOREPLACE)
+		target_flags |= LOOKUP_EXCL;
+
+	/* Already have the dentry - need to be sure to lock the correct parent */
+	trap = lock_rename_child(old_dentry, rd->new_parent);
+	if (IS_ERR(trap))
+		return PTR_ERR(trap);
+	if (d_unhashed(old_dentry) ||
+	    (rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
+		/* dentry was removed, or moved and explicit parent requested */
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
+				  lookup_flags | target_flags);
+	err = PTR_ERR(d2);
+	if (IS_ERR(d2))
+		goto out_unlock;
+
+	if (old_dentry == trap) {
+		/* source is an ancestor of target */
+		err = -EINVAL;
+		goto out_dput_d2;
+	}
+
+	if (d2 == trap) {
+		/* target is an ancestor of source */
+		if (rd->flags & RENAME_EXCHANGE)
+			err = -EINVAL;
+		else
+			err = -ENOTEMPTY;
+		goto out_dput_d2;
+	}
+
+	rd->old_dentry = dget(old_dentry);
+	rd->new_dentry = d2;
+	rd->old_parent = dget(old_dentry->d_parent);
+	return 0;
+
+out_dput_d2:
+	dput(d2);
+out_unlock:
+	unlock_rename(old_dentry->d_parent, rd->new_parent);
+	return err;
+}
+
+/**
+ * start_renaming_dentry - lookup and lock name for rename with permission checking
+ * @rd: rename data containing parents and flags, and
+ *	for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ *	LOOKUP_NO_SYMLINKS etc).
+ * @old_dentry: dentry of name to move
+ * @new_last: name of target in @rd.new_parent
+ *
+ * Look up target name and ensure locks are in place for
+ * rename.
+ *
+ * On success the found dentry is stored in @rd.new_dentry and
+ * @rd.old_parent is confirmed to be the parent of @old_dentry. If it
+ * was originally %NULL, it is set. In either case a reference is taken
+ * so that end_renaming() can have a stable reference to unlock.
+ *
+ * References and the lock can be dropped with end_renaming().
+ *
+ * The passed in qstr need not have the hash calculated, and basic
+ * eXecute permission checking is performed against @rd.mnt_idmap.
+ *
+ * Returns: zero or an error.
+ */
+int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+			  struct dentry *old_dentry, struct qstr *new_last)
+{
+	int err;
+
+	err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
+	if (err)
+		return err;
+	return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last);
+}
+EXPORT_SYMBOL(start_renaming_dentry);
+
+/**
+ * start_renaming_two_dentries - lock two dentries in given parents for rename
+ * @rd: rename data containing parents
+ * @old_dentry: dentry of name to move
+ * @new_dentry: dentry to move to
+ *
+ * Ensure locks are in place for rename and check parentage is still correct.
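+ *
+ * A sketch of the expected use (hypothetical caller already holding
+ * references to both dentries; error handling trimmed):
+ *
+ *	rd.new_parent = new_parent;
+ *	err = start_renaming_two_dentries(&rd, old_dentry, new_dentry);
+ *	if (err)
+ *		return err;
+ *	err = vfs_rename(&rd);
+ *	end_renaming(&rd);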
+ * + * On success the two dentries are stored in @rd.old_dentry and + * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to + * be the parents of the dentries. + * + * References and the lock can be dropped with end_renaming() + * + * Returns: zero or an error. + */ +int +start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct dentry *trap; + int err; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + err = -EINVAL; + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) + /* old_dentry was removed, or moved and explicit parent requested */ + goto out_unlock; + if (d_unhashed(new_dentry) || + rd->new_parent != new_dentry->d_parent) + /* new_dentry was removed or moved */ + goto out_unlock; + + if (old_dentry == trap) + /* source is an ancestor of target */ + goto out_unlock; + + if (new_dentry == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_unlock; + } + + err = -EEXIST; + if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) + goto out_unlock; + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = dget(new_dentry); + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} +EXPORT_SYMBOL(start_renaming_two_dentries); + +void end_renaming(struct renamedata *rd) +{ + unlock_rename(rd->old_parent, rd->new_parent); + dput(rd->old_dentry); + dput(rd->new_dentry); + dput(rd->old_parent); +} +EXPORT_SYMBOL(end_renaming); + +/** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode @@ -3461,10 +4097,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file - * @want_excl: whether the file must not yet exist + * @di: returns parent inode, if the inode is delegated. * * Create a new file. * @@ -3474,9 +4109,10 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
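+ *
+ * Sketch of the delegation-aware retry pattern used by callers such as
+ * do_mknodat() (error handling trimmed):
+ *
+ *	struct delegated_inode di = { };
+ * retry:
+ *	dentry = start_creating(idmap, parent, &name);
+ *	if (IS_ERR(dentry))
+ *		return PTR_ERR(dentry);
+ *	error = vfs_create(idmap, dentry, mode, &di);
+ *	end_creating(dentry);
+ *	if (is_delegated(&di)) {
+ *		error = break_deleg_wait(&di);
+ *		if (!error)
+ *			goto retry;
+ *	}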
*/ -int vfs_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool want_excl) +int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, + struct delegated_inode *di) { + struct inode *dir = d_inode(dentry->d_parent); int error; error = may_create(idmap, dir, dentry); @@ -3490,7 +4126,10 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); + error = try_break_deleg(dir, di); + if (error) + return error; + error = dir->i_op->create(idmap, dir, dentry, mode, true); if (!error) fsnotify_create(dir, dentry); return error; @@ -3697,7 +4336,7 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, - bool got_write) + bool got_write, struct delegated_inode *delegated_inode) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; @@ -3786,6 +4425,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { + /* but break the directory lease first! */ + error = try_break_deleg(dir_inode, delegated_inode); + if (error) + goto out_dput; + file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { @@ -3848,6 +4492,7 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { + struct delegated_inode delegated_inode = { }; struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; @@ -3879,7 +4524,7 @@ static const char *open_last_lookups(struct nameidata *nd, return ERR_PTR(-ECHILD); } } - +retry: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* @@ -3892,7 +4537,7 @@ static const char *open_last_lookups(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - dentry = lookup_open(nd, file, op, got_write); + dentry = lookup_open(nd, file, op, got_write, &delegated_inode); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); @@ -3907,8 +4552,16 @@ static const char *open_last_lookups(struct nameidata *nd, if (got_write) mnt_drop_write(nd->path.mnt); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (is_delegated(&delegated_inode)) { + int error = break_deleg_wait(&delegated_inode); + + if (!error) + goto retry; + return ERR_PTR(error); + } return ERR_CAST(dentry); + } if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); @@ -4036,7 +4689,7 @@ int vfs_tmpfile(struct mnt_idmap *idmap, inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); @@ -4223,21 +4876,18 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path->dentry, - reval_flag | create_flags); + dentry = start_dirop(path->dentry, &last, reval_flag 
| create_flags);
 	if (IS_ERR(dentry))
-		goto unlock;
+		goto out_drop_write;
 	if (unlikely(error))
 		goto fail;
 	return dentry;
 
 fail:
-	dput(dentry);
+	end_dirop(dentry);
 	dentry = ERR_PTR(error);
-unlock:
-	inode_unlock(path->dentry->d_inode);
+out_drop_write:
 	if (!error)
 		mnt_drop_write(path->mnt);
 out:
@@ -4256,11 +4906,20 @@ struct dentry *start_creating_path(int dfd, const char *pathname,
 }
 EXPORT_SYMBOL(start_creating_path);
 
+/**
+ * end_creating_path - finish a code section started by start_creating_path()
+ * @path: the path instantiated by start_creating_path()
+ * @dentry: the dentry returned by start_creating_path()
+ *
+ * end_creating_path() will unlock any locks taken by start_creating_path()
+ * and drop any references that were taken. It should only be called
+ * if start_creating_path() returned a non-error.
+ * If vfs_mkdir() was called and it returned an error, that error (the
+ * ERR_PTR() dentry) *should* be passed to end_creating_path() together
+ * with the path.
+ */
 void end_creating_path(const struct path *path, struct dentry *dentry)
 {
-	if (!IS_ERR(dentry))
-		dput(dentry);
-	inode_unlock(path->dentry->d_inode);
+	end_creating(dentry);
 	mnt_drop_write(path->mnt);
 	path_put(path);
 }
@@ -4278,13 +4937,15 @@ inline struct dentry *start_creating_user_path(
 }
 EXPORT_SYMBOL(start_creating_user_path);
+
 /**
  * vfs_mknod - create device node or file
- * @idmap:	idmap of the mount the inode was found from
- * @dir:	inode of the parent directory
- * @dentry:	dentry of the child device node
- * @mode:	mode of the child device node
- * @dev:	device number of device to create
+ * @idmap: idmap of the mount the inode was found from
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the child device node
+ * @mode: mode of the child device node
+ * @dev: device number of device to create
+ * @delegated_inode: returns parent inode, if the inode is delegated.
 *
 * Create a device node or file.
 *
@@ -4295,7 +4956,8 @@ EXPORT_SYMBOL(start_creating_user_path);
 * raw inode simply pass @nop_mnt_idmap.
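+ *
+ * For example, a whiteout is a character device node with mode
+ * S_IFCHR | WHITEOUT_MODE and @dev == WHITEOUT_DEV (a sketch;
+ * @delegated_inode handling as in do_mknodat()):
+ *
+ *	error = vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
+ *			  WHITEOUT_DEV, &di);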
*/ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) + struct dentry *dentry, umode_t mode, dev_t dev, + struct delegated_inode *delegated_inode) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); @@ -4319,6 +4981,10 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -4346,6 +5012,7 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { + struct delegated_inode di = { }; struct mnt_idmap *idmap; struct dentry *dentry; struct path path; @@ -4369,22 +5036,26 @@ retry: idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(idmap, path.dentry->d_inode, - dentry, mode, true); + error = vfs_create(idmap, dentry, mode, &di); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + dentry, mode, new_decode_dev(dev), &di); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, - dentry, mode, 0); + dentry, mode, 0, &di); break; } out2: end_creating_path(&path, dentry); + if (is_delegated(&di)) { + error = break_deleg_wait(&di); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4407,10 +5078,11 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory returning correct dentry if possible - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child directory - * @mode: mode of the child directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @mode: mode of the child directory + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a directory. * @@ -4427,7 +5099,8 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * In case of an error the dentry is dput() and an ERR_PTR() is returned. 
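+ *
+ * Sketch of a caller (modelled on do_mkdirat(); error handling
+ * trimmed):
+ *
+ *	dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ *			   dentry, mode, &delegated_inode);
+ *	if (IS_ERR(dentry))
+ *		error = PTR_ERR(dentry);
+ *	end_creating_path(&path, dentry);
+ *	if (is_delegated(&delegated_inode))
+ *		error = break_deleg_wait(&delegated_inode);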
*/ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct dentry *dentry, umode_t mode, + struct delegated_inode *delegated_inode) { int error; unsigned max_links = dir->i_sb->s_max_links; @@ -4450,6 +5123,10 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (max_links && dir->i_nlink >= max_links) goto err; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto err; + de = dir->i_op->mkdir(idmap, dir, dentry, mode); error = PTR_ERR(de); if (IS_ERR(de)) @@ -4462,7 +5139,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, return dentry; err: - dput(dentry); + end_creating(dentry); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); @@ -4473,6 +5150,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode) struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; + struct delegated_inode delegated_inode = { }; retry: dentry = filename_create(dfd, name, &path, lookup_flags); @@ -4484,11 +5162,16 @@ retry: mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode); + dentry, mode, &delegated_inode); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4510,9 +5193,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @delegated_inode: returns parent inode, if it's delegated. * * Remove a directory. * @@ -4523,7 +5207,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * raw inode simply pass @nop_mnt_idmap. 
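+ *
+ * Sketch (modelled on do_rmdir(); error handling trimmed):
+ *
+ *	dentry = start_dirop(parent, &last, lookup_flags);
+ *	if (IS_ERR(dentry))
+ *		return PTR_ERR(dentry);
+ *	error = vfs_rmdir(idmap, d_inode(parent), dentry, &delegated_inode);
+ *	end_dirop(dentry);
+ *	if (is_delegated(&delegated_inode))
+ *		error = break_deleg_wait(&delegated_inode);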
*/ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry) + struct dentry *dentry, struct delegated_inode *delegated_inode) { int error = may_delete(idmap, dir, dentry, 1); @@ -4545,6 +5229,10 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, if (error) goto out; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; + error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -4571,6 +5259,7 @@ int do_rmdir(int dfd, struct filename *name) struct qstr last; int type; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) @@ -4592,22 +5281,26 @@ retry: if (error) goto exit2; - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; error = security_path_rmdir(&path, dentry); if (error) goto exit4; - error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit4: - dput(dentry); + end_dirop(dentry); exit3: - inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4648,7 +5341,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, struct inode **delegated_inode) + struct dentry *dentry, struct delegated_inode *delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); @@ -4667,6 +5360,9 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; error = try_break_deleg(target, delegated_inode); if (error) goto out; @@ -4705,67 +5401,62 @@ int do_unlinkat(int dfd, struct filename *name) struct path path; struct qstr last; int type; - struct inode *inode = NULL; - struct inode *delegated_inode = NULL; + struct inode *inode; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) - goto exit1; + goto exit_putname; error = -EISDIR; if (type != LAST_NORM) - goto exit2; + goto exit_path_put; error = mnt_want_write(path.mnt); if (error) - goto exit2; + goto exit_path_put; retry_deleg: - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { + if (IS_ERR(dentry)) + goto exit_drop_write; - /* Why not before? Because we want correct error value */ - if (last.name[last.len]) - goto slashes; - inode = dentry->d_inode; - ihold(inode); - error = security_path_unlink(&path, dentry); - if (error) - goto exit3; - error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, &delegated_inode); -exit3: - dput(dentry); + /* Why not before? 
Because we want correct error value */
+	if (unlikely(last.name[last.len])) {
+		if (d_is_dir(dentry))
+			error = -EISDIR;
+		else
+			error = -ENOTDIR;
+		end_dirop(dentry);
+		goto exit_drop_write;
 	}
-	inode_unlock(path.dentry->d_inode);
-	if (inode)
-		iput(inode);	/* truncate the inode here */
-	inode = NULL;
-	if (delegated_inode) {
+	inode = dentry->d_inode;
+	ihold(inode);
+	error = security_path_unlink(&path, dentry);
+	if (error)
+		goto exit_end_dirop;
+	error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
+			   dentry, &delegated_inode);
+exit_end_dirop:
+	end_dirop(dentry);
+	iput(inode);	/* truncate the inode here */
+	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
 			goto retry_deleg;
 	}
+exit_drop_write:
 	mnt_drop_write(path.mnt);
-exit2:
+exit_path_put:
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
-		inode = NULL;
 		goto retry;
 	}
-exit1:
+exit_putname:
 	putname(name);
 	return error;
-
-slashes:
-	if (d_is_dir(dentry))
-		error = -EISDIR;
-	else
-		error = -ENOTDIR;
-	goto exit3;
 }
 
 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
@@ -4789,6 +5480,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
  * @dir: inode of the parent directory
  * @dentry: dentry of the child symlink file
  * @oldname: name of the file to link to
+ * @delegated_inode: returns parent inode, if the inode is delegated.
  *
  * Create a symlink.
  *
@@ -4799,7 +5491,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 * raw inode simply pass @nop_mnt_idmap.
 */
 int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
-		struct dentry *dentry, const char *oldname)
+		struct dentry *dentry, const char *oldname,
+		struct delegated_inode *delegated_inode)
 {
 	int error;
 
@@ -4814,6 +5507,10 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
+	error = try_break_deleg(dir, delegated_inode);
+	if (error)
+		return error;
+
 	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -4827,6 +5524,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
 	struct dentry *dentry;
 	struct path path;
 	unsigned int lookup_flags = 0;
+	struct delegated_inode delegated_inode = { };
 
 	if (IS_ERR(from)) {
 		error = PTR_ERR(from);
@@ -4841,8 +5539,13 @@ retry:
 	error = security_path_symlink(&path, dentry, from->name);
 	if (!error)
 		error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
-				    dentry, from->name);
+				    dentry, from->name, &delegated_inode);
 	end_creating_path(&path, dentry);
+	if (is_delegated(&delegated_inode)) {
+		error = break_deleg_wait(&delegated_inode);
+		if (!error)
+			goto retry;
+	}
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
@@ -4892,7 +5595,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
 */
 int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
 	     struct inode *dir, struct dentry *new_dentry,
-	     struct inode **delegated_inode)
+	     struct delegated_inode *delegated_inode)
 {
 	struct inode *inode = old_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
@@ -4931,19 +5634,21 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
 	inode_lock(inode);
 	/* Make sure we don't allow creating hardlink to an unlinked file */
-	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+	if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
 		error = -ENOENT;
 	else if (max_links && inode->i_nlink >=
max_links) error = -EMLINK; else { - error = try_break_deleg(inode, delegated_inode); + error = try_break_deleg(dir, delegated_inode); + if (!error) + error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } - if (!error && (inode->i_state & I_LINKABLE)) { + if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_LINKABLE; + inode_state_clear(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } inode_unlock(inode); @@ -4968,7 +5673,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int how = 0; int error; @@ -5012,7 +5717,7 @@ retry: new_dentry, &delegated_inode); out_dput: end_creating_path(&new_path, new_dentry); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); @@ -5098,7 +5803,7 @@ int vfs_rename(struct renamedata *rd) struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; - struct inode **delegated_inode = rd->delegated_inode; + struct delegated_inode *delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; @@ -5203,6 +5908,14 @@ int vfs_rename(struct renamedata *rd) old_dir->i_nlink >= max_links) goto out; } + error = try_break_deleg(old_dir, delegated_inode); + if (error) + goto out; + if (new_dir != old_dir) { + error = try_break_deleg(new_dir, delegated_inode); + if (error) + goto out; + } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) @@ -5256,14 +5969,11 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; - struct dentry *old_dentry, *new_dentry; - struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; - struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = - LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + struct delegated_inode delegated_inode = { }; + unsigned int lookup_flags = 0; bool should_retry = false; int error = -EINVAL; @@ -5274,11 +5984,6 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, (flags & RENAME_EXCHANGE)) goto put_names; - if (flags & RENAME_EXCHANGE) - target_flags = 0; - if (flags & RENAME_NOREPLACE) - target_flags |= LOOKUP_EXCL; - retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); @@ -5308,68 +6013,42 @@ retry: goto exit2; retry_deleg: - trap = lock_rename(new_path.dentry, old_path.dentry); - if (IS_ERR(trap)) { - error = PTR_ERR(trap); + rd.old_parent = old_path.dentry; + rd.mnt_idmap = mnt_idmap(old_path.mnt); + rd.new_parent = new_path.dentry; + rd.delegated_inode = &delegated_inode; + rd.flags = flags; + + error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); + if (error) goto exit_lock_rename; - } - old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, - lookup_flags); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; - new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | target_flags); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; if (flags & 
RENAME_EXCHANGE) { - if (!d_is_dir(new_dentry)) { + if (!d_is_dir(rd.new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!d_is_dir(old_dentry)) { + if (!d_is_dir(rd.old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) - goto exit5; + goto exit_unlock; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } - /* source should not be ancestor of target */ - error = -EINVAL; - if (old_dentry == trap) - goto exit5; - /* target should not be an ancestor of source */ - if (!(flags & RENAME_EXCHANGE)) - error = -ENOTEMPTY; - if (new_dentry == trap) - goto exit5; - error = security_path_rename(&old_path, old_dentry, - &new_path, new_dentry, flags); + error = security_path_rename(&old_path, rd.old_dentry, + &new_path, rd.new_dentry, flags); if (error) - goto exit5; + goto exit_unlock; - rd.old_parent = old_path.dentry; - rd.old_dentry = old_dentry; - rd.mnt_idmap = mnt_idmap(old_path.mnt); - rd.new_parent = new_path.dentry; - rd.new_dentry = new_dentry; - rd.delegated_inode = &delegated_inode; - rd.flags = flags; error = vfs_rename(&rd); -exit5: - dput(new_dentry); -exit4: - dput(old_dentry); -exit3: - unlock_rename(new_path.dentry, old_path.dentry); +exit_unlock: + end_renaming(&rd); exit_lock_rename: - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/namespace.c b/fs/namespace.c index d82910f33dc4..c58674a20cad 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) -{ - struct ns_common *ns; - - if (!node) - return NULL; - ns = rb_entry(node, struct ns_common, ns_tree_node); - return container_of(ns, struct mnt_namespace, ns); -} - static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ @@ -151,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns) kfree(ns); } } -DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, + if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { @@ -1345,26 +1336,12 @@ static void delayed_mntput(struct work_struct *unused) } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); -static void mntput_no_expire(struct mount *mnt) +static void noinline mntput_no_expire_slowpath(struct mount *mnt) { LIST_HEAD(list); int count; - rcu_read_lock(); - if (likely(READ_ONCE(mnt->mnt_ns))) { - /* - * Since we don't do lock_mount_hash() here, - * ->mnt_ns can change under us. However, if it's - * non-NULL, then there's a reference that won't - * be dropped until after an RCU delay done after - * turning ->mnt_ns NULL. So if we observe it - * non-NULL under rcu_read_lock(), the reference - * we are dropping is not the final one. 
- */ - mnt_add_count(mnt, -1); - rcu_read_unlock(); - return; - } + VFS_BUG_ON(mnt->mnt_ns); lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab @@ -1415,6 +1392,26 @@ static void mntput_no_expire(struct mount *mnt) cleanup_mnt(mnt); } +static void mntput_no_expire(struct mount *mnt) +{ + rcu_read_lock(); + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); + rcu_read_unlock(); + return; + } + mntput_no_expire_slowpath(mnt); +} + void mntput(struct vfsmount *mnt) { if (mnt) { @@ -3103,19 +3100,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { - int fd; - struct file *file __free(fput) = NULL; - - file = vfs_open_tree(dfd, filename, flags); - if (IS_ERR(file)) - return PTR_ERR(file); - - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - fd_install(fd, no_free_ptr(file)); - return fd; + return FD_ADD(flags, vfs_open_tree(dfd, filename, flags)); } /* @@ -4093,8 +4078,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } - if (!anon) - ns_tree_gen_id(&new_ns->ns); + ns_tree_gen_id(new_ns); + + new_ns->is_anon = anon; refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); @@ -4283,10 +4269,10 @@ static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { + struct path new_path __free(path_put) = {}; struct mnt_namespace *ns; struct fs_context *fc; - struct file *file; - struct path newmount; + struct vfsmount *new_mnt; struct mount *mnt; unsigned int mnt_flags = 0; long ret; @@ -4324,35 +4310,36 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, fc = fd_file(f)->private_data; - ret = mutex_lock_interruptible(&fc->uapi_mutex); - if (ret < 0) + ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex); + ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex); + if (ret) return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) - goto err_unlock; + return ret; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { errorfcp(fc, "VFS", "Mount too revealing"); - goto err_unlock; + return ret; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) - goto err_unlock; + return ret; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); - newmount.mnt = vfs_create_mount(fc); - if (IS_ERR(newmount.mnt)) { - ret = PTR_ERR(newmount.mnt); - goto err_unlock; - } - newmount.dentry = dget(fc->root); - newmount.mnt->mnt_flags = mnt_flags; + new_mnt = vfs_create_mount(fc); + if (IS_ERR(new_mnt)) + return PTR_ERR(new_mnt); + new_mnt->mnt_flags = mnt_flags; + + new_path.dentry = dget(fc->root); + new_path.mnt = new_mnt; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). 
We don't want to @@ -4362,38 +4349,27 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); - if (IS_ERR(ns)) { - ret = PTR_ERR(ns); - goto err_path; - } - mnt = real_mount(newmount.mnt); + if (IS_ERR(ns)) + return PTR_ERR(ns); + mnt = real_mount(new_path.mnt); ns->root = mnt; ns->nr_mounts = 1; mnt_add_to_ns(ns, mnt); - mntget(newmount.mnt); + mntget(new_path.mnt); - /* Attach to an apparent O_PATH fd with a note that we need to unmount - * it, not just simply put it. - */ - file = dentry_open(&newmount, O_PATH, fc->cred); - if (IS_ERR(file)) { - dissolve_on_fput(newmount.mnt); - ret = PTR_ERR(file); - goto err_path; + FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, + dentry_open(&new_path, O_PATH, fc->cred)); + if (fdf.err) { + dissolve_on_fput(new_path.mnt); + return fdf.err; } - file->f_mode |= FMODE_NEED_UNMOUNT; - - ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); - if (ret >= 0) - fd_install(ret, file); - else - fput(file); -err_path: - path_put(&newmount); -err_unlock: - mutex_unlock(&fc->uapi_mutex); - return ret; + /* + * Attach to an apparent O_PATH fd with a note that we + * need to unmount it, not just simply put it. + */ + fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT; + return fd_publish(fdf); } static inline int vfs_move_mount(const struct path *from_path, @@ -5035,19 +5011,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, unsigned, flags, struct mount_attr __user *, uattr, size_t, usize) { - struct file __free(fput) *file = NULL; - int fd; - if (!uattr && usize) return -EINVAL; - file = vfs_open_tree(dfd, filename, flags); - if (IS_ERR(file)) - return PTR_ERR(file); + FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags)); + if (fdf.err) + return fdf.err; if (uattr) { - int ret; struct mount_kattr kattr = {}; + struct file *file = fd_prepare_file(fdf); + int ret; if (flags & OPEN_TREE_CLONE) kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; @@ -5063,12 +5037,7 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, return ret; } - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - fd_install(fd, no_free_ptr(file)); - return fd; + return fd_publish(fdf); } int show_path(struct seq_file *m, struct dentry *root) @@ -5150,6 +5119,12 @@ static u64 mnt_to_propagation_flags(struct mount *m) return propagation; } +u64 vfsmount_to_propagation_flags(struct vfsmount *mnt) +{ + return mnt_to_propagation_flags(real_mount(mnt)); +} +EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags); + static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; @@ -5454,11 +5429,11 @@ static int statmount_string(struct kstatmount *s, u64 flag) ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: - sm->mnt_uidmap = start; + offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: - sm->mnt_gidmap = start; + offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: @@ -5736,7 +5711,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->spare != 0) + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. 
*/
 	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
 		return -EINVAL;
@@ -5753,16 +5728,14 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
 {
 	struct mnt_namespace *mnt_ns;
 
-	if (kreq->mnt_ns_id && kreq->spare)
-		return ERR_PTR(-EINVAL);
-
-	if (kreq->mnt_ns_id)
-		return lookup_mnt_ns(kreq->mnt_ns_id);
-
-	if (kreq->spare) {
+	if (kreq->mnt_ns_id) {
+		mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id);
+		if (!mnt_ns)
+			return ERR_PTR(-ENOENT);
+	} else if (kreq->mnt_ns_fd) {
 		struct ns_common *ns;
 
-		CLASS(fd, f)(kreq->spare);
+		CLASS(fd, f)(kreq->mnt_ns_fd);
 		if (fd_empty(f))
 			return ERR_PTR(-EBADF);
@@ -5774,11 +5747,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
 			return ERR_PTR(-EINVAL);
 
 		mnt_ns = to_mnt_ns(ns);
+		refcount_inc(&mnt_ns->passive);
 	} else {
 		mnt_ns = current->nsproxy->mnt_ns;
+		refcount_inc(&mnt_ns->passive);
 	}
 
-	refcount_inc(&mnt_ns->passive);
 	return mnt_ns;
 }
 
@@ -5801,8 +5775,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 		return ret;
 
 	ns = grab_requested_mnt_ns(&kreq);
-	if (!ns)
-		return -ENOENT;
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
 
 	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
 	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
@@ -5912,8 +5886,9 @@ static void __free_klistmount_free(const struct klistmount *kls)
 static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
 				     size_t nr_mnt_ids)
 {
 	u64 last_mnt_id = kreq->param;
+	struct mnt_namespace *ns;
 
 	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
 	if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5927,9 +5901,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *
 	if (!kls->kmnt_ids)
 		return -ENOMEM;
 
-	kls->ns = grab_requested_mnt_ns(kreq);
-	if (!kls->ns)
-		return -ENOENT;
+	ns = grab_requested_mnt_ns(kreq);
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
+	kls->ns = ns;
 
 	kls->mnt_parent_id = kreq->mnt_id;
 	return 0;
@@ -5985,11 +5960,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 }
 
 struct mnt_namespace init_mnt_ns = {
-	.ns.inum	= ns_init_inum(&init_mnt_ns),
-	.ns.ops		= &mntns_operations,
+	.ns		= NS_COMMON_INIT(init_mnt_ns),
 	.user_ns	= &init_user_ns,
-	.ns.__ns_ref	= REFCOUNT_INIT(1),
-	.ns.ns_type	= ns_common_type(&init_mnt_ns),
 	.passive	= REFCOUNT_INIT(1),
 	.mounts		= RB_ROOT,
 	.poll		= __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 09394ac2c180..f9d62abef2ac 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -535,7 +535,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
 	folio_unlock(folio);
 	err = filemap_fdatawrite_range(mapping, folio_pos(folio),
-				       folio_pos(folio) + folio_size(folio));
+				       folio_next_pos(folio));
 	switch (err) {
 	case 0:
 		ret = VM_FAULT_RETRY;
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 486166460e17..6df89c92b10b 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -147,10 +147,10 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
 	if (!fscache_cookie_valid(cookie))
 		return true;
 
-	if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+	if (!(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) {
 		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_PINNING_NETFS_WB)) {
-			inode->i_state |= I_PINNING_NETFS_WB;
+		if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+			inode_state_set(inode, I_PINNING_NETFS_WB);
 			need_use = true;
 		}
 		spin_unlock(&inode->i_lock);
@@ -192,7 +192,7 @@
void netfs_clear_inode_writeback(struct inode *inode, const void *aux) { struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); - if (inode->i_state & I_PINNING_NETFS_WB) { + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) { loff_t i_size = i_size_read(inode); fscache_unuse_cookie(cookie, aux, &i_size); } @@ -298,7 +298,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_dirty(folio)) return false; - end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); + end = umin(folio_next_pos(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 5c0dc4efc792..8e6264f62a8f 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -36,12 +36,12 @@ void netfs_single_mark_inode_dirty(struct inode *inode) mark_inode_dirty(inode); - if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) { + if (caching && !(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) { bool need_use = false; spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_NETFS_WB)) { - inode->i_state |= I_PINNING_NETFS_WB; + if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + inode_state_set(inode, I_PINNING_NETFS_WB); need_use = true; } spin_unlock(&inode->i_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 13ad70fc00d8..f76fe406937a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -475,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) goto out_no_inode; } - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 656976b4f42c..49ed90c6b9f2 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -615,18 +615,12 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } -static void nfs_local_call_read(struct work_struct *work) +static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *filp) { - struct nfs_local_kiocb *iocb = - container_of(work, struct nfs_local_kiocb, work); - struct file *filp = iocb->kiocb.ki_filp; - const struct cred *save_cred; bool force_done = false; ssize_t status; int n_iters; - save_cred = override_creds(filp->f_cred); - n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { if (iocb->iter_is_dio_aligned[i]) { @@ -649,8 +643,16 @@ static void nfs_local_call_read(struct work_struct *work) } } } +} - revert_creds(save_cred); +static void nfs_local_call_read(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + + scoped_with_creds(filp->f_cred) + do_nfs_local_call_read(iocb, filp); } static int @@ -820,20 +822,13 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } -static void nfs_local_call_write(struct work_struct *work) +static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb, + struct file *filp) { - struct nfs_local_kiocb *iocb = - container_of(work, struct nfs_local_kiocb, work); - struct file *filp = iocb->kiocb.ki_filp; - unsigned long old_flags = current->flags; - const struct cred *save_cred; bool force_done = false; ssize_t status; int n_iters; - current->flags |= 
PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - save_cred = override_creds(filp->f_cred); - file_start_write(filp); n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { @@ -859,7 +854,22 @@ static void nfs_local_call_write(struct work_struct *work) } file_end_write(filp); - revert_creds(save_cred); + return status; +} + +static void nfs_local_call_write(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + unsigned long old_flags = current->flags; + ssize_t status; + + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + + scoped_with_creds(filp->f_cred) + status = do_nfs_local_call_write(iocb, filp); + current->flags = old_flags; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 7f43e890d356..7317f26892c5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -431,6 +431,8 @@ void nfs42_ssc_unregister_ops(void) static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; return nfs4_proc_setlease(file, arg, lease, priv); } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 00932500fce4..9e1c48c5c0b8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -306,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { - const struct cred *saved_cred; struct key *rkey; const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(id_resolver_cache); - rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); - + scoped_with_creds(id_resolver_cache) + rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a3135b5af7ee..f157d43d1312 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -317,7 +317,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); /* Notify pnfs_destroy_layout_final() that we're done */ - if (inode->i_state & (I_FREEING | I_CLEAR)) + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a238b6725008..93798575b807 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1086,7 +1086,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct auth_domain *client, struct svc_fh *fhp, unsigned int may_flags, struct file *file, - struct nfsd_file **pnf, bool want_gc) + umode_t type, bool want_gc, struct nfsd_file **pnf) { unsigned char need = may_flags & NFSD_FILE_MAY_MASK; struct nfsd_file *new, *nf; @@ -1097,13 +1097,13 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, int ret; retry: - if (rqstp) { - status = fh_verify(rqstp, fhp, S_IFREG, + if (rqstp) + status = fh_verify(rqstp, fhp, type, may_flags|NFSD_MAY_OWNER_OVERRIDE); - } else { - status = fh_verify_local(net, cred, client, fhp, S_IFREG, + else + status = fh_verify_local(net, cred, client, fhp, type, may_flags|NFSD_MAY_OWNER_OVERRIDE); - } + if (status != nfs_ok) return status; inode = d_inode(fhp->fh_dentry); @@ -1176,15 +1176,18 @@ out: open_file: trace_nfsd_file_alloc(nf); - nf->nf_mark = nfsd_file_mark_find_or_create(inode); - if (nf->nf_mark) { + + if (type == S_IFREG) + 
nf->nf_mark = nfsd_file_mark_find_or_create(inode); + + if (type != S_IFREG || nf->nf_mark) { if (file) { get_file(file); nf->nf_file = file; status = nfs_ok; trace_nfsd_file_opened(nf, status); } else { - ret = nfsd_open_verified(fhp, may_flags, &nf->nf_file); + ret = nfsd_open_verified(fhp, type, may_flags, &nf->nf_file); if (ret == -EOPENSTALE && stale_retry) { stale_retry = false; nfsd_file_unhash(nf); @@ -1246,7 +1249,7 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, - fhp, may_flags, NULL, pnf, true); + fhp, may_flags, NULL, S_IFREG, true, pnf); } /** @@ -1271,7 +1274,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, - fhp, may_flags, NULL, pnf, false); + fhp, may_flags, NULL, S_IFREG, false, pnf); } /** @@ -1314,8 +1317,8 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, const struct cred *save_cred = get_current_cred(); __be32 beres; - beres = nfsd_file_do_acquire(NULL, net, cred, client, - fhp, may_flags, NULL, pnf, false); + beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags, + NULL, S_IFREG, false, pnf); put_cred(revert_creds(save_cred)); return beres; } @@ -1344,7 +1347,33 @@ nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file **pnf) { return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, - fhp, may_flags, file, pnf, false); + fhp, may_flags, file, S_IFREG, false, pnf); +} + +/** + * nfsd_file_acquire_dir - Get a struct nfsd_file with an open directory + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). This opens directories only, and only + * in O_RDONLY mode. + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. 
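The kernel-doc above and the hunks before it thread an S_IFMT type through nfsd_file_do_acquire() so the file cache can hold directories as well as regular files. A hedged sketch of how the acquire wrappers now differ, mirroring the argument order shown in this diff:

/* Regular file, garbage-collected cache entry (illustrative only). */
static __be32 example_acquire_reg(struct svc_rqst *rqstp, struct svc_fh *fhp,
				  unsigned int may_flags, struct nfsd_file **pnf)
{
	return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
				    fhp, may_flags, NULL, S_IFREG, true, pnf);
}

/* Directory, read-only, not garbage-collected (illustrative only). */
static __be32 example_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp,
				  struct nfsd_file **pnf)
{
	return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, fhp,
				    NFSD_MAY_READ | NFSD_MAY_64BIT_COOKIE,
				    NULL, S_IFDIR, false, pnf);
}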
+ */ +__be32 +nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, fhp, + NFSD_MAY_READ|NFSD_MAY_64BIT_COOKIE, + NULL, S_IFDIR, false, pnf); } /* diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index e3d6ca2b6030..b383dbc5b921 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -82,5 +82,7 @@ __be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, struct auth_domain *client, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf); +__be32 nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file **pnf); int nfsd_file_cache_stats_show(struct seq_file *m, void *v); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c index ca54aa583530..ac51a44e1065 100644 --- a/fs/nfsd/netlink.c +++ b/fs/nfsd/netlink.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h index 8eb903f24c41..478117ff6b8c 100644 --- a/fs/nfsd/netlink.h +++ b/fs/nfsd/netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NFSD_GEN_H #define _LINUX_NFSD_GEN_H diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b6d03e1ef5f7..42adc5461db0 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -281,14 +281,11 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - inode_lock_nested(inode, I_MUTEX_PARENT); - - child = lookup_one(&nop_mnt_idmap, - &QSTR_LEN(argp->name, argp->len), - parent); + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); - goto out; + goto out_write; } if (d_really_is_negative(child)) { @@ -344,7 +341,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = fh_fill_pre_attrs(fhp); if (status != nfs_ok) goto out; - host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode, NULL); if (host_err < 0) { status = nfserrno(host_err); goto out; @@ -367,9 +364,8 @@ set_attr: status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs); out: - inode_unlock(inode); - if (child && !IS_ERR(child)) - dput(child); + end_creating(child); +out_write: fh_drop_write(fhp); return status; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7f7e6bb23a90..b74800917583 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -264,14 +264,11 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (is_create_with_attrs(open)) nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs); - inode_lock_nested(inode, I_MUTEX_PARENT); - - child = lookup_one(&nop_mnt_idmap, - &QSTR_LEN(open->op_fname, open->op_fnamelen), - parent); + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(open->op_fname, open->op_fnamelen)); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); - goto out; + goto out_write; } if (d_really_is_negative(child)) { @@ -379,10 +376,9 @@ set_attr: if (attrs.na_aclerr) open->op_bmval[0] &= 
~FATTR4_WORD0_ACL; out: - inode_unlock(inode); + end_creating(child); nfsd_attrs_free(&attrs); - if (child && !IS_ERR(child)) - dput(child); +out_write: fh_drop_write(fhp); return status; } @@ -2342,6 +2338,13 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, union nfsd4_op_u *u) { struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + struct nfs4_delegation *dd; + struct nfsd_file *nf; + __be32 status; + + status = nfsd_file_acquire_dir(rqstp, &cstate->current_fh, &nf); + if (status != nfs_ok) + return status; /* * RFC 8881, section 18.39.3 says: @@ -2355,7 +2358,20 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, * return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this * situation. */ - gdd->gddrnf_status = GDD4_UNAVAIL; + dd = nfsd_get_dir_deleg(cstate, gdd, nf); + nfsd_file_put(nf); + if (IS_ERR(dd)) { + int err = PTR_ERR(dd); + + if (err != -EAGAIN) + return nfserrno(err); + gdd->gddrnf_status = GDD4_UNAVAIL; + return nfs_ok; + } + + gdd->gddrnf_status = GDD4_OK; + memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid)); + nfs4_put_stid(&dd->dl_stid); return nfs_ok; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e2b9472e5c78..b39d4cbdfd35 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -195,13 +195,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_creds; dir = nn->rec_file->f_path.dentry; - /* lock the parent */ - inode_lock(d_inode(dir)); - dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir); + dentry = start_creating(&nop_mnt_idmap, dir, &QSTR(dname)); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - goto out_unlock; + goto out; } if (d_really_is_positive(dentry)) /* @@ -212,15 +210,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * In the 4.0 case, we should never get here; but we may * as well be forgiving and just succeed silently. */ - goto out_put; - dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + goto out_end; + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, 0700, NULL); if (IS_ERR(dentry)) status = PTR_ERR(dentry); -out_put: - if (!status) - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); +out_end: + end_creating(dentry); +out: if (status == 0) { if (nn->in_grace) __nfsd4_create_reclaim_record_grace(clp, dname, @@ -328,20 +324,12 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %s\n", name); dir = nn->rec_file->f_path.dentry; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - goto out_unlock; - } - status = -ENOENT; - if (d_really_is_negative(dentry)) - goto out; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); -out: - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); + dentry = start_removing(&nop_mnt_idmap, dir, &QSTR(name)); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry, NULL); + end_removing(dentry); return status; } @@ -427,7 +415,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child, NULL); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8a6960500217..6791fc239dbd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7859,7 +7859,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + status = fh_verify(rqstp, &cstate->current_fh, 0, 0); + if (status) return status; status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn); @@ -9377,3 +9378,103 @@ out_status: nfs4_put_stid(&dp->dl_stid); return status; } + +/** + * nfsd_get_dir_deleg - attempt to get a directory delegation + * @cstate: compound state + * @gdd: GET_DIR_DELEGATION arg/resp structure + * @nf: nfsd_file opened on the directory + * + * Given a GET_DIR_DELEGATION request @gdd, attempt to acquire a delegation + * on the directory to which @nf refers. Note that this does not set up any + * sort of async notifications for the delegation. + */ +struct nfs4_delegation * +nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, + struct nfsd4_get_dir_delegation *gdd, + struct nfsd_file *nf) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_delegation *dp; + struct file_lease *fl; + struct nfs4_file *fp, *rfp; + int status = 0; + + fp = nfsd4_alloc_file(); + if (!fp) + return ERR_PTR(-ENOMEM); + + nfsd4_file_init(&cstate->current_fh, fp); + + rfp = nfsd4_file_hash_insert(fp, &cstate->current_fh); + if (unlikely(!rfp)) { + put_nfs4_file(fp); + return ERR_PTR(-ENOMEM); + } + + if (rfp != fp) { + put_nfs4_file(fp); + fp = rfp; + } + + /* if this client already has one, return that it's unavailable */ + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + /* existing delegation? */ + if (nfs4_delegation_exists(clp, fp)) { + status = -EAGAIN; + } else if (!fp->fi_deleg_file) { + fp->fi_deleg_file = nfsd_file_get(nf); + fp->fi_delegees = 1; + } else { + ++fp->fi_delegees; + } + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + + if (status) { + put_nfs4_file(fp); + return ERR_PTR(status); + } + + /* Try to set up the lease */ + status = -ENOMEM; + dp = alloc_init_deleg(clp, fp, NULL, NFS4_OPEN_DELEGATE_READ); + if (!dp) + goto out_delegees; + + fl = nfs4_alloc_init_lease(dp); + if (!fl) + goto out_put_stid; + + status = kernel_setlease(nf->nf_file, + fl->c.flc_type, &fl, NULL); + if (fl) + locks_free_lease(fl); + if (status) + goto out_put_stid; + + /* + * Now, try to hash it. 
This can fail if we race another nfsd task + * trying to set a delegation on the same file. If that happens, + * then just say UNAVAIL. + */ + spin_lock(&state_lock); + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + status = hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); + spin_unlock(&state_lock); + + if (!status) + return dp; + + /* Something failed. Drop the lease and clean up the stid */ + kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); +out_put_stid: + nfs4_put_stid(&dp->dl_stid); +out_delegees: + put_deleg_file(fp); + return ERR_PTR(status); +} diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 8f71f5748c75..481e789a7697 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -306,18 +306,16 @@ nfsd_proc_create(struct svc_rqst *rqstp) goto done; } - inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len), - dirfhp->fh_dentry); + dchild = start_creating(&nop_mnt_idmap, dirfhp->fh_dentry, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(dchild)) { resp->status = nfserrno(PTR_ERR(dchild)); - goto out_unlock; + goto out_write; } fh_init(newfhp, NFS_FHSIZE); resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); if (!resp->status && d_really_is_negative(dchild)) resp->status = nfserr_noent; - dput(dchild); if (resp->status) { if (resp->status != nfserr_noent) goto out_unlock; @@ -409,6 +407,9 @@ nfsd_proc_create(struct svc_rqst *rqstp) /* File doesn't exist. Create it and set attrs */ resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type, rdev, newfhp); + /* nfsd_create_locked() unlocked the parent */ + dput(dchild); + goto out_write; } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); @@ -423,7 +424,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) } out_unlock: - inode_unlock(dirfhp->fh_dentry->d_inode); + end_creating(dchild); +out_write: fh_drop_write(dirfhp); done: fh_put(dirfhp); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1e736f402426..b052c1effdc5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -867,4 +867,9 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_delegation **pdp); + +struct nfsd4_get_dir_delegation; +struct nfs4_delegation *nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, + struct nfsd4_get_dir_delegation *gdd, + struct nfsd_file *nf); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9cb20d4aeab1..31cbf46b47b1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -959,15 +959,16 @@ retry: /** * nfsd_open_verified - Open a regular file for the filecache * @fhp: NFS filehandle of the file to open + * @type: S_IFMT inode type allowed (0 means any type is allowed) * @may_flags: internal permission flags * @filp: OUT: open "struct file *" * * Returns zero on success, or a negative errno value. 
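nfsd_get_dir_deleg() earlier in this hunk follows a common build-then-publish shape: allocate optimistically, take the lease, attempt the final hash under the lock hierarchy (state_lock -> cl_lock -> fi_lock), and unwind in reverse order if the hash loses a race. A condensed sketch of that control flow; every helper name here is hypothetical:

struct deleg;	/* hypothetical type standing in for the delegation */

static struct deleg *example_acquire_deleg(struct file *filp)
{
	struct deleg *d;
	int err;

	d = alloc_deleg();			/* 1. allocate (hypothetical) */
	if (!d)
		return ERR_PTR(-ENOMEM);

	err = take_lease(filp, d);		/* 2. external resource (hypothetical) */
	if (err)
		goto out_free;

	err = hash_deleg_locked(d);		/* 3. publish under locks (hypothetical) */
	if (err)
		goto out_unlease;		/* lost the race: undo the lease */

	return d;

out_unlease:
	drop_lease(filp, d);
out_free:
	free_deleg(d);
	return ERR_PTR(err);
}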
*/ int -nfsd_open_verified(struct svc_fh *fhp, int may_flags, struct file **filp) +nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { - return __nfsd_open(fhp, S_IFREG, may_flags, filp); + return __nfsd_open(fhp, type, may_flags, filp); } /* @@ -1159,7 +1160,7 @@ static int wait_for_concurrent_writes(struct file *file) dprintk("nfsd: write resume %d\n", task_pid_nr(current)); } - if (inode->i_state & I_DIRTY) { + if (inode_state_read_once(inode) & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); err = vfs_fsync(file, 0); } @@ -1521,7 +1522,7 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* The parent directory should already be locked: */ +/* The parent directory should already be locked - we will unlock */ __be32 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_attrs *attrs, @@ -1552,13 +1553,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&nop_mnt_idmap, dirp, dchild, - iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode, NULL); if (!host_err) nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, NULL); if (IS_ERR(dchild)) { host_err = PTR_ERR(dchild); } else if (d_is_negative(dchild)) { @@ -1574,7 +1574,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFIFO: case S_IFSOCK: host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, - iap->ia_mode, rdev); + iap->ia_mode, rdev, NULL); break; default: printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", @@ -1587,8 +1587,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); out: - if (!IS_ERR(dchild)) - dput(dchild); + if (!err) + fh_fill_post_attrs(fhp); + end_creating(dchild); return err; out_nfserr: @@ -1626,28 +1627,24 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + dchild = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) { - err = nfserrno(host_err); - goto out_unlock; - } + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - /* - * We unconditionally drop our ref to dchild as fh_compose will have - * already grabbed its own ref for it. 
- */ - dput(dchild); if (err) goto out_unlock; err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); - fh_fill_post_attrs(fhp); + /* nfsd_create_locked() unlocked the parent */ + dput(dchild); + return err; + out_unlock: - inode_unlock(dentry->d_inode); + end_creating(dchild); return err; } @@ -1733,28 +1730,26 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, } dentry = fhp->fh_dentry; - inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + dnew = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); if (IS_ERR(dnew)) { err = nfserrno(PTR_ERR(dnew)); - inode_unlock(dentry->d_inode); goto out_drop_write; } err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; - host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path, NULL); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); out_unlock: - inode_unlock(dentry->d_inode); + end_creating(dnew); if (!err) err = nfserrno(commit_metadata(fhp)); - dput(dnew); - if (err==0) err = cerr; + if (!err) + err = cerr; out_drop_write: fh_drop_write(fhp); out: @@ -1809,32 +1804,31 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, ddir = ffhp->fh_dentry; dirp = d_inode(ddir); - inode_lock_nested(dirp, I_MUTEX_PARENT); + dnew = start_creating(&nop_mnt_idmap, ddir, &QSTR_LEN(name, len)); - dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir); if (IS_ERR(dnew)) { host_err = PTR_ERR(dnew); - goto out_unlock; + goto out_drop_write; } dold = tfhp->fh_dentry; err = nfserr_noent; if (d_really_is_negative(dold)) - goto out_dput; + goto out_unlock; err = fh_fill_pre_attrs(ffhp); if (err != nfs_ok) - goto out_dput; + goto out_unlock; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); - inode_unlock(dirp); +out_unlock: + end_creating(dnew); if (!host_err) { host_err = commit_metadata(ffhp); if (!host_err) host_err = commit_metadata(tfhp); } - dput(dnew); out_drop_write: fh_drop_write(tfhp); if (host_err == -EBUSY) { @@ -1849,12 +1843,6 @@ out_drop_write: } out: return err != nfs_ok ? 
err : nfserrno(host_err); - -out_dput: - dput(dnew); -out_unlock: - inode_unlock(dirp); - goto out_drop_write; } static void @@ -1895,11 +1883,12 @@ __be32 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { - struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct dentry *fdentry, *tdentry; int type = S_IFDIR; + struct renamedata rd = {}; __be32 err; int host_err; - bool close_cached = false; + struct dentry *close_cached; trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); @@ -1925,15 +1914,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; retry: + close_cached = NULL; host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); goto out; } - trap = lock_rename(tdentry, fdentry); - if (IS_ERR(trap)) { - err = nfserr_xdev; + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = fdentry; + rd.new_parent = tdentry; + + host_err = start_renaming(&rd, 0, &QSTR_LEN(fname, flen), + &QSTR_LEN(tname, tlen)); + + if (host_err) { + err = nfserrno(host_err); goto out_want_write; } err = fh_fill_pre_attrs(ffhp); @@ -1943,48 +1939,23 @@ retry: if (err != nfs_ok) goto out_unlock; - odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry); - host_err = PTR_ERR(odentry); - if (IS_ERR(odentry)) - goto out_nfserr; + type = d_inode(rd.old_dentry)->i_mode & S_IFMT; + + if (d_inode(rd.new_dentry)) + type = d_inode(rd.new_dentry)->i_mode & S_IFMT; - host_err = -ENOENT; - if (d_really_is_negative(odentry)) - goto out_dput_old; - host_err = -EINVAL; - if (odentry == trap) - goto out_dput_old; - type = d_inode(odentry)->i_mode & S_IFMT; - - ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry); - host_err = PTR_ERR(ndentry); - if (IS_ERR(ndentry)) - goto out_dput_old; - if (d_inode(ndentry)) - type = d_inode(ndentry)->i_mode & S_IFMT; - host_err = -ENOTEMPTY; - if (ndentry == trap) - goto out_dput_new; - - if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && - nfsd_has_cached_files(ndentry)) { - close_cached = true; - goto out_dput_old; + if ((rd.new_dentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(rd.new_dentry)) { + close_cached = dget(rd.new_dentry); + goto out_unlock; } else { - struct renamedata rd = { - .mnt_idmap = &nop_mnt_idmap, - .old_parent = fdentry, - .old_dentry = odentry, - .new_parent = tdentry, - .new_dentry = ndentry, - }; int retries; for (retries = 1;;) { host_err = vfs_rename(&rd); if (host_err != -EAGAIN || !retries--) break; - if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry))) + if (!nfsd_wait_for_delegreturn(rqstp, d_inode(rd.old_dentry))) break; } if (!host_err) { @@ -1993,11 +1964,6 @@ retry: host_err = commit_metadata(ffhp); } } - out_dput_new: - dput(ndentry); - out_dput_old: - dput(odentry); - out_nfserr: if (host_err == -EBUSY) { /* * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME @@ -2016,7 +1982,7 @@ retry: fh_fill_post_attrs(tfhp); } out_unlock: - unlock_rename(tdentry, fdentry); + end_renaming(&rd); out_want_write: fh_drop_write(ffhp); @@ -2027,9 +1993,8 @@ out_want_write: * until this point and then reattempt the whole shebang. 
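The close_cached handling above is a drop-locks-and-retry loop: when the rename target has cached open files on an EXPORT_OP_CLOSE_BEFORE_UNLINK export, nfsd pins the dentry, backs all the way out of the rename locks, closes the files, and restarts from scratch. A minimal sketch of the idiom, with hypothetical names throughout:

static int example_rename_with_retry(struct example_ctx *c)	/* hypothetical */
{
	struct dentry *busy;
	int err;

retry:
	busy = NULL;
	err = lock_everything(c);		/* hypothetical */
	if (err)
		return err;

	if (target_has_cached_files(c)) {	/* hypothetical */
		busy = dget(c->target);		/* pin across unlock */
		unlock_everything(c);
		close_cached_files(busy);	/* may sleep and do I/O */
		dput(busy);
		goto retry;			/* the world may have changed */
	}

	err = do_the_rename(c);			/* hypothetical */
	unlock_everything(c);
	return err;
}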
*/ if (close_cached) { - close_cached = false; - nfsd_close_cached_files(ndentry); - dput(ndentry); + nfsd_close_cached_files(close_cached); + dput(close_cached); goto retry; } out: @@ -2054,7 +2019,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, { struct dentry *dentry, *rdentry; struct inode *dirp; - struct inode *rinode; + struct inode *rinode = NULL; __be32 err; int host_err; @@ -2073,24 +2038,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dentry = fhp->fh_dentry; dirp = d_inode(dentry); - inode_lock_nested(dirp, I_MUTEX_PARENT); - rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + rdentry = start_removing(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); + host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) - goto out_unlock; + goto out_drop_write; - if (d_really_is_negative(rdentry)) { - dput(rdentry); - host_err = -ENOENT; - goto out_unlock; - } - rinode = d_inode(rdentry); err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; + rinode = d_inode(rdentry); + /* Prevent truncation until after locks dropped */ ihold(rinode); + if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; @@ -2108,14 +2070,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, break; } } else { - host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry); + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry, NULL); } fh_fill_post_attrs(fhp); - inode_unlock(dirp); - if (!host_err) +out_unlock: + end_removing(rdentry); + if (!err && !host_err) host_err = commit_metadata(fhp); - dput(rdentry); iput(rinode); /* truncate the inode here */ out_drop_write: @@ -2133,9 +2095,6 @@ out_nfserr: } out: return err != nfs_ok ? err : nfserrno(host_err); -out_unlock: - inode_unlock(dirp); - goto out_drop_write; } /* diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 0c0292611c6d..09de48c50cbe 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -114,7 +114,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -int nfsd_open_verified(struct svc_fh *fhp, int may_flags, +int nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index bcc7d76269ac..4bbdc832d7f2 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -1148,7 +1148,7 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; - if (!(cpfile->i_state & I_NEW)) + if (!(inode_state_read_once(cpfile) & I_NEW)) goto out; err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index c664daba56ae..674380837ab9 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -506,7 +506,7 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; - if (!(dat->i_state & I_NEW)) + if (!(inode_state_read_once(dat) & I_NEW)) goto out; err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index c4cd4a4dedd0..99eb8a59009e 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -188,7 +188,7 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, ifile = nilfs_iget_locked(sb, root, 
NILFS_IFILE_INO); if (unlikely(!ifile)) return -ENOMEM; - if (!(ifile->i_state & I_NEW)) + if (!(inode_state_read_once(ifile) & I_NEW)) goto out; err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 87ddde159f0c..51bde45d5865 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -365,7 +365,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) failed_after_creation: clear_nlink(inode); - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); iput(inode); /* * raw_inode will be deleted through @@ -562,7 +562,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (!inode->i_nlink) { iput(inode); return ERR_PTR(-ESTALE); @@ -591,7 +591,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = nilfs_init_gcinode(inode); @@ -631,7 +631,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode) nilfs_iget_set, &args); if (unlikely(!btnc_inode)) return -ENOMEM; - if (btnc_inode->i_state & I_NEW) { + if (inode_state_read_once(btnc_inode) & I_NEW) { nilfs_init_btnc_inode(btnc_inode); unlock_new_inode(btnc_inode); } @@ -686,7 +686,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) nilfs_iget_set, &args); if (unlikely(!s_inode)) return ERR_PTR(-ENOMEM); - if (!(s_inode->i_state & I_NEW)) + if (!(inode_state_read_once(s_inode) & I_NEW)) return inode; NILFS_I(s_inode)->i_flags = 0; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f466daa39440..b7e3d91b6243 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -14,6 +14,7 @@ #include <linux/buffer_head.h> #include <linux/spinlock.h> #include <linux/blkdev.h> +#include <linux/fs_struct.h> #include <linux/nilfs2_api.h> #include <linux/nilfs2_ondisk.h> #include "the_nilfs.h" diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 330f269abedf..83f93337c01b 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1226,7 +1226,7 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; - if (!(sufile->i_state & I_NEW)) + if (!(inode_state_read_once(sufile) & I_NEW)) goto out; err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1dadda82cae5..d0b9b984002f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1597,16 +1597,20 @@ static struct hlist_head *fanotify_alloc_merge_hash(void) return hash; } +DEFINE_CLASS(fsnotify_group, + struct fsnotify_group *, + if (!IS_ERR_OR_NULL(_T)) fsnotify_destroy_group(_T), + fsnotify_alloc_group(ops, flags), + const struct fsnotify_ops *ops, int flags) + /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct user_namespace *user_ns = current_user_ns(); - struct fsnotify_group *group; int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; - struct file *file; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, 
flags, event_f_flags); @@ -1690,36 +1694,29 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & FAN_NONBLOCK) f_flags |= O_NONBLOCK; - /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ - group = fsnotify_alloc_group(&fanotify_fsnotify_ops, + CLASS(fsnotify_group, group)(&fanotify_fsnotify_ops, FSNOTIFY_GROUP_USER); - if (IS_ERR(group)) { + /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ + if (IS_ERR(group)) return PTR_ERR(group); - } /* Enforce groups limits per user in all containing user ns */ group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_FANOTIFY_GROUPS); - if (!group->fanotify_data.ucounts) { - fd = -EMFILE; - goto out_destroy_group; - } + if (!group->fanotify_data.ucounts) + return -EMFILE; group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); group->user_ns = get_user_ns(user_ns); group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); - if (!group->fanotify_data.merge_hash) { - fd = -ENOMEM; - goto out_destroy_group; - } + if (!group->fanotify_data.merge_hash) + return -ENOMEM; group->overflow_event = fanotify_alloc_overflow_event(); - if (unlikely(!group->overflow_event)) { - fd = -ENOMEM; - goto out_destroy_group; - } + if (unlikely(!group->overflow_event)) + return -ENOMEM; if (force_o_largefile()) event_f_flags |= O_LARGEFILE; @@ -1738,8 +1735,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: - fd = -EINVAL; - goto out_destroy_group; + return -EINVAL; } BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE)); @@ -1750,27 +1746,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } if (flags & FAN_ENABLE_AUDIT) { - fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) - goto out_destroy_group; - } - - fd = get_unused_fd_flags(f_flags); - if (fd < 0) - goto out_destroy_group; - - file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, - f_flags, FMODE_NONOTIFY); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto out_destroy_group; + return -EPERM; } - fd_install(fd, file); - return fd; -out_destroy_group: - fsnotify_destroy_group(group); + fd = FD_ADD(f_flags, + anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, + group, f_flags, FMODE_NONOTIFY)); + if (fd >= 0) + retain_and_null_ptr(group); return fd; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 46bfc543f946..d27ff5e5f165 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,7 +52,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * the inode cannot have any associated watches. 
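The comment above states the invariant behind the guard in the hunk that follows: during a superblock inode walk, an inode flagged I_FREEING, I_WILL_FREE, or I_NEW must be skipped under i_lock. A sketch of just that test, assuming inode_state_read() requires i_lock as this series uses it (the real walk also elevates the refcount before dropping the lock; only the test is shown):

static bool example_inode_usable(struct inode *inode)
{
	bool usable;

	spin_lock(&inode->i_lock);
	/* I_NEW: not yet initialised; I_FREEING/I_WILL_FREE: dying. */
	usable = !(inode_state_read(inode) &
		   (I_FREEING | I_WILL_FREE | I_NEW));
	spin_unlock(&inode->i_lock);

	return usable;
}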
*/ spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/nsfs.c b/fs/nsfs.c index 79b026a36fb6..bf27d5da91f1 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = { static void nsfs_evict(struct inode *inode) { struct ns_common *ns = inode->i_private; + + __ns_ref_active_put(ns); clear_inode(inode); ns->ops->put(ns); } @@ -108,7 +110,6 @@ int ns_get_path(struct path *path, struct task_struct *task, int open_namespace(struct ns_common *ns) { struct path path __free(path_put) = {}; - struct file *f; int err; /* call first to consume reference */ @@ -116,16 +117,7 @@ int open_namespace(struct ns_common *ns) if (err < 0) return err; - CLASS(get_unused_fd, fd)(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&path, O_RDONLY, current_cred()); - if (IS_ERR(f)) - return PTR_ERR(f); - - fd_install(fd, f); - return take_fd(fd); + return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); } int open_related_ns(struct ns_common *ns, @@ -311,7 +303,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info kinfo = {}; struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; struct path path __free(path_put) = {}; - struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); if (ns->ns_type != CLONE_NEWNS) @@ -330,28 +321,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (ret) return ret; - CLASS(get_unused_fd, fd)(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&path, O_RDONLY, current_cred()); - if (IS_ERR(f)) - return PTR_ERR(f); - - if (uinfo) { - /* - * If @uinfo is passed return all information about the - * mount namespace as well. - */ - ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); - if (ret) - return ret; - } - - /* Transfer reference of @f to caller's fdtable. */ - fd_install(fd, no_free_ptr(f)); - /* File descriptor is live so hand it off to the caller. */ - return take_fd(fd); + FD_PREPARE(fdf, O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); + if (fdf.err) + return fdf.err; + /* + * If @uinfo is passed return all information about the + * mount namespace as well. + */ + ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); + if (ret) + return ret; + ret = fd_publish(fdf); + break; } default: ret = -ENOTTY; @@ -408,6 +389,7 @@ static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, .show_path = nsfs_show_path, + .drop_inode = inode_just_drop, }; static int nsfs_init_inode(struct inode *inode, void *data) @@ -418,6 +400,16 @@ static int nsfs_init_inode(struct inode *inode, void *data) inode->i_mode |= S_IRUGO; inode->i_fop = &ns_file_operations; inode->i_ino = ns->inum; + + /* + * Bring the namespace subtree back to life if we have to. This + * can happen when e.g., all processes using a network namespace + * and all namespace files or namespace file bind-mounts have + * died but there are still sockets pinning it. The SIOCGSKNS + * ioctl on such a socket will resurrect the relevant namespace + * subtree. 
+ */ + __ns_ref_active_get(ns); return 0; } @@ -458,6 +450,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return FILEID_NSFS; } +bool is_current_namespace(struct ns_common *ns) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + return current_in_namespace(to_cg_ns(ns)); +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + return current_in_namespace(to_ipc_ns(ns)); +#endif + case CLONE_NEWNS: + return current_in_namespace(to_mnt_ns(ns)); +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + return current_in_namespace(to_net_ns(ns)); +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + return current_in_namespace(to_pid_ns(ns)); +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + return current_in_namespace(to_time_ns(ns)); +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + return current_in_namespace(to_user_ns(ns)); +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + return current_in_namespace(to_uts_ns(ns)); +#endif + default: + VFS_WARN_ON_ONCE(true); + return false; + } +} + static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { @@ -483,18 +514,35 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; } + if (!fid->ns_id) + return NULL; + /* Either both are set or both are unset. */ + if (!fid->ns_inum != !fid->ns_type) + return NULL; + scoped_guard(rcu) { ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); if (!ns) return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); - if (ns->inum != fid->ns_inum) + if (fid->ns_inum && (fid->ns_inum != ns->inum)) + return NULL; + if (fid->ns_type && (fid->ns_type != ns->ns_type)) return NULL; - if (!__ns_ref_get(ns)) + /* + * This is racy because we're not actually taking an + * active reference. IOW, it could happen that the + * namespace becomes inactive after this check. + * We don't care because nsfs_init_inode() will just + * resurrect the relevant namespace tree for us. If it + * has been active here we just allow its resurrection. + * We could try to take an active reference here and + * then drop it again. But really, why bother.
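The "!fid->ns_inum != !fid->ns_type" test added above is the standard C idiom for "both set or both unset": logical negation normalises each value to 0 or 1, so the inequality is true exactly when one field is zero and the other is not. In isolation:

#include <stdbool.h>

/* Illustration only: true iff exactly one of a, b is non-zero. */
static bool exactly_one_set(unsigned long a, unsigned long b)
{
	return !a != !b;
}

/* exactly_one_set(0, 0) == false  -- neither set: accepted */
/* exactly_one_set(3, 7) == false  -- both set:    accepted */
/* exactly_one_set(0, 7) == true   -- mismatch:    rejected */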
+ */ + if (!ns_get_unless_inactive(ns)) return NULL; } @@ -590,6 +638,8 @@ static int nsfs_init_fs_context(struct fs_context *fc) struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; + fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; + ctx->s_d_flags |= DCACHE_DONTCACHE; ctx->ops = &nsfs_ops; ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; @@ -612,3 +662,27 @@ void __init nsfs_init(void) nsfs_root_path.mnt = nsfs_mnt; nsfs_root_path.dentry = nsfs_mnt->mnt_root; } + +void nsproxy_ns_active_get(struct nsproxy *ns) +{ + ns_ref_active_get(ns->mnt_ns); + ns_ref_active_get(ns->uts_ns); + ns_ref_active_get(ns->ipc_ns); + ns_ref_active_get(ns->pid_ns_for_children); + ns_ref_active_get(ns->cgroup_ns); + ns_ref_active_get(ns->net_ns); + ns_ref_active_get(ns->time_ns); + ns_ref_active_get(ns->time_ns_for_children); +} + +void nsproxy_ns_active_put(struct nsproxy *ns) +{ + ns_ref_active_put(ns->mnt_ns); + ns_ref_active_put(ns->uts_ns); + ns_ref_active_put(ns->ipc_ns); + ns_ref_active_put(ns->pid_ns_for_children); + ns_ref_active_put(ns->cgroup_ns); + ns_ref_active_put(ns->net_ns); + ns_ref_active_put(ns->time_ns); + ns_ref_active_put(ns->time_ns_for_children); +} diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 3959f23c487a..08266adc42ba 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -537,7 +537,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, return ERR_PTR(-ENOMEM); /* If this is a freshly allocated inode, need to read it now. */ - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) inode = ntfs_read_mft(inode, name, ref); else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { /* diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index ddff94c091b8..8d09dfec970a 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -51,6 +51,7 @@ #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/log2.h> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 62464d194da3..af1e2cedb217 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/fs_struct.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 162711cc5b20..b267ec580da9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6892,7 +6892,7 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, &phys); - start = folio_next_index(folio) << PAGE_SHIFT; + start = folio_next_pos(folio); } out: if (folios) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b05d4e9d13b2..79b281e32f4c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1615,7 +1615,7 @@ static void o2net_start_connect(struct work_struct *work) myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; myaddr.sin_port = htons(0); /* any port */ - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr, sizeof(myaddr)); if (ret) { mlog(ML_ERROR, "bind failed with %d at address %pI4\n", @@ -1638,7 +1638,7 @@ static void o2net_start_connect(struct work_struct *work) remoteaddr.sin_port = node->nd_ipv4_port; ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, + (struct sockaddr_unsized *)&remoteaddr, sizeof(remoteaddr), O_NONBLOCK); 
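The ocfs2 cluster hunks here and below replace (struct sockaddr *) casts with (struct sockaddr_unsized *) at the kernel socket-ops boundary, matching the new ops->bind()/ops->connect() prototypes this diff assumes. A hedged sketch of the bind call shape after the change:

/* Illustration of binding a kernel socket to an arbitrary local port. */
static int example_bind_any_port(struct socket *sock, __be32 addr)
{
	struct sockaddr_in sin = {
		.sin_family	= AF_INET,
		.sin_addr	= { .s_addr = addr },
		.sin_port	= htons(0),	/* any port */
	};

	return sock->ops->bind(sock, (struct sockaddr_unsized *)&sin,
			       sizeof(sin));
}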
if (ret == -EINPROGRESS) @@ -2002,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) INIT_WORK(&o2net_listen_work, o2net_accept_many); sock->sk->sk_reuse = SK_CAN_REUSE; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (ret < 0) { printk(KERN_ERR "o2net: Error %d while binding socket at " "%pI4:%u\n", ret, &addr, ntohs(port)); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 92a6149da9c1..619ff03b15d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2487,7 +2487,7 @@ update: * which hasn't been populated yet, so clear the refresh flag * and let the caller handle it. */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { status = 0; if (lockres) ocfs2_complete_lock_res_refresh(lockres, 0); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index fcc89856ab95..78f81950c9ee 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -152,8 +152,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, mlog_errno(PTR_ERR(inode)); goto bail; } - trace_ocfs2_iget5_locked(inode->i_state); - if (inode->i_state & I_NEW) { + trace_ocfs2_iget5_locked(inode_state_read_once(inode)); + if (inode_state_read_once(inode) & I_NEW) { rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } @@ -1290,6 +1290,8 @@ static void ocfs2_clear_inode(struct inode *inode) void ocfs2_evict_inode(struct inode *inode) { + write_inode_now(inode, 1); + if (!inode->i_nlink || (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); @@ -1299,27 +1301,6 @@ void ocfs2_evict_inode(struct inode *inode) ocfs2_clear_inode(inode); } -/* Called under inode_lock, with no more references on the - * struct inode, so it's safe here to check the flags field - * and to manipulate i_nlink without any other locks. */ -int ocfs2_drop_inode(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, - inode->i_nlink, oi->ip_flags); - - assert_spin_locked(&inode->i_lock); - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode->i_lock); - write_inode_now(inode, 1); - spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_NEW); - inode->i_state &= ~I_WILL_FREE; - - return 1; -} - /* * This is called from our getattr. 
*/ diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index accf03d4765e..07bd838e7843 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -116,7 +116,6 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) } void ocfs2_evict_inode(struct inode *inode); -int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5f58ff2175f..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -902,15 +902,8 @@ bail: static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 54ed1495de9a..4b32fb5658ad 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1569,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); -DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); - TRACE_EVENT(ocfs2_inode_revalidate, TP_PROTO(void *inode, unsigned long long ino, unsigned int flags), diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 53daa4482406..2c7ba1480f7a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -129,7 +129,7 @@ static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, .free_inode = ocfs2_free_inode, - .drop_inode = ocfs2_drop_inode, + .drop_inode = inode_just_drop, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 135c49c5d848..701ed85d9831 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -14,6 +14,7 @@ #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/crc-itu-t.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "omfs.h" @@ -212,7 +213,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; bh = omfs_bread(inode->i_sb, ino); diff --git a/fs/open.c b/fs/open.c index 3d64372ecc67..f328622061c5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -191,12 +191,9 @@ int do_ftruncate(struct file *file, loff_t length, int small) if (error) return error; - sb_start_write(inode->i_sb); - error = do_truncate(file_mnt_idmap(file), dentry, length, - ATTR_MTIME | ATTR_CTIME, file); - sb_end_write(inode->i_sb); - - return error; + scoped_guard(super_write, inode->i_sb) + return do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); } int do_sys_ftruncate(unsigned int fd, loff_t length, int small) @@ -631,7 +628,7 @@ out: int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; @@ -651,7 +648,7 @@ retry_deleg: &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); 
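chmod_common() above (and chown_common() below) keep the classic delegation retry loop, now spelled with the struct delegated_inode wrapper and is_delegated() helper introduced by this series. The shape in isolation, as a hedged sketch:

static int example_setattr_retry(struct mnt_idmap *idmap,
				 struct dentry *dentry, struct iattr *attr)
{
	struct delegated_inode delegated_inode = { };
	struct inode *inode = d_inode(dentry);
	int error;

retry_deleg:
	inode_lock(inode);
	error = notify_change(idmap, dentry, attr, &delegated_inode);
	inode_unlock(inode);
	if (is_delegated(&delegated_inode)) {
		/* Wait out the lease break, then redo the whole change. */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	return error;
}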
- if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -756,7 +753,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; @@ -791,7 +788,7 @@ retry_deleg: error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -940,7 +937,7 @@ static int do_dentry_open(struct file *f, } error = security_file_open(f); - if (error) + if (unlikely(error)) goto cleanup_all; /* @@ -950,11 +947,11 @@ static int do_dentry_open(struct file *f, * pseudo file, this call will not change the mode. */ error = fsnotify_open_perm_and_set_mode(f); - if (error) + if (unlikely(error)) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); - if (error) + if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ @@ -1171,9 +1168,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_idmap(path->mnt), - d_inode(path->dentry->d_parent), - path->dentry, mode, true); + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); if (!error) error = vfs_open(path, f); @@ -1421,8 +1416,8 @@ static int do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; - struct filename *tmp; - int err, fd; + struct filename *tmp __free(putname) = NULL; + int err; err = build_open_flags(how, &op); if (unlikely(err)) @@ -1432,18 +1427,7 @@ static int do_sys_openat2(int dfd, const char __user *filename, if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(how->flags); - if (likely(fd >= 0)) { - struct file *f = do_filp_open(dfd, tmp, &op); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fd_install(fd, f); - } - } - putname(tmp); - return fd; + return FD_ADD(how->flags, do_filp_open(dfd, tmp, &op)); } int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 26ecda0e4d19..fb8d84bdedfb 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -236,7 +236,7 @@ found: mutex_unlock(&op_mutex); if (IS_ERR(inode)) return ERR_CAST(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { simple_inode_init_ts(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a01400cd41fd..d7275990ffa4 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -878,7 +878,9 @@ int orangefs_update_time(struct inode *inode, int flags) gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - flags = generic_update_time(inode, flags); + + flags = inode_update_timestamps(inode, flags); + memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; @@ -1041,7 +1043,7 @@ struct inode *orangefs_iget(struct super_block *sb, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; error = orangefs_inode_getattr(inode, 
ORANGEFS_GETATTR_NEW); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 0fdceb00ca07..9ab1119ebd28 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -247,7 +247,7 @@ again: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); @@ -281,13 +281,13 @@ again2: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); goto again2; } - if (inode->i_state & I_DIRTY_PAGES) { + if (inode_state_read(inode) & I_DIRTY_PAGES) { ret = 0; goto out_unlock; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 604a82acd164..758611ee4475 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -523,8 +523,8 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); - struct dentry *index = NULL; struct dentry *temp = NULL; + struct renamedata rd = {}; struct qstr name = { }; int err; @@ -556,17 +556,15 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, if (err) goto out; - err = ovl_parent_lock(indexdir, temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = indexdir; + rd.new_parent = indexdir; + err = start_renaming_dentry(&rd, 0, temp, &name); if (err) goto out; - index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); - if (IS_ERR(index)) { - err = PTR_ERR(index); - } else { - err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0); - dput(index); - } - ovl_parent_unlock(indexdir); + + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); out: if (err) ovl_cleanup(ofs, indexdir, temp); @@ -613,9 +611,9 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) if (err) goto out; - inode_lock_nested(udir, I_MUTEX_PARENT); - upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, - c->dentry->d_name.len); + upper = ovl_start_creating_upper(ofs, upperdir, + &QSTR_LEN(c->dentry->d_name.name, + c->dentry->d_name.len)); err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); @@ -626,9 +624,8 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } - dput(upper); + end_creating(upper); } - inode_unlock(udir); if (err) goto out; @@ -727,34 +724,33 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } -struct ovl_cu_creds { - const struct cred *old; - struct cred *new; -}; - -static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) +static const struct cred *ovl_prepare_copy_up_creds(struct dentry *dentry) { + struct cred *copy_up_cred = NULL; int err; - cc->old = cc->new = NULL; - err = security_inode_copy_up(dentry, &cc->new); + err = security_inode_copy_up(dentry, ©_up_cred); if (err < 0) 
- return err; + return ERR_PTR(err); - if (cc->new) - cc->old = override_creds(cc->new); + if (!copy_up_cred) + return NULL; - return 0; + return override_creds(copy_up_cred); } -static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) +static void ovl_revert_copy_up_creds(const struct cred *orig_cred) { - if (cc->new) { - revert_creds(cc->old); - put_cred(cc->new); - } + const struct cred *copy_up_cred; + + copy_up_cred = revert_creds(orig_cred); + put_cred(copy_up_cred); } +DEFINE_CLASS(copy_up_creds, const struct cred *, + if (!IS_ERR_OR_NULL(_T)) ovl_revert_copy_up_creds(_T), + ovl_prepare_copy_up_creds(dentry), struct dentry *dentry) + /* * Copyup using workdir to prepare temp file. Used when copying up directories, * special files or when upper fs doesn't support O_TMPFILE. @@ -764,8 +760,8 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct path path = { .mnt = ovl_upper_mnt(ofs) }; - struct dentry *temp, *upper, *trap; - struct ovl_cu_creds cc; + struct renamedata rd = {}; + struct dentry *temp; int err; struct ovl_cattr cattr = { /* Can't properly set mode on creation because of the umask */ @@ -774,14 +770,14 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) .link = c->link }; - err = ovl_prep_cu_creds(c->dentry, &cc); - if (err) - return err; + scoped_class(copy_up_creds, copy_up_creds, c->dentry) { + if (IS_ERR(copy_up_creds)) + return PTR_ERR(copy_up_creds); - ovl_start_write(c->dentry); - temp = ovl_create_temp(ofs, c->workdir, &cattr); - ovl_end_write(c->dentry); - ovl_revert_cu_creds(&cc); + ovl_start_write(c->dentry); + temp = ovl_create_temp(ofs, c->workdir, &cattr); + ovl_end_write(c->dentry); + } if (IS_ERR(temp)) return PTR_ERR(temp); @@ -808,29 +804,24 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) * ovl_copy_up_data(), so lock workdir and destdir and make sure that * temp wasn't moved before copy up completion or cleanup. */ - trap = lock_rename(c->workdir, c->destdir); - if (trap || temp->d_parent != c->workdir) { - /* temp or workdir moved underneath us? abort without cleanup */ - dput(temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = c->workdir; + rd.new_parent = c->destdir; + rd.flags = 0; + err = start_renaming_dentry(&rd, 0, temp, + &QSTR_LEN(c->destname.name, c->destname.len)); + if (err) { + /* temp or workdir moved underneath us? 
map to -EIO */ err = -EIO; - if (!IS_ERR(trap)) - unlock_rename(c->workdir, c->destdir); - goto out; } - - err = ovl_copy_up_metadata(c, temp); if (err) - goto cleanup; + goto cleanup_unlocked; - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, - c->destname.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto cleanup; + err = ovl_copy_up_metadata(c, temp); + if (!err) + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); - err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0); - unlock_rename(c->workdir, c->destdir); - dput(upper); if (err) goto cleanup_unlocked; @@ -851,8 +842,6 @@ out: return err; -cleanup: - unlock_rename(c->workdir, c->destdir); cleanup_unlocked: ovl_cleanup(ofs, c->workdir, temp); dput(temp); @@ -866,17 +855,17 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; struct file *tmpfile; - struct ovl_cu_creds cc; int err; - err = ovl_prep_cu_creds(c->dentry, &cc); - if (err) - return err; + scoped_class(copy_up_creds, copy_up_creds, c->dentry) { + if (IS_ERR(copy_up_creds)) + return PTR_ERR(copy_up_creds); + + ovl_start_write(c->dentry); + tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + ovl_end_write(c->dentry); + } - ovl_start_write(c->dentry); - tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); - ovl_end_write(c->dentry); - ovl_revert_cu_creds(&cc); if (IS_ERR(tmpfile)) return PTR_ERR(tmpfile); @@ -894,16 +883,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) goto out; - inode_lock_nested(udir, I_MUTEX_PARENT); - - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, - c->destname.len); + upper = ovl_start_creating_upper(ofs, c->destdir, + &QSTR_LEN(c->destname.name, + c->destname.len)); err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, temp, udir, upper); - dput(upper); + end_creating(upper); } - inode_unlock(udir); if (err) goto out; @@ -1214,7 +1201,6 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, static int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; - const struct cred *old_cred; bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); /* @@ -1234,7 +1220,6 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) if (err) return err; - old_cred = ovl_override_creds(dentry->d_sb); while (!err) { struct dentry *next; struct dentry *parent = NULL; @@ -1254,12 +1239,12 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) next = parent; } - err = ovl_copy_up_one(parent, next, flags); + with_ovl_creds(dentry->d_sb) + err = ovl_copy_up_one(parent, next, flags); dput(parent); dput(next); } - ovl_revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index a5e9ddf3023b..06b860b9ded6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -47,79 +47,70 @@ static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir, int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *wdentry) { - int err; - - err = ovl_parent_lock(workdir, wdentry); - if (err) - return err; + wdentry = start_removing_dentry(workdir, wdentry); + if (IS_ERR(wdentry)) + return PTR_ERR(wdentry); ovl_cleanup_locked(ofs, workdir->d_inode, wdentry); - ovl_parent_unlock(workdir); + end_removing(wdentry); return 0; } -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) +void ovl_tempname(char name[OVL_TEMPNAME_SIZE]) { - struct dentry *temp; - char name[20]; static atomic_t 
temp_id = ATOMIC_INIT(0); /* counter is allowed to wrap, since temp dentries are ephemeral */ - snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); + snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id)); +} - temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); - if (!IS_ERR(temp) && temp->d_inode) { - pr_err("workdir/%s already exists\n", name); - dput(temp); - temp = ERR_PTR(-EIO); - } +static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs, + struct dentry *workdir) +{ + char name[OVL_TEMPNAME_SIZE]; - return temp; + ovl_tempname(name); + return start_creating(ovl_upper_mnt_idmap(ofs), workdir, + &QSTR(name)); } static struct dentry *ovl_whiteout(struct ovl_fs *ofs) { int err; - struct dentry *whiteout; + struct dentry *whiteout, *link; struct dentry *workdir = ofs->workdir; struct inode *wdir = workdir->d_inode; guard(mutex)(&ofs->whiteout_lock); if (!ofs->whiteout) { - inode_lock_nested(wdir, I_MUTEX_PARENT); - whiteout = ovl_lookup_temp(ofs, workdir); - if (!IS_ERR(whiteout)) { - err = ovl_do_whiteout(ofs, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - } - } - inode_unlock(wdir); + whiteout = ovl_start_creating_temp(ofs, workdir); if (IS_ERR(whiteout)) return whiteout; - ofs->whiteout = whiteout; + err = ovl_do_whiteout(ofs, wdir, whiteout); + if (!err) + ofs->whiteout = dget(whiteout); + end_creating(whiteout); + if (err) + return ERR_PTR(err); } if (!ofs->no_shared_whiteout) { - inode_lock_nested(wdir, I_MUTEX_PARENT); - whiteout = ovl_lookup_temp(ofs, workdir); - if (!IS_ERR(whiteout)) { - err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - } - } - inode_unlock(wdir); - if (!IS_ERR(whiteout)) + link = ovl_start_creating_temp(ofs, workdir); + if (IS_ERR(link)) + return link; + err = ovl_do_link(ofs, ofs->whiteout, wdir, link); + if (!err) + whiteout = dget(link); + end_creating(link); + if (!err) return whiteout; - if (PTR_ERR(whiteout) != -EMLINK) { - pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n", + + if (err != -EMLINK) { + pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%u)\n", ofs->whiteout->d_inode->i_nlink, - PTR_ERR(whiteout)); + err); ofs->no_shared_whiteout = true; } } @@ -132,6 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry) { struct dentry *whiteout; + struct renamedata rd = {}; int err; int flags = 0; @@ -143,10 +135,14 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = ofs->workdir; + rd.new_parent = dir; + rd.flags = flags; + err = start_renaming_two_dentries(&rd, whiteout, dentry); if (!err) { - err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags); - unlock_rename(ofs->workdir, dir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); } if (err) goto kill_whiteout; @@ -191,7 +187,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) { pr_warn_ratelimited("wrong inherited casefold (%pd2)\n", newdentry); - dput(newdentry); + end_creating(newdentry); err = -EINVAL; } break; @@ -241,8 +237,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, } out: if (err) { - if 
(!IS_ERR(newdentry)) - dput(newdentry); + end_creating(newdentry); return ERR_PTR(err); } return newdentry; @@ -252,11 +247,11 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr) { struct dentry *ret; - inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT); - ret = ovl_create_real(ofs, workdir, - ovl_lookup_temp(ofs, workdir), attr); - inode_unlock(workdir->d_inode); - return ret; + ret = ovl_start_creating_temp(ofs, workdir); + if (IS_ERR(ret)) + return ret; + ret = ovl_create_real(ofs, workdir, ret, attr); + return end_creating_keep(ret); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -354,18 +349,19 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); - struct inode *udir = upperdir->d_inode; struct dentry *newdentry; int err; - inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = ovl_create_real(ofs, upperdir, - ovl_lookup_upper(ofs, dentry->d_name.name, - upperdir, dentry->d_name.len), - attr); - inode_unlock(udir); + newdentry = ovl_start_creating_upper(ofs, upperdir, + &QSTR_LEN(dentry->d_name.name, + dentry->d_name.len)); if (IS_ERR(newdentry)) return PTR_ERR(newdentry); + newdentry = ovl_create_real(ofs, upperdir, newdentry, attr); + if (IS_ERR(newdentry)) + return PTR_ERR(newdentry); + + end_creating_keep(newdentry); if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && !ovl_allow_offline_changes(ofs)) { @@ -391,6 +387,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct renamedata rd = {}; struct path upperpath; struct dentry *upper; struct dentry *opaquedir; @@ -416,7 +413,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (IS_ERR(opaquedir)) goto out; - err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = upperdir; + rd.flags = RENAME_EXCHANGE; + err = start_renaming_two_dentries(&rd, opaquedir, upper); if (err) goto out_cleanup_unlocked; @@ -434,8 +435,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE); - unlock_rename(workdir, upperdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; @@ -448,7 +449,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, return opaquedir; out_cleanup: - unlock_rename(workdir, upperdir); + end_renaming(&rd); out_cleanup_unlocked: ovl_cleanup(ofs, workdir, opaquedir); dput(opaquedir); @@ -471,6 +472,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct renamedata rd = {}; struct dentry *upper; struct dentry *newdentry; int err; @@ -502,7 +504,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) goto out_dput; - err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = upperdir; + rd.flags = 0; + err = 
start_renaming_two_dentries(&rd, newdentry, upper); if (err) goto out_cleanup_unlocked; @@ -539,16 +545,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, - RENAME_EXCHANGE); - unlock_rename(workdir, upperdir); + rd.flags = RENAME_EXCHANGE; + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; ovl_cleanup(ofs, workdir, upper); } else { - err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0); - unlock_rename(workdir, upperdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; } @@ -568,66 +574,76 @@ out: return err; out_cleanup: - unlock_rename(workdir, upperdir); + end_renaming(&rd); out_cleanup_unlocked: ovl_cleanup(ofs, workdir, newdentry); dput(newdentry); goto out_dput; } -static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry, - struct inode *inode, - umode_t mode, - const struct cred *old_cred) +static const struct cred *ovl_override_creator_creds(struct dentry *dentry, struct inode *inode, umode_t mode) { int err; - struct cred *override_cred; - override_cred = prepare_creds(); + if (WARN_ON_ONCE(current->cred != ovl_creds(dentry->d_sb))) + return ERR_PTR(-EINVAL); + + CLASS(prepare_creds, override_cred)(); if (!override_cred) return ERR_PTR(-ENOMEM); override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; + err = security_dentry_create_files_as(dentry, mode, &dentry->d_name, - old_cred, override_cred); - if (err) { - put_cred(override_cred); + current->cred, override_cred); + if (err) return ERR_PTR(err); - } - /* - * Caller is going to match this with revert_creds() and drop - * referenec on the returned creds. - * We must be called with creator creds already, otherwise we risk - * leaking creds. - */ - old_cred = override_creds(override_cred); - WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb)); + return override_creds(no_free_ptr(override_cred)); +} + +static void ovl_revert_creator_creds(const struct cred *old_cred) +{ + const struct cred *override_cred; - return override_cred; + override_cred = revert_creds(old_cred); + put_cred(override_cred); +} + +DEFINE_CLASS(ovl_override_creator_creds, + const struct cred *, + if (!IS_ERR_OR_NULL(_T)) ovl_revert_creator_creds(_T), + ovl_override_creator_creds(dentry, inode, mode), + struct dentry *dentry, struct inode *inode, umode_t mode) + +static int ovl_create_handle_whiteouts(struct dentry *dentry, + struct inode *inode, + struct ovl_cattr *attr) +{ + if (!ovl_dentry_is_whiteout(dentry)) + return ovl_create_upper(dentry, inode, attr); + + return ovl_create_over_whiteout(dentry, inode, attr); } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr, bool origin) { int err; - const struct cred *old_cred, *new_cred = NULL; struct dentry *parent = dentry->d_parent; - old_cred = ovl_override_creds(dentry->d_sb); - - /* - * When linking a file with copy up origin into a new parent, mark the - * new parent dir "impure". - */ - if (origin) { - err = ovl_set_impure(parent, ovl_dentry_upper(parent)); - if (err) - goto out_revert_creds; - } + with_ovl_creds(dentry->d_sb) { + /* + * When linking a file with copy up origin into a new parent, mark the + * new parent dir "impure". 
+ */ + if (origin) { + err = ovl_set_impure(parent, ovl_dentry_upper(parent)); + if (err) + return err; + } - if (!attr->hardlink) { /* * In the creation cases(create, mkdir, mknod, symlink), * ovl should transfer current's fs{u,g}id to underlying @@ -641,23 +657,16 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, * create a new inode, so just use the ovl mounter's * fs{u,g}id. */ - new_cred = ovl_setup_cred_for_create(dentry, inode, attr->mode, - old_cred); - err = PTR_ERR(new_cred); - if (IS_ERR(new_cred)) { - new_cred = NULL; - goto out_revert_creds; - } - } - if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr); - else - err = ovl_create_over_whiteout(dentry, inode, attr); + if (attr->hardlink) + return ovl_create_handle_whiteouts(dentry, inode, attr); -out_revert_creds: - ovl_revert_creds(old_cred); - put_cred(new_cred); + scoped_class(ovl_override_creator_creds, cred, dentry, inode, attr->mode) { + if (IS_ERR(cred)) + return PTR_ERR(cred); + return ovl_create_handle_whiteouts(dentry, inode, attr); + } + } return err; } @@ -686,7 +695,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, goto out_drop_write; spin_lock(&inode->i_lock); - inode->i_state |= I_CREATING; + inode_state_set(inode, I_CREATING); spin_unlock(&inode->i_lock); inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode); @@ -733,14 +742,8 @@ static int ovl_symlink(struct mnt_idmap *idmap, struct inode *dir, static int ovl_set_link_redirect(struct dentry *dentry) { - const struct cred *old_cred; - int err; - - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_set_redirect(dentry, false); - ovl_revert_creds(old_cred); - - return err; + with_ovl_creds(dentry->d_sb) + return ovl_set_redirect(dentry, false); } static int ovl_link(struct dentry *old, struct inode *newdir, @@ -850,17 +853,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out; } - inode_lock_nested(dir, I_MUTEX_PARENT); - upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_start_removing_upper(ofs, upperdir, + &QSTR_LEN(dentry->d_name.name, + dentry->d_name.len)); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_unlock; + goto out_dput; err = -ESTALE; if ((opaquedir && upper != opaquedir) || (!opaquedir && !ovl_matches_upper(dentry, upper))) - goto out_dput_upper; + goto out_unlock; if (is_dir) err = ovl_do_rmdir(ofs, dir, upper); @@ -876,10 +879,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, */ if (!err) d_drop(dentry); -out_dput_upper: - dput(upper); out_unlock: - inode_unlock(dir); + end_removing(upper); +out_dput: dput(opaquedir); out: return err; @@ -916,7 +918,6 @@ static void ovl_drop_nlink(struct dentry *dentry) static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; - const struct cred *old_cred; bool lower_positive = ovl_lower_positive(dentry); LIST_HEAD(list); @@ -935,12 +936,12 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (err) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - if (!lower_positive) - err = ovl_remove_upper(dentry, is_dir, &list); - else - err = ovl_remove_and_whiteout(dentry, &list); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) { + if (!lower_positive) + err = ovl_remove_upper(dentry, is_dir, &list); + else + err = ovl_remove_and_whiteout(dentry, &list); + } if (!err) { if (is_dir) clear_nlink(dentry->d_inode); @@ -1104,102 +1105,107 @@ static int ovl_set_redirect(struct 
dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, - struct dentry *old, struct inode *newdir, - struct dentry *new, unsigned int flags) +struct ovl_renamedata { + struct renamedata; + struct dentry *opaquedir; + bool cleanup_whiteout; + bool update_nlink; + bool overwrite; +}; + +static int ovl_rename_start(struct ovl_renamedata *ovlrd, struct list_head *list) { - int err; - struct dentry *old_upperdir; - struct dentry *new_upperdir; - struct dentry *olddentry = NULL; - struct dentry *newdentry = NULL; - struct dentry *trap, *de; - bool old_opaque; - bool new_opaque; - bool cleanup_whiteout = false; - bool update_nlink = false; - bool overwrite = !(flags & RENAME_EXCHANGE); + struct dentry *old = ovlrd->old_dentry; + struct dentry *new = ovlrd->new_dentry; bool is_dir = d_is_dir(old); bool new_is_dir = d_is_dir(new); - bool samedir = olddir == newdir; - struct dentry *opaquedir = NULL; - const struct cred *old_cred = NULL; - struct ovl_fs *ofs = OVL_FS(old->d_sb); - LIST_HEAD(list); + int err; - err = -EINVAL; - if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) - goto out; + if (ovlrd->flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + return -EINVAL; - flags &= ~RENAME_NOREPLACE; + ovlrd->flags &= ~RENAME_NOREPLACE; /* Don't copy up directory trees */ err = -EXDEV; if (!ovl_can_move(old)) - goto out; - if (!overwrite && !ovl_can_move(new)) - goto out; + return err; + if (!ovlrd->overwrite && !ovl_can_move(new)) + return err; - if (overwrite && new_is_dir && !ovl_pure_upper(new)) { - err = ovl_check_empty_dir(new, &list); + if (ovlrd->overwrite && new_is_dir && !ovl_pure_upper(new)) { + err = ovl_check_empty_dir(new, list); if (err) - goto out; + return err; } - if (overwrite) { + if (ovlrd->overwrite) { if (ovl_lower_positive(old)) { if (!ovl_dentry_is_whiteout(new)) { /* Whiteout source */ - flags |= RENAME_WHITEOUT; + ovlrd->flags |= RENAME_WHITEOUT; } else { /* Switch whiteouts */ - flags |= RENAME_EXCHANGE; + ovlrd->flags |= RENAME_EXCHANGE; } } else if (is_dir && ovl_dentry_is_whiteout(new)) { - flags |= RENAME_EXCHANGE; - cleanup_whiteout = true; + ovlrd->flags |= RENAME_EXCHANGE; + ovlrd->cleanup_whiteout = true; } } err = ovl_copy_up(old); if (err) - goto out; + return err; err = ovl_copy_up(new->d_parent); if (err) - goto out; - if (!overwrite) { + return err; + + if (!ovlrd->overwrite) { err = ovl_copy_up(new); if (err) - goto out; + return err; } else if (d_inode(new)) { err = ovl_nlink_start(new); if (err) - goto out; + return err; - update_nlink = true; + ovlrd->update_nlink = true; } - if (!update_nlink) { + if (!ovlrd->update_nlink) { /* ovl_nlink_start() took ovl_want_write() */ err = ovl_want_write(old); if (err) - goto out; + return err; } - old_cred = ovl_override_creds(old->d_sb); + return 0; +} - if (!list_empty(&list)) { - opaquedir = ovl_clear_empty(new, &list); - err = PTR_ERR(opaquedir); - if (IS_ERR(opaquedir)) { - opaquedir = NULL; - goto out_revert_creds; - } - } +static int ovl_rename_upper(struct ovl_renamedata *ovlrd, struct list_head *list) +{ + struct dentry *old = ovlrd->old_dentry; + struct dentry *new = ovlrd->new_dentry; + struct ovl_fs *ofs = OVL_FS(old->d_sb); + struct dentry *old_upperdir = ovl_dentry_upper(old->d_parent); + struct dentry *new_upperdir = ovl_dentry_upper(new->d_parent); + bool is_dir = d_is_dir(old); + bool new_is_dir = d_is_dir(new); + bool samedir = old->d_parent == new->d_parent; + struct renamedata rd = {}; + struct dentry *de; + struct dentry *whiteout = NULL; + 
bool old_opaque, new_opaque; + int err; - old_upperdir = ovl_dentry_upper(old->d_parent); - new_upperdir = ovl_dentry_upper(new->d_parent); + if (!list_empty(list)) { + de = ovl_clear_empty(new, list); + if (IS_ERR(de)) + return PTR_ERR(de); + ovlrd->opaquedir = de; + } if (!samedir) { /* @@ -1211,95 +1217,88 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (ovl_type_origin(old)) { err = ovl_set_impure(new->d_parent, new_upperdir); if (err) - goto out_revert_creds; + return err; } - if (!overwrite && ovl_type_origin(new)) { + if (!ovlrd->overwrite && ovl_type_origin(new)) { err = ovl_set_impure(old->d_parent, old_upperdir); if (err) - goto out_revert_creds; + return err; } } - trap = lock_rename(new_upperdir, old_upperdir); - if (IS_ERR(trap)) { - err = PTR_ERR(trap); - goto out_revert_creds; - } + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = old_upperdir; + rd.new_parent = new_upperdir; + rd.flags = ovlrd->flags; - de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, - old->d_name.len); - err = PTR_ERR(de); - if (IS_ERR(de)) - goto out_unlock; - olddentry = de; + err = start_renaming(&rd, 0, + &QSTR_LEN(old->d_name.name, old->d_name.len), + &QSTR_LEN(new->d_name.name, new->d_name.len)); + if (err) + return err; err = -ESTALE; - if (!ovl_matches_upper(old, olddentry)) + if (!ovl_matches_upper(old, rd.old_dentry)) goto out_unlock; - de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, - new->d_name.len); - err = PTR_ERR(de); - if (IS_ERR(de)) - goto out_unlock; - newdentry = de; - old_opaque = ovl_dentry_is_opaque(old); new_opaque = ovl_dentry_is_opaque(new); err = -ESTALE; if (d_inode(new) && ovl_dentry_upper(new)) { - if (opaquedir) { - if (newdentry != opaquedir) + if (ovlrd->opaquedir) { + if (rd.new_dentry != ovlrd->opaquedir) goto out_unlock; } else { - if (!ovl_matches_upper(new, newdentry)) + if (!ovl_matches_upper(new, rd.new_dentry)) goto out_unlock; } } else { - if (!d_is_negative(newdentry)) { - if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry)) + if (!d_is_negative(rd.new_dentry)) { + if (!new_opaque || !ovl_upper_is_whiteout(ofs, rd.new_dentry)) goto out_unlock; } else { - if (flags & RENAME_EXCHANGE) + if (ovlrd->flags & RENAME_EXCHANGE) goto out_unlock; } } - if (olddentry == trap) - goto out_unlock; - if (newdentry == trap) - goto out_unlock; - - if (olddentry->d_inode == newdentry->d_inode) + if (rd.old_dentry->d_inode == rd.new_dentry->d_inode) goto out_unlock; err = 0; if (ovl_type_merge_or_lower(old)) err = ovl_set_redirect(old, samedir); else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); + err = ovl_set_opaque_xerr(old, rd.old_dentry, -EXDEV); if (err) goto out_unlock; - if (!overwrite && ovl_type_merge_or_lower(new)) + if (!ovlrd->overwrite && ovl_type_merge_or_lower(new)) err = ovl_set_redirect(new, samedir); - else if (!overwrite && new_is_dir && !new_opaque && + else if (!ovlrd->overwrite && new_is_dir && !new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); + err = ovl_set_opaque_xerr(new, rd.new_dentry, -EXDEV); if (err) goto out_unlock; - err = ovl_do_rename(ofs, old_upperdir, olddentry, - new_upperdir, newdentry, flags); - unlock_rename(new_upperdir, old_upperdir); + err = ovl_do_rename_rd(&rd); + + if (!err && ovlrd->cleanup_whiteout) + whiteout = dget(rd.new_dentry); + +out_unlock: + end_renaming(&rd); + if (err) - goto out_revert_creds; + return err; - if (cleanup_whiteout) - 
ovl_cleanup(ofs, old_upperdir, newdentry); + if (whiteout) { + ovl_cleanup(ofs, old_upperdir, whiteout); + dput(whiteout); + } - if (overwrite && d_inode(new)) { + if (ovlrd->overwrite && d_inode(new)) { if (new_is_dir) clear_nlink(d_inode(new)); else @@ -1307,7 +1306,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } ovl_dir_modified(old->d_parent, ovl_type_origin(old) || - (!overwrite && ovl_type_origin(new))); + (!ovlrd->overwrite && ovl_type_origin(new))); ovl_dir_modified(new->d_parent, ovl_type_origin(old) || (d_inode(new) && ovl_type_origin(new))); @@ -1316,28 +1315,47 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (d_inode(new) && ovl_dentry_upper(new)) ovl_copyattr(d_inode(new)); -out_revert_creds: - ovl_revert_creds(old_cred); - if (update_nlink) - ovl_nlink_end(new); + return err; +} + +static void ovl_rename_end(struct ovl_renamedata *ovlrd) +{ + if (ovlrd->update_nlink) + ovl_nlink_end(ovlrd->new_dentry); else - ovl_drop_write(old); -out: - dput(newdentry); - dput(olddentry); - dput(opaquedir); + ovl_drop_write(ovlrd->old_dentry); +} + +static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, + struct dentry *old, struct inode *newdir, + struct dentry *new, unsigned int flags) +{ + struct ovl_renamedata ovlrd = { + .old_parent = old->d_parent, + .old_dentry = old, + .new_parent = new->d_parent, + .new_dentry = new, + .flags = flags, + .overwrite = !(flags & RENAME_EXCHANGE), + }; + LIST_HEAD(list); + int err; + + err = ovl_rename_start(&ovlrd, &list); + if (!err) { + with_ovl_creds(old->d_sb) + err = ovl_rename_upper(&ovlrd, &list); + ovl_rename_end(&ovlrd); + } + + dput(ovlrd.opaquedir); ovl_cache_free(&list); return err; - -out_unlock: - unlock_rename(new_upperdir, old_upperdir); - goto out_revert_creds; } static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, struct inode *inode, umode_t mode) { - const struct cred *old_cred, *new_cred = NULL; struct path realparentpath; struct file *realfile; struct ovl_file *of; @@ -1346,41 +1364,36 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, int flags = file->f_flags | OVL_OPEN_FLAGS; int err; - old_cred = ovl_override_creds(dentry->d_sb); - new_cred = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); - err = PTR_ERR(new_cred); - if (IS_ERR(new_cred)) { - new_cred = NULL; - goto out_revert_creds; - } + with_ovl_creds(dentry->d_sb) { + scoped_class(ovl_override_creator_creds, cred, dentry, inode, mode) { + if (IS_ERR(cred)) + return PTR_ERR(cred); - ovl_path_upper(dentry->d_parent, &realparentpath); - realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, - mode, current_cred()); - err = PTR_ERR_OR_ZERO(realfile); - pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err); - if (err) - goto out_revert_creds; + ovl_path_upper(dentry->d_parent, &realparentpath); + realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, + mode, current_cred()); + err = PTR_ERR_OR_ZERO(realfile); + pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err); + if (err) + return err; - of = ovl_file_alloc(realfile); - if (!of) { - fput(realfile); - err = -ENOMEM; - goto out_revert_creds; - } + of = ovl_file_alloc(realfile); + if (!of) { + fput(realfile); + return -ENOMEM; + } - /* ovl_instantiate() consumes the newdentry reference on success */ - newdentry = dget(realfile->f_path.dentry); - err = ovl_instantiate(dentry, inode, newdentry, false, file); - if (!err) { - 
file->private_data = of; - } else { - dput(newdentry); - ovl_file_free(of); + /* ovl_instantiate() consumes the newdentry reference on success */ + newdentry = dget(realfile->f_path.dentry); + err = ovl_instantiate(dentry, inode, newdentry, false, file); + if (!err) { + file->private_data = of; + } else { + dput(newdentry); + ovl_file_free(of); + } + } } -out_revert_creds: - ovl_revert_creds(old_cred); - put_cred(new_cred); return err; } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 7ab2c9daffd0..cbae89457234 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -31,7 +31,6 @@ static struct file *ovl_open_realfile(const struct file *file, struct inode *inode = file_inode(file); struct mnt_idmap *real_idmap; struct file *realfile; - const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; int acc_mode = ACC_MODE(flags); int err; @@ -39,19 +38,19 @@ static struct file *ovl_open_realfile(const struct file *file, if (flags & O_APPEND) acc_mode |= MAY_APPEND; - old_cred = ovl_override_creds(inode->i_sb); - real_idmap = mnt_idmap(realpath->mnt); - err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); - if (err) { - realfile = ERR_PTR(err); - } else { - if (!inode_owner_or_capable(real_idmap, realinode)) - flags &= ~O_NOATIME; - - realfile = backing_file_open(file_user_path(file), - flags, realpath, current_cred()); + with_ovl_creds(inode->i_sb) { + real_idmap = mnt_idmap(realpath->mnt); + err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); + if (err) { + realfile = ERR_PTR(err); + } else { + if (!inode_owner_or_capable(real_idmap, realinode)) + flags &= ~O_NOATIME; + + realfile = backing_file_open(file_user_path(file), + flags, realpath, current_cred()); + } } - ovl_revert_creds(old_cred); pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", file, file, ovl_whatisit(inode, realinode), file->f_flags, @@ -244,7 +243,6 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); struct file *realfile; - const struct cred *old_cred; loff_t ret; /* @@ -273,9 +271,8 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) ovl_inode_lock(inode); realfile->f_pos = file->f_pos; - old_cred = ovl_override_creds(inode->i_sb); - ret = vfs_llseek(realfile, offset, whence); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) + ret = vfs_llseek(realfile, offset, whence); file->f_pos = realfile->f_pos; ovl_inode_unlock(inode); @@ -447,7 +444,6 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) enum ovl_path_type type; struct path upperpath; struct file *upperfile; - const struct cred *old_cred; int ret; ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); @@ -464,11 +460,8 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (IS_ERR(upperfile)) return PTR_ERR(upperfile); - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fsync_range(upperfile, start, end, datasync); - ovl_revert_creds(old_cred); - - return ret; + with_ovl_creds(file_inode(file)->i_sb) + return vfs_fsync_range(upperfile, start, end, datasync); } static int ovl_mmap(struct file *file, struct vm_area_struct *vma) @@ -486,7 +479,6 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len { struct inode *inode = file_inode(file); struct file *realfile; - const struct cred *old_cred; int ret; inode_lock(inode); @@ -501,9 +493,8 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, 
loff_t len if (IS_ERR(realfile)) goto out_unlock; - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fallocate(realfile, mode, offset, len); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) + ret = vfs_fallocate(realfile, mode, offset, len); /* Update size */ ovl_file_modified(file); @@ -517,18 +508,13 @@ out_unlock: static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct file *realfile; - const struct cred *old_cred; - int ret; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fadvise(realfile, offset, len, advice); - ovl_revert_creds(old_cred); - - return ret; + with_ovl_creds(file_inode(file)->i_sb) + return vfs_fadvise(realfile, offset, len, advice); } enum ovl_copyop { @@ -543,7 +529,6 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, { struct inode *inode_out = file_inode(file_out); struct file *realfile_in, *realfile_out; - const struct cred *old_cred; loff_t ret; inode_lock(inode_out); @@ -565,25 +550,25 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, if (IS_ERR(realfile_in)) goto out_unlock; - old_cred = ovl_override_creds(file_inode(file_out)->i_sb); - switch (op) { - case OVL_COPY: - ret = vfs_copy_file_range(realfile_in, pos_in, - realfile_out, pos_out, len, flags); - break; - - case OVL_CLONE: - ret = vfs_clone_file_range(realfile_in, pos_in, - realfile_out, pos_out, len, flags); - break; - - case OVL_DEDUPE: - ret = vfs_dedupe_file_range_one(realfile_in, pos_in, - realfile_out, pos_out, len, - flags); - break; + with_ovl_creds(file_inode(file_out)->i_sb) { + switch (op) { + case OVL_COPY: + ret = vfs_copy_file_range(realfile_in, pos_in, + realfile_out, pos_out, len, flags); + break; + + case OVL_CLONE: + ret = vfs_clone_file_range(realfile_in, pos_in, + realfile_out, pos_out, len, flags); + break; + + case OVL_DEDUPE: + ret = vfs_dedupe_file_range_one(realfile_in, pos_in, + realfile_out, pos_out, len, + flags); + break; + } } - ovl_revert_creds(old_cred); /* Update size */ ovl_file_modified(file_out); @@ -632,7 +617,6 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, static int ovl_flush(struct file *file, fl_owner_t id) { struct file *realfile; - const struct cred *old_cred; int err = 0; realfile = ovl_real_file(file); @@ -640,9 +624,8 @@ static int ovl_flush(struct file *file, fl_owner_t id) return PTR_ERR(realfile); if (realfile->f_op->flush) { - old_cred = ovl_override_creds(file_inode(file)->i_sb); - err = realfile->f_op->flush(realfile, id); - ovl_revert_creds(old_cred); + with_ovl_creds(file_inode(file)->i_sb) + err = realfile->f_op->flush(realfile, id); } return err; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index e11f310ce092..bdbf86b56a9b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -25,7 +25,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool full_copy_up = false; struct dentry *upperdentry; - const struct cred *old_cred; err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) @@ -78,9 +77,8 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out_put_write; inode_lock(upperdentry->d_inode); - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_do_notify_change(ofs, upperdentry, attr); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_do_notify_change(ofs, upperdentry, attr); if (!err) 
ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); @@ -153,13 +151,22 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) } } +static inline int ovl_real_getattr_nosec(struct super_block *sb, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int flags) +{ + with_ovl_creds(sb) + return vfs_getattr_nosec(path, stat, request_mask, flags); +} + int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + struct super_block *sb = dentry->d_sb; enum ovl_path_type type; struct path realpath; - const struct cred *old_cred; struct inode *inode = d_inode(dentry); bool is_dir = S_ISDIR(inode->i_mode); int fsid = 0; @@ -169,10 +176,9 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, metacopy_blocks = ovl_is_metacopy_dentry(dentry); type = ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr_nosec(&realpath, stat, request_mask, flags); + err = ovl_real_getattr_nosec(sb, &realpath, stat, request_mask, flags); if (err) - goto out; + return err; /* Report the effective immutable/append-only STATX flags */ generic_fill_statx_attr(inode, stat); @@ -195,10 +201,9 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask, - flags); + err = ovl_real_getattr_nosec(sb, &realpath, &lowerstat, lowermask, flags); if (err) - goto out; + return err; /* * Lower hardlinks may be broken on copy up to different @@ -248,10 +253,10 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = vfs_getattr_nosec(&realpath, &lowerdatastat, - lowermask, flags); + err = ovl_real_getattr_nosec(sb, &realpath, &lowerdatastat, + lowermask, flags); if (err) - goto out; + return err; } else { lowerdatastat.blocks = round_up(stat->size, stat->blksize) >> 9; @@ -279,9 +284,6 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry))) stat->nlink = dentry->d_inode->i_nlink; -out: - ovl_revert_creds(old_cred); - return err; } @@ -291,7 +293,6 @@ int ovl_permission(struct mnt_idmap *idmap, struct inode *upperinode = ovl_inode_upper(inode); struct inode *realinode; struct path realpath; - const struct cred *old_cred; int err; /* Careful in RCU walk mode */ @@ -309,33 +310,26 @@ int ovl_permission(struct mnt_idmap *idmap, if (err) return err; - old_cred = ovl_override_creds(inode->i_sb); if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask); - ovl_revert_creds(old_cred); - return err; + with_ovl_creds(inode->i_sb) + return inode_permission(mnt_idmap(realpath.mnt), realinode, mask); } static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - const struct cred *old_cred; - const char *p; - if (!dentry) return ERR_PTR(-ECHILD); - old_cred = ovl_override_creds(dentry->d_sb); - p = vfs_get_link(ovl_dentry_real(dentry), done); - ovl_revert_creds(old_cred); - return p; + with_ovl_creds(dentry->d_sb) + return vfs_get_link(ovl_dentry_real(dentry), done); } #ifdef CONFIG_FS_POSIX_ACL @@ 
-465,11 +459,8 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, acl = get_cached_acl_rcu(realinode, type); } else { - const struct cred *old_cred; - - old_cred = ovl_override_creds(inode->i_sb); - acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) + acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); } return acl; @@ -481,7 +472,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, int err; struct path realpath; const char *acl_name; - const struct cred *old_cred; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); @@ -495,10 +485,8 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, struct posix_acl *real_acl; ovl_path_lower(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, - acl_name); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); goto out; @@ -518,12 +506,12 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, if (err) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - if (acl) - err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); - else - err = ovl_do_remove_acl(ofs, realdentry, acl_name); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) { + if (acl) + err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); + else + err = ovl_do_remove_acl(ofs, realdentry, acl_name); + } ovl_drop_write(dentry); /* copy c/mtime */ @@ -588,9 +576,7 @@ int ovl_update_time(struct inode *inode, int flags) static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - int err; struct inode *realinode = ovl_inode_realdata(inode); - const struct cred *old_cred; if (!realinode) return -EIO; @@ -598,11 +584,8 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (!realinode->i_op->fiemap) return -EOPNOTSUPP; - old_cred = ovl_override_creds(inode->i_sb); - err = realinode->i_op->fiemap(realinode, fieinfo, start, len); - ovl_revert_creds(old_cred); - - return err; + with_ovl_creds(inode->i_sb) + return realinode->i_op->fiemap(realinode, fieinfo, start, len); } /* @@ -653,7 +636,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, { struct inode *inode = d_inode(dentry); struct path upperpath; - const struct cred *old_cred; unsigned int flags; int err; @@ -665,18 +647,18 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, if (err) goto out; - old_cred = ovl_override_creds(inode->i_sb); - /* - * Store immutable/append-only flags in xattr and clear them - * in upper fileattr (in case they were set by older kernel) - * so children of "ovl-immutable" directories lower aliases of - * "ovl-immutable" hardlinks could be copied up. - * Clear xattr when flags are cleared. - */ - err = ovl_set_protattr(inode, upperpath.dentry, fa); - if (!err) - err = ovl_real_fileattr_set(&upperpath, fa); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) { + /* + * Store immutable/append-only flags in xattr and clear them + * in upper fileattr (in case they were set by older kernel) + * so children of "ovl-immutable" directories lower aliases of + * "ovl-immutable" hardlinks could be copied up. 
+ * Clear xattr when flags are cleared. + */ + err = ovl_set_protattr(inode, upperpath.dentry, fa); + if (!err) + err = ovl_real_fileattr_set(&upperpath, fa); + } ovl_drop_write(dentry); /* @@ -730,15 +712,13 @@ int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path realpath; - const struct cred *old_cred; int err; ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(inode->i_sb); - err = ovl_real_fileattr_get(&realpath, fa); + with_ovl_creds(inode->i_sb) + err = ovl_real_fileattr_get(&realpath, fa); ovl_fileattr_prot_flags(inode, fa); - ovl_revert_creds(old_cred); return err; } @@ -1152,7 +1132,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) if (!trap) return ERR_PTR(-ENOMEM); - if (!(trap->i_state & I_NEW)) { + if (!(inode_state_read_once(trap) & I_NEW)) { /* Conflicting layer roots? */ iput(trap); return ERR_PTR(-ELOOP); @@ -1243,7 +1223,7 @@ struct inode *ovl_get_inode(struct super_block *sb, inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_err; - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { /* * Verify that the underlying files stored in the inode * match those in the dentry. @@ -1303,7 +1283,7 @@ struct inode *ovl_get_inode(struct super_block *sb, if (upperdentry) ovl_check_protattr(inode, upperdentry); - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); out: return inode; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index e93bcc5727bc..e9a69c95be91 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -979,15 +979,10 @@ static int ovl_maybe_validate_verity(struct dentry *dentry) return err; if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { - const struct cred *old_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - - err = ovl_validate_verity(ofs, &metapath, &datapath); + with_ovl_creds(dentry->d_sb) + err = ovl_validate_verity(ofs, &metapath, &datapath); if (err == 0) ovl_set_flag(OVL_VERIFIED_DIGEST, inode); - - ovl_revert_creds(old_cred); } ovl_inode_unlock(inode); @@ -1001,7 +996,6 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); struct ovl_path datapath = {}; - const struct cred *old_cred; int err; if (!redirect || ovl_dentry_lowerdata(dentry)) @@ -1019,9 +1013,8 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) if (ovl_dentry_lowerdata(dentry)) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_lookup_data_layers(dentry, redirect, &datapath); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_lookup_data_layers(dentry, redirect, &datapath); if (err) goto out_err; @@ -1077,57 +1070,44 @@ static bool ovl_check_follow_redirect(struct ovl_lookup_data *d) return true; } -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +struct ovl_lookup_ctx { + struct dentry *dentry; + struct ovl_entry *oe; + struct ovl_path *stack; + struct ovl_path *origin_path; + struct dentry *upperdentry; + struct dentry *index; + struct inode *inode; + unsigned int ctr; +}; + +static int ovl_lookup_layers(struct ovl_lookup_ctx *ctx, struct ovl_lookup_data *d) { - struct ovl_entry *oe = NULL; - const struct cred *old_cred; + struct dentry *dentry = ctx->dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = 
OVL_E(dentry->d_sb->s_root); - struct ovl_path *stack = NULL, *origin_path = NULL; - struct dentry *upperdir, *upperdentry = NULL; - struct dentry *origin = NULL; - struct dentry *index = NULL; - unsigned int ctr = 0; - struct inode *inode = NULL; - bool upperopaque = false; bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); + struct dentry *upperdir; struct dentry *this; - unsigned int i; - int err; + struct dentry *origin = NULL; + bool upperopaque = false; bool uppermetacopy = false; int metacopy_size = 0; - struct ovl_lookup_data d = { - .sb = dentry->d_sb, - .dentry = dentry, - .name = dentry->d_name, - .is_dir = false, - .opaque = false, - .stop = false, - .last = check_redirect ? false : !ovl_numlower(poe), - .redirect = NULL, - .upperredirect = NULL, - .metacopy = 0, - }; - - if (dentry->d_name.len > ofs->namelen) - return ERR_PTR(-ENAMETOOLONG); + unsigned int i; + int err; - old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.layer = &ofs->layers[0]; - err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); + d->layer = &ofs->layers[0]; + err = ovl_lookup_layer(upperdir, d, &ctx->upperdentry, true); if (err) - goto out; + return err; - if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { - dput(upperdentry); - err = -EREMOTE; - goto out; - } - if (upperdentry && !d.is_dir) { + if (ctx->upperdentry && ctx->upperdentry->d_flags & DCACHE_OP_REAL) + return -EREMOTE; + + if (ctx->upperdentry && !d->is_dir) { /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, @@ -1138,50 +1118,50 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. 
*/ - err = ovl_check_origin(ofs, upperdentry, &origin_path); + err = ovl_check_origin(ofs, ctx->upperdentry, &ctx->origin_path); if (err) - goto out_put_upper; + return err; - if (d.metacopy) + if (d->metacopy) uppermetacopy = true; - metacopy_size = d.metacopy; + metacopy_size = d->metacopy; } - if (d.redirect) { + if (d->redirect) { err = -ENOMEM; - d.upperredirect = kstrdup(d.redirect, GFP_KERNEL); - if (!d.upperredirect) - goto out_put_upper; - if (d.redirect[0] == '/') + d->upperredirect = kstrdup(d->redirect, GFP_KERNEL); + if (!d->upperredirect) + return err; + if (d->redirect[0] == '/') poe = roe; } - upperopaque = d.opaque; + upperopaque = d->opaque; } - if (!d.stop && ovl_numlower(poe)) { + if (!d->stop && ovl_numlower(poe)) { err = -ENOMEM; - stack = ovl_stack_alloc(ofs->numlayer - 1); - if (!stack) - goto out_put_upper; + ctx->stack = ovl_stack_alloc(ofs->numlayer - 1); + if (!ctx->stack) + return err; } - for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { + for (i = 0; !d->stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; - if (!ovl_check_follow_redirect(&d)) { + if (!ovl_check_follow_redirect(d)) { err = -EPERM; - goto out_put; + return err; } if (!check_redirect) - d.last = i == ovl_numlower(poe) - 1; - else if (d.is_dir || !ofs->numdatalayer) - d.last = lower.layer->idx == ovl_numlower(roe); + d->last = i == ovl_numlower(poe) - 1; + else if (d->is_dir || !ofs->numdatalayer) + d->last = lower.layer->idx == ovl_numlower(roe); - d.layer = lower.layer; - err = ovl_lookup_layer(lower.dentry, &d, &this, false); + d->layer = lower.layer; + err = ovl_lookup_layer(lower.dentry, d, &this, false); if (err) - goto out_put; + return err; if (!this) continue; @@ -1190,11 +1170,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". */ - if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { - err = ovl_fix_origin(ofs, dentry, this, upperdentry); + if (ctx->upperdentry && !ctx->ctr && !ofs->noxattr && d->is_dir) { + err = ovl_fix_origin(ofs, dentry, this, ctx->upperdentry); if (err) { dput(this); - goto out_put; + return err; } } @@ -1207,23 +1187,23 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * matches the dentry found using path based lookup, * otherwise error out. */ - if (upperdentry && !ctr && - ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || - (!d.is_dir && ofs->config.index && origin_path))) { - err = ovl_verify_origin(ofs, upperdentry, this, false); + if (ctx->upperdentry && !ctx->ctr && + ((d->is_dir && ovl_verify_lower(dentry->d_sb)) || + (!d->is_dir && ofs->config.index && ctx->origin_path))) { + err = ovl_verify_origin(ofs, ctx->upperdentry, this, false); if (err) { dput(this); - if (d.is_dir) + if (d->is_dir) break; - goto out_put; + return err; } origin = this; } - if (!upperdentry && !d.is_dir && !ctr && d.metacopy) - metacopy_size = d.metacopy; + if (!ctx->upperdentry && !d->is_dir && !ctx->ctr && d->metacopy) + metacopy_size = d->metacopy; - if (d.metacopy && ctr) { + if (d->metacopy && ctx->ctr) { /* * Do not store intermediate metacopy dentries in * lower chain, except top most lower metacopy dentry. 
@@ -1233,15 +1213,15 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, dput(this); this = NULL; } else { - stack[ctr].dentry = this; - stack[ctr].layer = lower.layer; - ctr++; + ctx->stack[ctx->ctr].dentry = this; + ctx->stack[ctx->ctr].layer = lower.layer; + ctx->ctr++; } - if (d.stop) + if (d->stop) break; - if (d.redirect && d.redirect[0] == '/' && poe != roe) { + if (d->redirect && d->redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ i = lower.layer->idx - 1; @@ -1252,12 +1232,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * Defer lookup of lowerdata in data-only layers to first access. * Don't require redirect=follow and metacopy=on in this case. */ - if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { - d.metacopy = 0; - ctr++; - } else if (!ovl_check_follow_redirect(&d)) { + if (d->metacopy && ctx->ctr && ofs->numdatalayer && d->absolute_redirect) { + d->metacopy = 0; + ctx->ctr++; + } else if (!ovl_check_follow_redirect(d)) { err = -EPERM; - goto out_put; + return err; } /* @@ -1268,20 +1248,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * For metacopy dentry, path based lookup will find lower dentries. * Just make sure a corresponding data dentry has been found. */ - if (d.metacopy || (uppermetacopy && !ctr)) { + if (d->metacopy || (uppermetacopy && !ctx->ctr)) { pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", dentry); err = -EIO; - goto out_put; - } else if (!d.is_dir && upperdentry && !ctr && origin_path) { - if (WARN_ON(stack != NULL)) { + return err; + } else if (!d->is_dir && ctx->upperdentry && !ctx->ctr && ctx->origin_path) { + if (WARN_ON(ctx->stack != NULL)) { err = -EIO; - goto out_put; + return err; } - stack = origin_path; - ctr = 1; - origin = origin_path->dentry; - origin_path = NULL; + ctx->stack = ctx->origin_path; + ctx->ctr = 1; + origin = ctx->origin_path->dentry; + ctx->origin_path = NULL; } /* @@ -1303,38 +1283,39 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * is enabled and if upper had an ORIGIN xattr. 
* */ - if (!upperdentry && ctr) - origin = stack[0].dentry; + if (!ctx->upperdentry && ctx->ctr) + origin = ctx->stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && - (!d.is_dir || ovl_index_all(dentry->d_sb))) { - index = ovl_lookup_index(ofs, upperdentry, origin, true); - if (IS_ERR(index)) { - err = PTR_ERR(index); - index = NULL; - goto out_put; + (!d->is_dir || ovl_index_all(dentry->d_sb))) { + ctx->index = ovl_lookup_index(ofs, ctx->upperdentry, origin, true); + if (IS_ERR(ctx->index)) { + err = PTR_ERR(ctx->index); + ctx->index = NULL; + return err; } } - if (ctr) { - oe = ovl_alloc_entry(ctr); + if (ctx->ctr) { + ctx->oe = ovl_alloc_entry(ctx->ctr); err = -ENOMEM; - if (!oe) - goto out_put; + if (!ctx->oe) + return err; - ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr); + ovl_stack_cpy(ovl_lowerstack(ctx->oe), ctx->stack, ctx->ctr); } if (upperopaque) ovl_dentry_set_opaque(dentry); - if (d.xwhiteouts) + if (d->xwhiteouts) ovl_dentry_set_xwhiteouts(dentry); - if (upperdentry) + if (ctx->upperdentry) ovl_dentry_set_upper_alias(dentry); - else if (index) { + else if (ctx->index) { + char *upperredirect; struct path upperpath = { - .dentry = upperdentry = dget(index), + .dentry = ctx->upperdentry = dget(ctx->index), .mnt = ovl_upper_mnt(ofs), }; @@ -1343,84 +1324,100 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * assignment happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ - d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); - if (IS_ERR(d.upperredirect)) { - err = PTR_ERR(d.upperredirect); - d.upperredirect = NULL; - goto out_free_oe; - } + upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); + if (IS_ERR(upperredirect)) + return PTR_ERR(upperredirect); + d->upperredirect = upperredirect; err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) - goto out_free_oe; - d.metacopy = uppermetacopy = err; + return err; + d->metacopy = uppermetacopy = err; metacopy_size = err; - if (!ovl_check_follow_redirect(&d)) { + if (!ovl_check_follow_redirect(d)) { err = -EPERM; - goto out_free_oe; + return err; } } - if (upperdentry || ctr) { + if (ctx->upperdentry || ctx->ctr) { + struct inode *inode; struct ovl_inode_params oip = { - .upperdentry = upperdentry, - .oe = oe, - .index = index, - .redirect = d.upperredirect, + .upperdentry = ctx->upperdentry, + .oe = ctx->oe, + .index = ctx->index, + .redirect = d->upperredirect, }; /* Store lowerdata redirect for lazy lookup */ - if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) { - oip.lowerdata_redirect = d.redirect; - d.redirect = NULL; + if (ctx->ctr > 1 && !d->is_dir && !ctx->stack[ctx->ctr - 1].dentry) { + oip.lowerdata_redirect = d->redirect; + d->redirect = NULL; } + inode = ovl_get_inode(dentry->d_sb, &oip); - err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_free_oe; - if (upperdentry && !uppermetacopy) - ovl_set_flag(OVL_UPPERDATA, inode); + return PTR_ERR(inode); + + ctx->inode = inode; + if (ctx->upperdentry && !uppermetacopy) + ovl_set_flag(OVL_UPPERDATA, ctx->inode); if (metacopy_size > OVL_METACOPY_MIN_SIZE) - ovl_set_flag(OVL_HAS_DIGEST, inode); + ovl_set_flag(OVL_HAS_DIGEST, ctx->inode); } - ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); + ovl_dentry_init_reval(dentry, ctx->upperdentry, OVL_I_E(ctx->inode)); + + return 0; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_entry *poe = OVL_E(dentry->d_parent); + bool 
check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); + int err; + struct ovl_lookup_ctx ctx = { + .dentry = dentry, + }; + struct ovl_lookup_data d = { + .sb = dentry->d_sb, + .dentry = dentry, + .name = dentry->d_name, + .last = check_redirect ? false : !ovl_numlower(poe), + }; + + if (dentry->d_name.len > ofs->namelen) + return ERR_PTR(-ENAMETOOLONG); + + with_ovl_creds(dentry->d_sb) + err = ovl_lookup_layers(&ctx, &d); - ovl_revert_creds(old_cred); - if (origin_path) { - dput(origin_path->dentry); - kfree(origin_path); + if (ctx.origin_path) { + dput(ctx.origin_path->dentry); + kfree(ctx.origin_path); } - dput(index); - ovl_stack_free(stack, ctr); + dput(ctx.index); + ovl_stack_free(ctx.stack, ctx.ctr); kfree(d.redirect); - return d_splice_alias(inode, dentry); -out_free_oe: - ovl_free_entry(oe); -out_put: - dput(index); - ovl_stack_free(stack, ctr); -out_put_upper: - if (origin_path) { - dput(origin_path->dentry); - kfree(origin_path); + if (err) { + ovl_free_entry(ctx.oe); + dput(ctx.upperdentry); + kfree(d.upperredirect); + return ERR_PTR(err); } - dput(upperdentry); - kfree(d.upperredirect); -out: - kfree(d.redirect); - ovl_revert_creds(old_cred); - return ERR_PTR(err); + + return d_splice_alias(ctx.inode, dentry); } bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); const struct qstr *name = &dentry->d_name; - const struct cred *old_cred; unsigned int i; bool positive = false; bool done = false; @@ -1436,46 +1433,45 @@ bool ovl_lower_positive(struct dentry *dentry) if (!ovl_dentry_upper(dentry)) return true; - old_cred = ovl_override_creds(dentry->d_sb); - /* Positive upper -> have to look up lower to see whether it exists */ - for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { - struct dentry *this; - struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; + with_ovl_creds(dentry->d_sb) { + /* Positive upper -> have to look up lower to see whether it exists */ + for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { + struct dentry *this; + struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; - /* - * We need to make a non-const copy of dentry->d_name, - * because lookup_one_positive_unlocked() will hash name - * with parentpath base, which is on another (lower fs). - */ - this = lookup_one_positive_unlocked( - mnt_idmap(parentpath->layer->mnt), - &QSTR_LEN(name->name, name->len), - parentpath->dentry); - if (IS_ERR(this)) { - switch (PTR_ERR(this)) { - case -ENOENT: - case -ENAMETOOLONG: - break; - - default: - /* - * Assume something is there, we just couldn't - * access it. - */ - positive = true; - break; + /* + * We need to make a non-const copy of dentry->d_name, + * because lookup_one_positive_unlocked() will hash name + * with parentpath base, which is on another (lower fs). + */ + this = lookup_one_positive_unlocked(mnt_idmap(parentpath->layer->mnt), + &QSTR_LEN(name->name, name->len), + parentpath->dentry); + if (IS_ERR(this)) { + switch (PTR_ERR(this)) { + case -ENOENT: + case -ENAMETOOLONG: + break; + + default: + /* + * Assume something is there, we just couldn't + * access it. 
+ */ + positive = true; + break; + } + } else { + struct path path = { + .dentry = this, + .mnt = parentpath->layer->mnt, + }; + positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); + done = true; + dput(this); } - } else { - struct path path = { - .dentry = this, - .mnt = parentpath->layer->mnt, - }; - positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); - done = true; - dput(this); } } - ovl_revert_creds(old_cred); return positive; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index c8fd5951fc5e..f9ac9bdde830 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -206,7 +206,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode, NULL); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -248,7 +248,7 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, { struct dentry *ret; - ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, NULL); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret)); return ret; } @@ -257,7 +257,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev, NULL); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; @@ -267,7 +267,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname, NULL); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -355,11 +355,24 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } +static inline int ovl_do_rename_rd(struct renamedata *rd) +{ + int err; + + pr_debug("rename(%pd2, %pd2, 0x%x)\n", rd->old_dentry, rd->new_dentry, + rd->flags); + err = vfs_rename(rd); + if (err) { + pr_debug("...rename(%pd2, %pd2, ...) = %i\n", + rd->old_dentry, rd->new_dentry, err); + } + return err; +} + static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, struct dentry *olddentry, struct dentry *newdir, struct dentry *newdentry, unsigned int flags) { - int err; struct renamedata rd = { .mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_parent = olddir, @@ -369,13 +382,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, .flags = flags, }; - pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); - err = vfs_rename(&rd); - if (err) { - pr_debug("...rename(%pd2, %pd2, ...) 
= %i\n", - olddentry, newdentry, err); - } - return err; + return ovl_do_rename_rd(&rd); } static inline int ovl_do_whiteout(struct ovl_fs *ofs, @@ -415,6 +422,22 @@ static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs, &QSTR_LEN(name, len), base); } +static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs, + struct dentry *parent, + struct qstr *name) +{ + return start_creating(ovl_upper_mnt_idmap(ofs), + parent, name); +} + +static inline struct dentry *ovl_start_removing_upper(struct ovl_fs *ofs, + struct dentry *parent, + struct qstr *name) +{ + return start_removing(ovl_upper_mnt_idmap(ofs), + parent, name); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) @@ -424,11 +447,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags) } /* util.c */ -int ovl_parent_lock(struct dentry *parent, struct dentry *child); -static inline void ovl_parent_unlock(struct dentry *parent) -{ - inode_unlock(parent->d_inode); -} int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); void ovl_start_write(struct dentry *dentry); @@ -437,7 +455,11 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); -void ovl_revert_creds(const struct cred *old_cred); + +EXTEND_CLASS(override_creds, _ovl, ovl_override_creds(sb), struct super_block *sb) + +#define with_ovl_creds(sb) \ + scoped_class(override_creds_ovl, __UNIQUE_ID(label), sb) static inline const struct cred *ovl_creds(struct super_block *sb) { @@ -865,7 +887,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry); -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); +#define OVL_TEMPNAME_SIZE 20 +void ovl_tempname(char name[OVL_TEMPNAME_SIZE]); struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr);
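As a reading aid, here is a minimal sketch of how a caller is expected to use the with_ovl_creds() scope class declared above. The helper below is hypothetical, not part of this patch; the point is that the creator credentials are overridden at scope entry and reverted on every exit path, early returns included:

        /* Hypothetical caller, assuming the with_ovl_creds() definition above. */
        static int ovl_example_getattr(struct dentry *dentry, struct kstat *stat)
        {
                struct path realpath;
                int err;

                ovl_path_real(dentry, &realpath);
                with_ovl_creds(dentry->d_sb)    /* overrides to ofs->creator_cred */
                        err = vfs_getattr(&realpath, stat, STATX_BASIC_STATS, 0);
                /* creds are reverted here, whichever path the body took */
                return err;
        }

This is what lets the conversions that follow drop the explicit ovl_revert_creds() call from every error path.

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 1e9792cc557b..160960bb0ad0 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -348,11 +348,7 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { - int err = 0; struct dentry *dentry, *dir = path->dentry; - const struct cred *old_cred; - - old_cred = ovl_override_creds(rdd->dentry->d_sb); while (rdd->first_maybe_whiteout) { struct ovl_cache_entry *p = @@ -365,13 +361,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); } else if (PTR_ERR(dentry) == -EINTR) { - err = -EINTR; - break; + return -EINTR; } } - ovl_revert_creds(old_cred); - return err; + return 0; } static inline int ovl_dir_read(const struct path *realpath, @@ -838,36 +832,12 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return err; } - -static int ovl_iterate(struct file *file, struct dir_context *ctx) +static int ovl_iterate_merged(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_cache_entry *p; - const struct cred *old_cred; - int err; - - old_cred = 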
ovl_override_creds(dentry->d_sb); - if (!ctx->pos) - ovl_dir_reset(file); - - if (od->is_real) { - /* - * If parent is merge, then need to adjust d_ino for '..', if - * dir is impure then need to adjust d_ino for copied up - * entries. - */ - if (ovl_xino_bits(ofs) || - (ovl_same_fs(ofs) && - (ovl_is_impure_dir(file) || - OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { - err = ovl_iterate_real(file, ctx); - } else { - err = iterate_dir(od->realfile, ctx); - } - goto out; - } + int err = 0; if (!od->cache) { struct ovl_dir_cache *cache; @@ -875,7 +845,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) cache = ovl_cache_get(dentry); err = PTR_ERR(cache); if (IS_ERR(cache)) - goto out; + return err; od->cache = cache; ovl_seek_cursor(od, ctx->pos); @@ -887,7 +857,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) if (!p->ino || p->check_xwhiteout) { err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) - goto out; + return err; } } /* ovl_cache_update() sets is_whiteout on stale entry */ @@ -898,12 +868,50 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) od->cursor = p->l_node.next; ctx->pos++; } - err = 0; -out: - ovl_revert_creds(old_cred); return err; } +static bool ovl_need_adjust_d_ino(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + + /* If parent is merge, then need to adjust d_ino for '..' */ + if (ovl_xino_bits(ofs)) + return true; + + /* Can't do consistent inode numbering */ + if (!ovl_same_fs(ofs)) + return false; + + /* If dir is impure then need to adjust d_ino for copied up entries */ + if (ovl_is_impure_dir(file) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))) + return true; + + /* Pure: no need to adjust d_ino */ + return false; +} + + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + + if (!ctx->pos) + ovl_dir_reset(file); + + with_ovl_creds(file_dentry(file)->d_sb) { + if (!od->is_real) + return ovl_iterate_merged(file, ctx); + + if (ovl_need_adjust_d_ino(file)) + return ovl_iterate_real(file, ctx); + + return iterate_dir(od->realfile, ctx); + } +} + static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) { loff_t res; @@ -947,14 +955,8 @@ out_unlock: static struct file *ovl_dir_open_realfile(const struct file *file, const struct path *realpath) { - struct file *res; - const struct cred *old_cred; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); - ovl_revert_creds(old_cred); - - return res; + with_ovl_creds(file_inode(file)->i_sb) + return ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); } /* @@ -1075,11 +1077,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; - const struct cred *old_cred; - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_dir_read_merged(dentry, list, &root); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_dir_read_merged(dentry, list, &root); if (err) return err; @@ -1242,11 +1242,11 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, if (!d_is_dir(dentry) || level > 1) return ovl_cleanup(ofs, parent, dentry); - err = ovl_parent_lock(parent, dentry); - if (err) - return err; + dentry = start_removing_dentry(parent, dentry); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); err = 
ovl_do_rmdir(ofs, parent->d_inode, dentry); - ovl_parent_unlock(parent); + end_removing(dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 43ee4c7296a7..28b2f707cfbc 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -310,8 +310,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, bool retried = false; retry: - inode_lock_nested(dir, I_MUTEX_PARENT); - work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); + work = ovl_start_creating_upper(ofs, ofs->workbasedir, &QSTR(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -320,14 +319,12 @@ retry: }; if (work->d_inode) { + end_creating_keep(work); + if (persist) + return work; err = -EEXIST; - inode_unlock(dir); if (retried) goto out_dput; - - if (persist) - return work; - retried = true; err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0); dput(work); @@ -338,7 +335,7 @@ retry: } work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); - inode_unlock(dir); + end_creating_keep(work); err = PTR_ERR(work); if (IS_ERR(work)) goto out_err; @@ -376,7 +373,6 @@ retry: if (err) goto out_dput; } else { - inode_unlock(dir); err = PTR_ERR(work); goto out_err; } @@ -567,9 +563,10 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { struct dentry *workdir = ofs->workdir; struct dentry *temp; - struct dentry *dest; struct dentry *whiteout; struct name_snapshot name; + struct renamedata rd = {}; + char name2[OVL_TEMPNAME_SIZE]; int err; temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); @@ -577,23 +574,21 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) if (IS_ERR(temp)) return err; - err = ovl_parent_lock(workdir, temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = workdir; + rd.flags = RENAME_WHITEOUT; + ovl_tempname(name2); + err = start_renaming_dentry(&rd, 0, temp, &QSTR(name2)); if (err) { dput(temp); return err; } - dest = ovl_lookup_temp(ofs, workdir); - err = PTR_ERR(dest); - if (IS_ERR(dest)) { - dput(temp); - ovl_parent_unlock(workdir); - return err; - } /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT); - ovl_parent_unlock(workdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) { if (err == -EINVAL) err = 0; @@ -617,7 +612,6 @@ cleanup_temp: ovl_cleanup(ofs, workdir, temp); release_dentry_name_snapshot(&name); dput(temp); - dput(dest); return err; } @@ -626,14 +620,15 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, struct dentry *parent, const char *name, umode_t mode) { - size_t len = strlen(name); struct dentry *child; - inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - child = ovl_lookup_upper(ofs, name, parent, len); - if (!IS_ERR(child) && !child->d_inode) - child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode)); - inode_unlock(parent->d_inode); + child = ovl_start_creating_upper(ofs, parent, &QSTR(name)); + if (!IS_ERR(child)) { + if (!child->d_inode) + child = ovl_create_real(ofs, parent, child, + OVL_CATTR(mode)); + end_creating_keep(child); + } dput(parent); return child; }
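A hedged sketch of the create-helper pattern the hunks above convert to; the function name and mode below are hypothetical, but the shape mirrors ovl_lookup_or_create(): the start helper locks the parent and returns the looked-up child in one step, and end_creating_keep() drops the lock while keeping the dentry reference:

        static struct dentry *ovl_example_mkreg(struct ovl_fs *ofs,
                                                struct dentry *parent)
        {
                struct dentry *d;

                /* lookup + parent lock in one step */
                d = ovl_start_creating_upper(ofs, parent, &QSTR("example"));
                if (IS_ERR(d))
                        return d;
                if (!d->d_inode)
                        d = ovl_create_real(ofs, parent, d,
                                            OVL_CATTR(S_IFREG | 0600));
                end_creating_keep(d);   /* unlock parent, keep the reference */
                return d;
        }

@@ -1369,53 +1364,35 @@ static void ovl_set_d_op(struct super_block *sb) { set_default_d_op(sb, &ovl_dentry_operations); } -int ovl_fill_super(struct super_block *sb, struct fs_context *fc) +static int ovl_fill_super_creds(struct fs_context *fc, struct super_block *sb) { struct ovl_fs *ofs = 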
sb->s_fs_info; + struct cred *creator_cred = (struct cred *)ofs->creator_cred; struct ovl_fs_context *ctx = fc->fs_private; - const struct cred *old_cred = NULL; - struct dentry *root_dentry; - struct ovl_entry *oe; struct ovl_layer *layers; - struct cred *cred; + struct ovl_entry *oe = NULL; int err; - err = -EIO; - if (WARN_ON(fc->user_ns != current_user_ns())) - goto out_err; - - ovl_set_d_op(sb); - - err = -ENOMEM; - if (!ofs->creator_cred) - ofs->creator_cred = cred = prepare_creds(); - else - cred = (struct cred *)ofs->creator_cred; - if (!cred) - goto out_err; - - old_cred = ovl_override_creds(sb); - err = ovl_fs_params_verify(ctx, &ofs->config); if (err) - goto out_err; + return err; err = -EINVAL; if (ctx->nr == 0) { if (!(fc->sb_flags & SB_SILENT)) pr_err("missing 'lowerdir'\n"); - goto out_err; + return err; } err = -ENOMEM; layers = kcalloc(ctx->nr + 1, sizeof(struct ovl_layer), GFP_KERNEL); if (!layers) - goto out_err; + return err; ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); if (!ofs->config.lowerdirs) { kfree(layers); - goto out_err; + return err; } ofs->layers = layers; /* @@ -1448,12 +1425,12 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) err = -EINVAL; if (!ofs->config.workdir) { pr_err("missing 'workdir'\n"); - goto out_err; + return err; } err = ovl_get_upper(sb, ofs, &layers[0], &ctx->upper); if (err) - goto out_err; + return err; upper_sb = ovl_upper_mnt(ofs)->mnt_sb; if (!ovl_should_sync(ofs)) { @@ -1461,13 +1438,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { err = -EIO; pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); - goto out_err; + return err; } } err = ovl_get_workdir(sb, ofs, &ctx->upper, &ctx->work); if (err) - goto out_err; + return err; if (!ofs->workdir) sb->s_flags |= SB_RDONLY; @@ -1478,7 +1455,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) oe = ovl_get_lowerstack(sb, ctx, ofs, layers); err = PTR_ERR(oe); if (IS_ERR(oe)) - goto out_err; + return err; /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ovl_upper_mnt(ofs)) @@ -1531,7 +1508,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_export_op = &ovl_export_fid_operations; /* Never override disk quota limits or use reserved space */ - cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + cap_lower(creator_cred->cap_effective, CAP_SYS_RESOURCE); sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_xattr = ovl_xattr_handlers(ofs); @@ -1549,27 +1526,44 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_iflags |= SB_I_EVM_HMAC_UNSUPPORTED; err = -ENOMEM; - root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); - if (!root_dentry) + sb->s_root = ovl_get_root(sb, ctx->upper.dentry, oe); + if (!sb->s_root) goto out_free_oe; - sb->s_root = root_dentry; - - ovl_revert_creds(old_cred); return 0; out_free_oe: ovl_free_entry(oe); + return err; +} + +int ovl_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct ovl_fs *ofs = sb->s_fs_info; + int err; + + err = -EIO; + if (WARN_ON(fc->user_ns != current_user_ns())) + goto out_err; + + ovl_set_d_op(sb); + + if (!ofs->creator_cred) { + err = -ENOMEM; + ofs->creator_cred = prepare_creds(); + if (!ofs->creator_cred) + goto out_err; + } + + with_ovl_creds(sb) + err = ovl_fill_super_creds(fc, sb); + out_err: - /* - * Revert creds before calling ovl_free_fs() which will call - * put_cred() and 
put_cred() requires that the cred's that are - * put are not the caller's creds, i.e., current->cred. - */ - if (old_cred) - ovl_revert_creds(old_cred); - ovl_free_fs(ofs); - sb->s_fs_info = NULL; + if (err) { + ovl_free_fs(ofs); + sb->s_fs_info = NULL; + } + return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f76672f2e686..94986d11a166 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -69,11 +69,6 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } -void ovl_revert_creds(const struct cred *old_cred) -{ - revert_creds(old_cred); -} - /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -1019,8 +1014,8 @@ bool ovl_inuse_trylock(struct dentry *dentry) bool locked = false; spin_lock(&inode->i_lock); - if (!(inode->i_state & I_OVL_INUSE)) { - inode->i_state |= I_OVL_INUSE; + if (!(inode_state_read(inode) & I_OVL_INUSE)) { + inode_state_set(inode, I_OVL_INUSE); locked = true; } spin_unlock(&inode->i_lock); @@ -1034,8 +1029,8 @@ void ovl_inuse_unlock(struct dentry *dentry) struct inode *inode = d_inode(dentry); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_OVL_INUSE)); - inode->i_state &= ~I_OVL_INUSE; + WARN_ON(!(inode_state_read(inode) & I_OVL_INUSE)); + inode_state_clear(inode, I_OVL_INUSE); spin_unlock(&inode->i_lock); } } @@ -1046,7 +1041,7 @@ bool ovl_is_inuse(struct dentry *dentry) bool inuse; spin_lock(&inode->i_lock); - inuse = (inode->i_state & I_OVL_INUSE); + inuse = (inode_state_read(inode) & I_OVL_INUSE); spin_unlock(&inode->i_lock); return inuse; @@ -1147,7 +1142,6 @@ fail: int ovl_nlink_start(struct dentry *dentry) { struct inode *inode = d_inode(dentry); - const struct cred *old_cred; int err; if (WARN_ON(!inode)) @@ -1184,15 +1178,14 @@ int ovl_nlink_start(struct dentry *dentry) if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) return 0; - old_cred = ovl_override_creds(dentry->d_sb); /* * The overlay inode nlink should be incremented/decremented IFF the * upper operation succeeds, along with nlink change of upper inode. * Therefore, before link/unlink/rename, we store the union nlink * value relative to the upper inode nlink in an upper inode xattr. 
*/ - err = ovl_set_nlink_upper(dentry); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_set_nlink_upper(dentry); if (err) goto out_drop_write; @@ -1213,11 +1206,8 @@ void ovl_nlink_end(struct dentry *dentry) ovl_drop_write(dentry); if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { - const struct cred *old_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - ovl_cleanup_index(dentry); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + ovl_cleanup_index(dentry); } ovl_inode_unlock(inode); @@ -1234,9 +1224,9 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work, goto err; if (trap) goto err_unlock; - if (work && work->d_parent != workdir) + if (work && (work->d_parent != workdir || d_unhashed(work))) goto err_unlock; - if (upper && upper->d_parent != upperdir) + if (upper && (upper->d_parent != upperdir || d_unhashed(upper))) goto err_unlock; return 0; @@ -1548,14 +1538,3 @@ void ovl_copyattr(struct inode *inode) i_size_write(inode, i_size_read(realinode)); spin_unlock(&inode->i_lock); } - -int ovl_parent_lock(struct dentry *parent, struct dentry *child) -{ - inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - if (!child || - (!d_unhashed(child) && child->d_parent == parent)) - return 0; - - inode_unlock(parent->d_inode); - return -EINVAL; -} diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c index 88055deca936..aa95855c7023 100644 --- a/fs/overlayfs/xattrs.c +++ b/fs/overlayfs/xattrs.c @@ -41,13 +41,11 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char struct dentry *upperdentry = ovl_i_dentry_upper(inode); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); struct path realpath; - const struct cred *old_cred; if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); if (err < 0) goto out; } @@ -64,15 +62,14 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char if (err) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - if (value) { - err = ovl_do_setxattr(ofs, realdentry, name, value, size, - flags); - } else { - WARN_ON(flags != XATTR_REPLACE); - err = ovl_do_removexattr(ofs, realdentry, name); + with_ovl_creds(dentry->d_sb) { + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, flags); + } else { + WARN_ON(flags != XATTR_REPLACE); + err = ovl_do_removexattr(ofs, realdentry, name); + } } - ovl_revert_creds(old_cred); ovl_drop_write(dentry); /* copy c/mtime */ @@ -84,15 +81,11 @@ out: static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { - ssize_t res; - const struct cred *old_cred; struct path realpath; ovl_i_path_real(inode, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); - ovl_revert_creds(old_cred); - return res; + with_ovl_creds(dentry->d_sb) + return vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); } static bool ovl_can_list(struct super_block *sb, const char *s) @@ -116,12 +109,10 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) ssize_t res; size_t len; char *s; - const struct cred *old_cred; size_t prefix_len, name_len; - old_cred = 
ovl_override_creds(dentry->d_sb); - res = vfs_listxattr(realdentry, list, size); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + res = vfs_listxattr(realdentry, list, size); if (res <= 0 || size == 0) return res; diff --git a/fs/pidfs.c b/fs/pidfs.c index 0ef5b47d796a..dba703d4ce4a 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -39,20 +39,20 @@ void pidfs_get_root(struct path *path) path_get(path); } -/* - * Stashes information that userspace needs to access even after the - * process has been reaped. - */ -struct pidfs_exit_info { - __u64 cgroupid; - __s32 exit_code; - __u32 coredump_mask; +enum pidfs_attr_mask_bits { + PIDFS_ATTR_BIT_EXIT = 0, + PIDFS_ATTR_BIT_COREDUMP = 1, }; struct pidfs_attr { + unsigned long attr_mask; struct simple_xattrs *xattrs; - struct pidfs_exit_info __pei; - struct pidfs_exit_info *exit_info; + struct /* exit info */ { + __u64 cgroupid; + __s32 exit_code; + }; + __u32 coredump_mask; + __u32 coredump_signal; }; static struct rb_root pidfs_ino_tree = RB_ROOT; @@ -293,6 +293,15 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) return 0; } +/* This must be updated whenever a new flag is added */ +#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \ + PIDFD_INFO_CREDS | \ + PIDFD_INFO_CGROUPID | \ + PIDFD_INFO_EXIT | \ + PIDFD_INFO_COREDUMP | \ + PIDFD_INFO_SUPPORTED_MASK | \ + PIDFD_INFO_COREDUMP_SIGNAL) + static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; @@ -300,12 +309,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; - struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; struct pidfs_attr *attr; const struct cred *c; __u64 mask; + BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2); + if (!uinfo) return -EINVAL; if (usize < PIDFD_INFO_SIZE_VER0) @@ -323,20 +333,24 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) attr = READ_ONCE(pid->attr); if (mask & PIDFD_INFO_EXIT) { - exit_info = READ_ONCE(attr->exit_info); - if (exit_info) { + if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) { + smp_rmb(); kinfo.mask |= PIDFD_INFO_EXIT; #ifdef CONFIG_CGROUPS - kinfo.cgroupid = exit_info->cgroupid; + kinfo.cgroupid = attr->cgroupid; kinfo.mask |= PIDFD_INFO_CGROUPID; #endif - kinfo.exit_code = exit_info->exit_code; + kinfo.exit_code = attr->exit_code; } } if (mask & PIDFD_INFO_COREDUMP) { - kinfo.mask |= PIDFD_INFO_COREDUMP; - kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask); + if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) { + smp_rmb(); + kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + kinfo.coredump_mask = attr->coredump_mask; + kinfo.coredump_signal = attr->coredump_signal; + } } task = get_pid_task(pid, PIDTYPE_PID);
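For illustration, a hypothetical userspace sketch of querying the new fields; the struct layout and flag names are taken from the hunks above, and error handling is kept minimal:

        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <linux/pidfd.h>

        static int query_pidfd(int pidfd)
        {
                struct pidfd_info info = {
                        /* request exit info plus the supported-flags mask */
                        .mask = PIDFD_INFO_EXIT | PIDFD_INFO_SUPPORTED_MASK,
                };

                if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0)
                        return -1;
                /* only bits echoed back in info.mask were actually filled in */
                if (info.mask & PIDFD_INFO_SUPPORTED_MASK)
                        printf("supported: 0x%llx\n",
                               (unsigned long long)info.supported_mask);
                if (info.mask & PIDFD_INFO_EXIT)
                        printf("exit_code: %d\n", info.exit_code);
                return 0;
        }

@@ -355,14 +369,15 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!c) return -ESRCH; - if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { - task_lock(task); + if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) { + guard(task_lock)(task); if (task->mm) { unsigned long flags = __mm_flags_get_dumpable(task->mm); kinfo.coredump_mask = pidfs_coredump_mask(flags); + kinfo.mask |= PIDFD_INFO_COREDUMP; + /* No coredump actually took place, so no coredump signal. 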
*/ } - task_unlock(task); } /* Unconditionally return identifiers and credentials, the rest only on request */ @@ -409,6 +424,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) return -ESRCH; copy_out: + if (mask & PIDFD_INFO_SUPPORTED_MASK) { + kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK; + kinfo.supported_mask = PIDFD_INFO_SUPPORTED; + } + + /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */ + WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask); /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that @@ -454,7 +476,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; struct ns_common *ns_common = NULL; - struct pid_namespace *pid_ns; if (!pidfs_ioctl_valid(cmd)) return -ENOIOCTLCMD; @@ -496,66 +517,64 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { /* Namespaces that hang off nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: - if (IS_ENABLED(CONFIG_CGROUPS)) { - get_cgroup_ns(nsp->cgroup_ns); - ns_common = to_ns_common(nsp->cgroup_ns); - } + if (!ns_ref_get(nsp->cgroup_ns)) + break; + ns_common = to_ns_common(nsp->cgroup_ns); break; case PIDFD_GET_IPC_NAMESPACE: - if (IS_ENABLED(CONFIG_IPC_NS)) { - get_ipc_ns(nsp->ipc_ns); - ns_common = to_ns_common(nsp->ipc_ns); - } + if (!ns_ref_get(nsp->ipc_ns)) + break; + ns_common = to_ns_common(nsp->ipc_ns); break; case PIDFD_GET_MNT_NAMESPACE: - get_mnt_ns(nsp->mnt_ns); + if (!ns_ref_get(nsp->mnt_ns)) + break; ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: - if (IS_ENABLED(CONFIG_NET_NS)) { - ns_common = to_ns_common(nsp->net_ns); - get_net_ns(ns_common); - } + if (!ns_ref_get(nsp->net_ns)) + break; + ns_common = to_ns_common(nsp->net_ns); break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: - if (IS_ENABLED(CONFIG_PID_NS)) { - get_pid_ns(nsp->pid_ns_for_children); - ns_common = to_ns_common(nsp->pid_ns_for_children); - } + if (!ns_ref_get(nsp->pid_ns_for_children)) + break; + ns_common = to_ns_common(nsp->pid_ns_for_children); break; case PIDFD_GET_TIME_NAMESPACE: - if (IS_ENABLED(CONFIG_TIME_NS)) { - get_time_ns(nsp->time_ns); - ns_common = to_ns_common(nsp->time_ns); - } + if (!ns_ref_get(nsp->time_ns)) + break; + ns_common = to_ns_common(nsp->time_ns); break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: - if (IS_ENABLED(CONFIG_TIME_NS)) { - get_time_ns(nsp->time_ns_for_children); - ns_common = to_ns_common(nsp->time_ns_for_children); - } + if (!ns_ref_get(nsp->time_ns_for_children)) + break; + ns_common = to_ns_common(nsp->time_ns_for_children); break; case PIDFD_GET_UTS_NAMESPACE: - if (IS_ENABLED(CONFIG_UTS_NS)) { - get_uts_ns(nsp->uts_ns); - ns_common = to_ns_common(nsp->uts_ns); - } + if (!ns_ref_get(nsp->uts_ns)) + break; + ns_common = to_ns_common(nsp->uts_ns); break; /* Namespaces that don't hang off nsproxy. 
*/ case PIDFD_GET_USER_NAMESPACE: - if (IS_ENABLED(CONFIG_USER_NS)) { - rcu_read_lock(); - ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); - rcu_read_unlock(); + scoped_guard(rcu) { + struct user_namespace *user_ns; + + user_ns = task_cred_xxx(task, user_ns); + if (!ns_ref_get(user_ns)) + break; + ns_common = to_ns_common(user_ns); } break; case PIDFD_GET_PID_NAMESPACE: - if (IS_ENABLED(CONFIG_PID_NS)) { - rcu_read_lock(); + scoped_guard(rcu) { + struct pid_namespace *pid_ns; + pid_ns = task_active_pid_ns(task); - if (pid_ns) - ns_common = to_ns_common(get_pid_ns(pid_ns)); - rcu_read_unlock(); + if (!ns_ref_get(pid_ns)) + break; + ns_common = to_ns_common(pid_ns); } break; default: @@ -606,24 +625,25 @@ void pidfs_exit(struct task_struct *tsk) { struct pid *pid = task_pid(tsk); struct pidfs_attr *attr; - struct pidfs_exit_info *exit_info; #ifdef CONFIG_CGROUPS struct cgroup *cgrp; #endif might_sleep(); - guard(spinlock_irq)(&pid->wait_pidfd.lock); - attr = pid->attr; - if (!attr) { - /* - * No one ever held a pidfd for this struct pid. - * Mark it as dead so no one can add a pidfs - * entry anymore. We're about to be reaped and - * so no exit information would be available. - */ - pid->attr = PIDFS_PID_DEAD; - return; + /* Synchronize with pidfs_register_pid(). */ + scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { + attr = pid->attr; + if (!attr) { + /* + * No one ever held a pidfd for this struct pid. + * Mark it as dead so no one can add a pidfs + * entry anymore. We're about to be reaped and + * so no exit information would be available. + */ + pid->attr = PIDFS_PID_DEAD; + return; + } } /* @@ -634,41 +654,39 @@ void pidfs_exit(struct task_struct *tsk) * is put */ - exit_info = &attr->__pei; - #ifdef CONFIG_CGROUPS rcu_read_lock(); cgrp = task_dfl_cgroup(tsk); - exit_info->cgroupid = cgroup_id(cgrp); + attr->cgroupid = cgroup_id(cgrp); rcu_read_unlock(); #endif - exit_info->exit_code = tsk->exit_code; + attr->exit_code = tsk->exit_code; /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ - smp_store_release(&attr->exit_info, &attr->__pei); + smp_wmb(); + set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask); } #ifdef CONFIG_COREDUMP void pidfs_coredump(const struct coredump_params *cprm) { struct pid *pid = cprm->pid; - struct pidfs_exit_info *exit_info; struct pidfs_attr *attr; - __u32 coredump_mask = 0; attr = READ_ONCE(pid->attr); VFS_WARN_ON_ONCE(!attr); VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); - exit_info = &attr->__pei; - /* Note how we were coredumped. */ - coredump_mask = pidfs_coredump_mask(cprm->mm_flags); - /* Note that we actually did coredump. */ - coredump_mask |= PIDFD_COREDUMPED; + /* Note how we were coredumped and that we coredumped. */ + attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) | + PIDFD_COREDUMPED; /* If coredumping is set to skip we should never end up here. */ - VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP); - smp_store_release(&exit_info->coredump_mask, coredump_mask); + VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
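The writers above and the reader in pidfd_info() form a classic ordered-publication pair; condensed into a hedged sketch, with the types taken from the hunks above:

        /* Writer side (cf. pidfs_exit()): payload first, then the flag. */
        static void publish_exit(struct pidfs_attr *attr, __s32 code)
        {
                attr->exit_code = code;
                smp_wmb();      /* order the payload before the flag bit */
                set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask);
        }

        /* Reader side (cf. pidfd_info()): flag first, then the payload. */
        static bool read_exit(struct pidfs_attr *attr, __s32 *code)
        {
                if (!test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
                        return false;
                smp_rmb();      /* pairs with the smp_wmb() in the writer */
                *code = attr->exit_code;
                return true;
        }

+ /* Expose the signal number that caused the coredump. 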
*/ + attr->coredump_signal = cprm->siginfo->si_signo; + smp_wmb(); + set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask); } #endif @@ -1022,6 +1040,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; + ctx->s_d_flags |= DCACHE_DONTCACHE; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; diff --git a/fs/pipe.c b/fs/pipe.c index 42fead1efe52..2d0fed2ecbfd 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -908,7 +908,7 @@ static struct inode * get_pipe_inode(void) * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. */ - inode->i_state = I_DIRTY; + inode_state_assign_raw(inode, I_DIRTY); inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4050942ab52f..768f027c1428 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1091,7 +1091,7 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, int acl_type; int error; struct inode *inode = d_inode(dentry); - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) @@ -1141,7 +1141,7 @@ retry_deleg: out_inode_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -1212,7 +1212,7 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, int acl_type; int error; struct inode *inode = d_inode(dentry); - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) @@ -1249,7 +1249,7 @@ retry_deleg: out_inode_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/proc/array.c b/fs/proc/array.c index 2ae63189091e..cbd4bc4a58e4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -481,7 +481,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; - unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -538,10 +537,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - do { - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); - + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { cmin_flt = sig->cmin_flt; cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; @@ -563,8 +559,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } rcu_read_unlock(); } - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + } if (whole) { thread_group_cputime_adjusted(task, &utime, &stime);
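A hedged sketch of the scoped seqlock read form adopted above, with a hypothetical stats structure; going by the conversion, the body is re-run while writers are active, and the ss_lock_irqsave variant falls back to taking the lock with interrupts disabled instead of retrying indefinitely:

        struct example_stats {
                seqlock_t lock;
                u64 reads, writes;
        };

        static void example_stats_snapshot(struct example_stats *s,
                                           u64 *reads, u64 *writes)
        {
                scoped_seqlock_read (&s->lock, ss_lock_irqsave) {
                        *reads = s->reads;      /* re-run on writer activity */
                        *writes = s->writes;
                }
        }

diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97..407b41cb6e7c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3043,21 +3043,14 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; - unsigned int seq = 1; - unsigned long flags; - - rcu_read_lock(); - do { - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - 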
flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { acct = sig->ioac; __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } else { acct = task->ioac; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e399e2dd3a12..31d78da203ea 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -290,7 +290,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; qnx4_inode = qnx4_raw_inode(inode); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 3310d1ad4d0e..88d285005083 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -521,7 +521,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = QNX6_I(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6c4a6ee1fa2b..376739f6420e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1033,7 +1033,7 @@ static int add_dquot_ref(struct super_block *sb, int type) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0d0ef54fc4de..b2d178d3556e 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -24,7 +24,8 @@ #include "internal.h" struct rdt_parse_data { - struct rdtgroup *rdtgrp; + u32 closid; + enum rdtgrp_mode mode; char *buf; }; @@ -77,8 +78,8 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_ctrl_domain *d) { struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; + u32 closid = data->closid; u32 bw_val; cfg = &d->staged_config[s->conf_type]; @@ -156,9 +157,10 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_ctrl_domain *d) { - struct rdtgroup *rdtgrp = data->rdtgrp; + enum rdtgrp_mode mode = data->mode; struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; + u32 closid = data->closid; u32 cbm_val; cfg = &d->staged_config[s->conf_type]; @@ -171,7 +173,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, * Cannot set up more than one pseudo-locked region in a cache * hierarchy. 
*/ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + if (mode == RDT_MODE_PSEUDO_LOCKSETUP && rdtgroup_pseudo_locked_in_hierarchy(d)) { rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); return -EINVAL; @@ -180,8 +182,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && + if ((mode == RDT_MODE_EXCLUSIVE || mode == RDT_MODE_SHAREABLE) && rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); return -EINVAL; @@ -191,14 +192,14 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, * The CBM may not overlap with the CBM of another closid if * either is exclusive. */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, closid, true)) { rdt_last_cmd_puts("Overlaps with exclusive group\n"); return -EINVAL; } - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, closid, false)) { + if (mode == RDT_MODE_EXCLUSIVE || + mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_puts("Overlaps with other group\n"); return -EINVAL; } @@ -262,7 +263,8 @@ next: list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { data.buf = dom; - data.rdtgrp = rdtgrp; + data.closid = rdtgrp->closid; + data.mode = rdtgrp->mode; if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { @@ -381,7 +383,8 @@ out: return ret ?: nbytes; } -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, + char *resource_name, int closid) { struct rdt_resource *r = schema->res; struct rdt_ctrl_domain *dom; @@ -391,7 +394,8 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - seq_printf(s, "%*s:", max_name_width, schema->name); + if (resource_name) + seq_printf(s, "%*s:", max_name_width, resource_name); list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -437,7 +441,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, closid = rdtgrp->closid; list_for_each_entry(schema, &resctrl_schema_all, list) { if (closid < schema->num_closid) - show_doms(s, schema, closid); + show_doms(s, schema, schema->name, closid); } } } else { @@ -676,3 +680,280 @@ out: rdtgroup_kn_unlock(of->kn); return ret; } + +int resctrl_io_alloc_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + mutex_lock(&rdtgroup_mutex); + + if (r->cache.io_alloc_capable) { + if (resctrl_arch_get_io_alloc_enabled(r)) + seq_puts(seq, "enabled\n"); + else + seq_puts(seq, "disabled\n"); + } else { + seq_puts(seq, "not supported\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} +
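A worked illustration of the CLOSID arithmetic the io_alloc code below relies on (values hypothetical): a cache exposing 16 CLOSIDs dedicates CLOSID 15 to io_alloc; with CDP enabled, code and data each get 16/2 = 8 CLOSIDs, so the highest usable one is 16/2 - 1 = 7:

        static u32 io_alloc_closid_example(u32 num_closid, bool cdp_enabled)
        {
                /* mirrors resctrl_io_alloc_closid() below: the highest CLOSID,
                 * halved when CDP splits the resource into code and data */
                return (cdp_enabled ? num_closid / 2 : num_closid) - 1;
        }

+/* + * resctrl_io_alloc_closid_supported() - io_alloc feature utilizes the + * highest CLOSID value to direct I/O traffic. Ensure that io_alloc_closid + * is in the supported range. 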
+ */ +static bool resctrl_io_alloc_closid_supported(u32 io_alloc_closid) +{ + return io_alloc_closid < closids_supported(); +} + +/* + * Initialize io_alloc CLOSID cache resource CBM with all usable (shared + * and unused) cache portions. + */ +static int resctrl_io_alloc_init_cbm(struct resctrl_schema *s, u32 closid) +{ + enum resctrl_conf_type peer_type; + struct rdt_resource *r = s->res; + struct rdt_ctrl_domain *d; + int ret; + + rdt_staged_configs_clear(); + + ret = rdtgroup_init_cat(s, closid); + if (ret < 0) + goto out; + + /* Keep CDP_CODE and CDP_DATA of io_alloc CLOSID's CBM in sync. */ + if (resctrl_arch_get_cdp_enabled(r->rid)) { + peer_type = resctrl_peer_type(s->conf_type); + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) + memcpy(&d->staged_config[peer_type], + &d->staged_config[s->conf_type], + sizeof(d->staged_config[0])); + } + + ret = resctrl_arch_update_domains(r, closid); +out: + rdt_staged_configs_clear(); + return ret; +} + +/* + * resctrl_io_alloc_closid() - io_alloc feature routes I/O traffic using + * the highest available CLOSID. Retrieve the maximum CLOSID supported by the + * resource. Note that if Code Data Prioritization (CDP) is enabled, the number + * of available CLOSIDs is reduced by half. + */ +u32 resctrl_io_alloc_closid(struct rdt_resource *r) +{ + if (resctrl_arch_get_cdp_enabled(r->rid)) + return resctrl_arch_get_num_closid(r) / 2 - 1; + else + return resctrl_arch_get_num_closid(r) - 1; +} + +ssize_t resctrl_io_alloc_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + char const *grp_name; + u32 io_alloc_closid; + bool enable; + int ret; + + ret = kstrtobool(buf, &enable); + if (ret) + return ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + /* If the feature is already up to date, no action is needed. */ + if (resctrl_arch_get_io_alloc_enabled(r) == enable) + goto out_unlock; + + io_alloc_closid = resctrl_io_alloc_closid(r); + if (!resctrl_io_alloc_closid_supported(io_alloc_closid)) { + rdt_last_cmd_printf("io_alloc CLOSID (ctrl_hw_id) %u is not available\n", + io_alloc_closid); + ret = -EINVAL; + goto out_unlock; + } + + if (enable) { + if (!closid_alloc_fixed(io_alloc_closid)) { + grp_name = rdtgroup_name_by_closid(io_alloc_closid); + WARN_ON_ONCE(!grp_name); + rdt_last_cmd_printf("CLOSID (ctrl_hw_id) %u for io_alloc is used by %s group\n", + io_alloc_closid, grp_name ? 
grp_name : "another"); + ret = -ENOSPC; + goto out_unlock; + } + + ret = resctrl_io_alloc_init_cbm(s, io_alloc_closid); + if (ret) { + rdt_last_cmd_puts("Failed to initialize io_alloc allocations\n"); + closid_free(io_alloc_closid); + goto out_unlock; + } + } else { + closid_free(io_alloc_closid); + } + + ret = resctrl_arch_io_alloc_enable(r, enable); + if (enable && ret) { + rdt_last_cmd_puts("Failed to enable io_alloc feature\n"); + closid_free(io_alloc_closid); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_io_alloc_cbm_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + if (!resctrl_arch_get_io_alloc_enabled(r)) { + rdt_last_cmd_printf("io_alloc is not enabled on %s\n", s->name); + ret = -EINVAL; + goto out_unlock; + } + + /* + * When CDP is enabled, the CBMs of the highest CLOSID of CDP_CODE and + * CDP_DATA are kept in sync. As a result, the io_alloc CBMs shown for + * either CDP resource are identical and accurately represent the CBMs + * used for I/O. + */ + show_doms(seq, s, NULL, resctrl_io_alloc_closid(r)); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static int resctrl_io_alloc_parse_line(char *line, struct rdt_resource *r, + struct resctrl_schema *s, u32 closid) +{ + enum resctrl_conf_type peer_type; + struct rdt_parse_data data; + struct rdt_ctrl_domain *d; + char *dom = NULL, *id; + unsigned long dom_id; + +next: + if (!line || line[0] == '\0') + return 0; + + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + + dom = strim(dom); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { + data.buf = dom; + data.mode = RDT_MODE_SHAREABLE; + data.closid = closid; + if (parse_cbm(&data, s, d)) + return -EINVAL; + /* + * Keep io_alloc CLOSID's CBM of CDP_CODE and CDP_DATA + * in sync. 
+ */ + if (resctrl_arch_get_cdp_enabled(r->rid)) { + peer_type = resctrl_peer_type(s->conf_type); + memcpy(&d->staged_config[peer_type], + &d->staged_config[s->conf_type], + sizeof(d->staged_config[0])); + } + goto next; + } + } + + return -EINVAL; +} + +ssize_t resctrl_io_alloc_cbm_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + u32 io_alloc_closid; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + if (!resctrl_arch_get_io_alloc_enabled(r)) { + rdt_last_cmd_printf("io_alloc is not enabled on %s\n", s->name); + ret = -EINVAL; + goto out_unlock; + } + + io_alloc_closid = resctrl_io_alloc_closid(r); + + rdt_staged_configs_clear(); + ret = resctrl_io_alloc_parse_line(buf, r, s, io_alloc_closid); + if (ret) + goto out_clear_configs; + + ret = resctrl_arch_update_domains(r, io_alloc_closid); + +out_clear_configs: + rdt_staged_configs_clear(); +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index cf1fd82dc5a9..bff4a54ae333 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -390,6 +390,8 @@ void rdt_staged_configs_clear(void); bool closid_allocated(unsigned int closid); +bool closid_alloc_fixed(u32 closid); + int resctrl_find_cleanest_closid(void); void *rdt_kn_parent_priv(struct kernfs_node *kn); @@ -426,6 +428,21 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_io_alloc_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid); + +enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type); + +ssize_t resctrl_io_alloc_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +const char *rdtgroup_name_by_closid(u32 closid); +int resctrl_io_alloc_cbm_show(struct kernfs_open_file *of, struct seq_file *seq, + void *v); +ssize_t resctrl_io_alloc_cbm_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); +u32 resctrl_io_alloc_closid(struct rdt_resource *r); #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 0320360cd7a6..8e39dfda56bc 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -226,6 +226,11 @@ bool closid_allocated(unsigned int closid) return !test_bit(closid, closid_free_map); } +bool closid_alloc_fixed(u32 closid) +{ + return __test_and_clear_bit(closid, closid_free_map); +} + /** * rdtgroup_mode_by_closid - Return mode of resource group with closid * @closid: closid of the resource group @@ -1057,15 +1062,17 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(seq, ';'); + hw_shareable = r->cache.shareable_bits; sw_shareable = 0; exclusive = 0; seq_printf(seq, 
"%d=", dom->hdr.id); for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) + if (!closid_allocated(i) || + (resctrl_arch_get_io_alloc_enabled(r) && + i == resctrl_io_alloc_closid(r))) continue; ctrl_val = resctrl_arch_get_config(r, dom, i, s->conf_type); @@ -1093,6 +1100,21 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, break; } } + + /* + * When the "io_alloc" feature is enabled, a portion of the cache + * is configured for shared use between hardware and software. + * Also, when CDP is enabled the CBMs of CDP_CODE and CDP_DATA + * resources are kept in sync. So, the CBMs for "io_alloc" can + * be accessed through either resource. + */ + if (resctrl_arch_get_io_alloc_enabled(r)) { + ctrl_val = resctrl_arch_get_config(r, dom, + resctrl_io_alloc_closid(r), + s->conf_type); + hw_shareable |= ctrl_val; + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { pseudo_locked = dom->plr ? dom->plr->cbm : 0; hwb = test_bit(i, &hw_shareable); @@ -1247,7 +1269,7 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) { switch (my_type) { case CDP_CODE: @@ -1838,6 +1860,18 @@ void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_ kernfs_put(mon_kn); } +const char *rdtgroup_name_by_closid(u32 closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdt_kn_name(rdtgrp->kn); + } + + return NULL; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { @@ -1948,6 +1982,20 @@ static struct rftype res_common_files[] = { .seq_show = rdt_thread_throttle_mode_show, }, { + .name = "io_alloc", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_io_alloc_show, + .write = resctrl_io_alloc_write, + }, + { + .name = "io_alloc_cbm", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_io_alloc_cbm_show, + .write = resctrl_io_alloc_cbm_write, + }, + { .name = "max_threshold_occupancy", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -2138,6 +2186,23 @@ static void thread_throttle_mode_init(void) RFTYPE_CTRL_INFO | RFTYPE_RES_MB); }
+ */ +static void io_alloc_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (r->cache.io_alloc_capable) { + resctrl_file_fflags_init("io_alloc", RFTYPE_CTRL_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("io_alloc_cbm", + RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE); + } +} + void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; @@ -3383,11 +3448,12 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) { unsigned int cbm_len = r->cache.cbm_len; unsigned long first_bit, zero_bit; - unsigned long val = _val; + unsigned long val; - if (!val) - return 0; + if (!_val || r->cache.arch_has_sparse_bitmasks) + return _val; + val = _val; first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); @@ -3480,7 +3546,7 @@ static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schem * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { struct rdt_ctrl_domain *d; int ret; @@ -4408,6 +4474,8 @@ int resctrl_init(void) thread_throttle_mode_init(); + io_alloc_init(); + ret = resctrl_mon_resource_init(); if (ret) return ret; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0addcc849ff2..360b00854115 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -302,7 +302,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) if (!i) return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) + if (!(inode_state_read_once(i) & I_NEW)) return i; /* precalculate the data offset */ diff --git a/fs/select.c b/fs/select.c index 082cf60c7e23..65019b8ba3f7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -776,17 +776,13 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, { // the path is hot enough for overhead of copy_from_user() to matter if (from) { - if (can_do_masked_user_access()) - from = masked_user_access_begin(from); - else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(to->p, &from->p, Efault); - unsafe_get_user(to->size, &from->size, Efault); - user_read_access_end(); + scoped_user_read_access(from, Efault) { + unsafe_get_user(to->p, &from->p, Efault); + unsafe_get_user(to->size, &from->size, Efault); + } } return 0; Efault: - user_read_access_end(); return -EFAULT; } diff --git a/fs/signalfd.c b/fs/signalfd.c index d469782f97f4..d69eab584bc6 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -250,8 +250,6 @@ static const struct file_operations signalfd_fops = { static int do_signalfd4(int ufd, sigset_t *mask, int flags) { - struct signalfd_ctx *ctx; - /* Check the SFD_* constants for consistency. 
*/ BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); @@ -263,7 +261,8 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) signotset(mask); if (ufd == -1) { - struct file *file; + int fd; + struct signalfd_ctx *ctx __free(kfree) = NULL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -271,22 +270,16 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) ctx->sigmask = *mask; - ufd = get_unused_fd_flags(flags & O_CLOEXEC); - if (ufd < 0) { - kfree(ctx); - return ufd; - } - - file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops, - ctx, O_RDWR | (flags & O_NONBLOCK), - FMODE_NOWAIT); - if (IS_ERR(file)) { - put_unused_fd(ufd); - kfree(ctx); - return PTR_ERR(file); - } - fd_install(ufd, file); + fd = FD_ADD(flags & O_CLOEXEC, + anon_inode_getfile_fmode( + "[signalfd]", &signalfd_fops, ctx, + O_RDWR | (flags & O_NONBLOCK), FMODE_NOWAIT)); + if (fd >= 0) + retain_and_null_ptr(ctx); + return fd; } else { + struct signalfd_ctx *ctx; + CLASS(fd, f)(ufd); if (fd_empty(f)) return -EBADF; diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 018055fd2cdb..e3ea6fe7edb4 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -16,6 +16,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); static void cfids_laundromat_worker(struct work_struct *work); +static void close_cached_dir_locked(struct cached_fid *cfid); struct cached_dir_dentry { struct list_head entry; @@ -388,7 +389,7 @@ out: * lease. Release one here, and the second below. */ cfid->has_lease = false; - close_cached_dir(cfid); + close_cached_dir_locked(cfid); } spin_unlock(&cfids->cfid_list_lock); @@ -480,18 +481,52 @@ void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; - close_cached_dir(cfid); + close_cached_dir_locked(cfid); } spin_unlock(&cfid->cfids->cfid_list_lock); close_cached_dir(cfid); } - +/** + * close_cached_dir - drop a reference to a cached dir + * + * The release function will be called with cfid_list_lock held to remove the + * cached dirs from the list before any other thread can take another @cfid + * ref. Must not be called with cfid_list_lock held; use + * close_cached_dir_locked() instead. + * + * @cfid: cached dir + */ void close_cached_dir(struct cached_fid *cfid) { + lockdep_assert_not_held(&cfid->cfids->cfid_list_lock); kref_put_lock(&cfid->refcount, smb2_close_cached_fid, &cfid->cfids->cfid_list_lock); } +/** + * close_cached_dir_locked - put a reference to a cached dir with + * cfid_list_lock held + * + * Calling close_cached_dir() with cfid_list_lock held can cause a deadlock + * if the invariant of refcount >= 2 is false. + * + * This function is used in paths that hold cfid_list_lock and expect at least + * two references. If that invariant is violated, it WARNs and returns without + * dropping a reference; the final put must still go through + * close_cached_dir(). 
+ * + * @cfid: cached dir + */ +static void close_cached_dir_locked(struct cached_fid *cfid) +{ + lockdep_assert_held(&cfid->cfids->cfid_list_lock); + + if (WARN_ON(kref_read(&cfid->refcount) < 2)) + return; + + kref_put(&cfid->refcount, smb2_close_cached_fid); +} + /* * Called from cifs_kill_sb when we unmount a share */ diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 9891f55bac1e..da935bd1ce87 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -90,7 +90,6 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, size_t desc_len; struct key *spnego_key; const char *hostname = server->hostname; - const struct cred *saved_cred; /* length of fields (with semicolons): ver=0xyz ip4=ipaddress host=hostname sec=mechanism uid=0xFF user=username */ @@ -158,9 +157,8 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp += sprintf(dp, ";upcall_target=app"); cifs_dbg(FYI, "key description = %s\n", description); - saved_cred = override_creds(spnego_cred); - spnego_key = request_key(&cifs_spnego_key_type, description, ""); - revert_creds(saved_cred); + scoped_with_creds(spnego_cred) + spnego_key = request_key(&cifs_spnego_key_type, description, ""); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 185ac41bd7e9..6eccb9ed9daa 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -500,7 +500,7 @@ cifs_evict_inode(struct inode *inode) { netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_NETFS_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); @@ -1149,6 +1149,9 @@ cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* Check if file is oplocked if this is request for new lease */ if (arg == F_UNLCK || ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7da194f29fef..dcc50a2bfa4b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1363,6 +1363,14 @@ do_retry: if (rdata->result == -ENODATA) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + trace_smb3_read_err(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->subreq.len - rdata->subreq.transferred, + rdata->result); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && @@ -1374,6 +1382,13 @@ do_retry: } if (rdata->got_bytes) __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); + trace_smb3_read_done(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, @@ -1445,6 +1460,13 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[1].iov_base = (char *)smb + 4; rdata->iov[1].iov_len = get_rfc1002_length(smb); + trace_smb3_read_enter(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + 
rdata->req->cfile->fid.netfid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start, rdata->subreq.len); + rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, cifs_readv_callback, NULL, rdata, 0, NULL); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 55cb4b0cbd48..388efb35fb98 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3104,7 +3104,7 @@ bind_socket(struct TCP_Server_Info *server) struct socket *socket = server->ssocket; rc = kernel_bind(socket, - (struct sockaddr *) &server->srcaddr, + (struct sockaddr_unsized *) &server->srcaddr, sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; @@ -3403,7 +3403,7 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = kernel_connect(socket, saddr, slen, + rc = kernel_connect(socket, (struct sockaddr_unsized *)saddr, slen, server->noblockcnt ? O_NONBLOCK : 0); /* * When mounting SMB root file systems, we do not want to block in @@ -4451,6 +4451,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) out: kfree(ctx->username); + kfree(ctx->domainname); kfree_sensitive(ctx->password); kfree(origin_fullpath); kfree(ctx); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 474dadeb1593..9dc0a968ec89 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/filelock.h> #include <linux/backing-dev.h> #include <linux/stat.h> diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 0f894d09157b..2a0d8b87bd8e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1834,6 +1834,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = NULL; kfree_sensitive(ctx->password2); ctx->password2 = NULL; + kfree(ctx->source); + ctx->source = NULL; + kfree(fc->source); + fc->source = NULL; return -EINVAL; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cac355364e43..b75482730912 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -6,6 +6,7 @@ * */ #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> @@ -101,7 +102,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cifs_dbg(FYI, "%s: inode %llu is new\n", __func__, cifs_i->uniqueid); return; @@ -146,7 +147,7 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) { /* only provide fake values on a new inode */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { if (fattr->cf_cifsattrs & ATTR_DIRECTORY) set_nlink(inode, 2); else @@ -167,12 +168,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - if (!(inode->i_state & I_NEW) && + if (!(inode_state_read_once(inode) & I_NEW) && unlikely(inode_wrong_type(inode, fattr->cf_mode))) { CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; cifs_revalidate_cache(inode, fattr); @@ -194,7 +195,7 @@ cifs_fattr_to_inode(struct inode 
*inode, struct cifs_fattr *fattr, inode->i_gid = fattr->cf_gid; /* if dynperm is set, don't clobber existing mode */ - if (inode->i_state & I_NEW || + if (inode_state_read(inode) & I_NEW || !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) inode->i_mode = fattr->cf_mode; @@ -236,7 +237,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, if (fattr->cf_flags & CIFS_FATTR_JUNCTION) inode->i_flags |= S_AUTOMOUNT; - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cifs_set_netfs_context(inode); cifs_set_ops(inode); } @@ -1638,7 +1639,7 @@ retry_iget5_locked: cifs_fattr_to_inode(inode, fattr, false); if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_ino = hash; cifs_fscache_get_inode_cookie(inode); unlock_new_inode(inode); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index ca8f3dd7ff63..78650527d4bb 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -7,6 +7,7 @@ #include <linux/pagemap.h> #include <linux/vfs.h> +#include <linux/fs_struct.h> #include <uapi/linux/magic.h> #include "cifsglob.h" #include "cifsproto.h" diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index f901ae18e68a..94454e8826b0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6092,8 +6092,8 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, - &path, 0); + rc = ksmbd_vfs_kern_path_start_removing(work, link_name, LOOKUP_NO_SYMLINKS, + &path, 0); if (rc) { if (rc != -ENOENT) goto out; @@ -6111,7 +6111,7 @@ static int smb2_create_link(struct ksmbd_work *work, ksmbd_debug(SMB, "link already exists\n"); goto out; } - ksmbd_vfs_kern_path_unlock(&path); + ksmbd_vfs_kern_path_end_removing(&path); } rc = ksmbd_vfs_link(work, target_name, link_name); if (rc) diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index d2e391c29464..6e03e93321b8 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -522,10 +522,10 @@ static int create_socket(struct interface *iface) } if (ipv4) - ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin, + ret = kernel_bind(ksmbd_socket, (struct sockaddr_unsized *)&sin, sizeof(sin)); else - ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin6, + ret = kernel_bind(ksmbd_socket, (struct sockaddr_unsized *)&sin6, sizeof(sin6)); if (ret) { pr_err("Failed to bind socket: %d\n", ret); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 891ed2dc2b73..03fd7409be79 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -49,27 +49,9 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, i_uid_write(inode, i_uid_read(parent_inode)); } -/** - * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable - * @parent: parent dentry - * @child: child dentry - * - * Returns: %0 on success, %-ENOENT if the parent dentry is not stable - */ -int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) -{ - inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - if (child->d_parent != parent) { - inode_unlock(d_inode(parent)); - return -ENOENT; - } - - return 0; -} - static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, char *pathname, unsigned int flags, - struct path *path, bool do_lock) + struct path *path, bool for_remove) { struct qstr last; struct filename *filename 
__free(putname) = NULL; @@ -99,22 +81,20 @@ static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, return -ENOENT; } - if (do_lock) { + if (for_remove) { err = mnt_want_write(path->mnt); if (err) { path_put(path); return -ENOENT; } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, path->dentry, 0); + d = start_removing_noperm(path->dentry, &last); if (!IS_ERR(d)) { dput(path->dentry); path->dentry = d; return 0; } - inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); return -ENOENT; @@ -188,8 +168,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), - dentry, mode, true); + err = vfs_create(mnt_idmap(path.mnt), dentry, mode, NULL); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); @@ -230,7 +209,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; d = dentry; - dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); + dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode, NULL); if (IS_ERR(dentry)) err = PTR_ERR(dentry); else if (d_is_negative(dentry)) @@ -609,7 +588,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { - err = vfs_rmdir(idmap, d_inode(parent), path->dentry); + err = vfs_rmdir(idmap, d_inode(parent), path->dentry, NULL); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "rmdir failed, err %d\n", err); } else { @@ -681,7 +660,6 @@ out1: int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, char *newname, int flags) { - struct dentry *old_parent, *new_dentry, *trap; struct dentry *old_child = old_path->dentry; struct path new_path; struct qstr new_last; @@ -691,7 +669,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, struct ksmbd_file *parent_fp; int new_type; int err, lookup_flags = LOOKUP_NO_SYMLINKS; - int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; if (ksmbd_override_fsids(work)) return -ENOMEM; @@ -702,14 +679,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto revert_fsids; } - /* - * explicitly handle file overwrite case, for compatibility with - * filesystems that may not support rename flags (e.g: fuse) - */ - if (flags & RENAME_NOREPLACE) - target_lookup_flags |= LOOKUP_EXCL; - flags &= ~(RENAME_NOREPLACE); - retry: err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, &new_path, &new_last, &new_type, @@ -726,17 +695,14 @@ retry: if (err) goto out2; - trap = lock_rename_child(old_child, new_path.dentry); - if (IS_ERR(trap)) { - err = PTR_ERR(trap); + rd.mnt_idmap = mnt_idmap(old_path->mnt); + rd.old_parent = NULL; + rd.new_parent = new_path.dentry; + rd.flags = flags; + rd.delegated_inode = NULL, + err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last); + if (err) goto out_drop_write; - } - - old_parent = dget(old_child->d_parent); - if (d_unhashed(old_child)) { - err = -EINVAL; - goto out3; - } parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent); if (parent_fp) { @@ -749,44 +715,17 @@ retry: ksmbd_fd_put(work, parent_fp); } - new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | target_lookup_flags); - if (IS_ERR(new_dentry)) { - err = PTR_ERR(new_dentry); - goto out3; - } - - if 
(d_is_symlink(new_dentry)) { + if (d_is_symlink(rd.new_dentry)) { err = -EACCES; - goto out4; - } - - if (old_child == trap) { - err = -EINVAL; - goto out4; - } - - if (new_dentry == trap) { - err = -ENOTEMPTY; - goto out4; + goto out3; } - rd.mnt_idmap = mnt_idmap(old_path->mnt), - rd.old_parent = old_parent, - rd.old_dentry = old_child, - rd.new_parent = new_path.dentry, - rd.new_dentry = new_dentry, - rd.flags = flags, - rd.delegated_inode = NULL, err = vfs_rename(&rd); if (err) ksmbd_debug(VFS, "vfs_rename failed err %d\n", err); -out4: - dput(new_dentry); out3: - dput(old_parent); - unlock_rename(old_parent, new_path.dentry); + end_renaming(&rd); out_drop_write: mnt_drop_write(old_path->mnt); out2: @@ -1084,18 +1023,17 @@ int ksmbd_vfs_unlink(struct file *filp) return err; dir = dget_parent(dentry); - err = ksmbd_vfs_lock_parent(dir, dentry); - if (err) + dentry = start_removing_dentry(dir, dentry); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) goto out; - dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) - err = vfs_rmdir(idmap, d_inode(dir), dentry); + err = vfs_rmdir(idmap, d_inode(dir), dentry, NULL); else err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); - dput(dentry); - inode_unlock(d_inode(dir)); + end_removing(dentry); if (err) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: @@ -1207,7 +1145,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, static int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, unsigned int flags, - struct path *path, bool caseless, bool do_lock) + struct path *path, bool caseless, bool for_remove) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; struct path parent_path; @@ -1215,7 +1153,7 @@ int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, int err; retry: - err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, do_lock); + err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, for_remove); if (!err || !caseless) return err; @@ -1286,7 +1224,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, } /** - * ksmbd_vfs_kern_path_locked() - lookup a file and get path info + * ksmbd_vfs_kern_path_start_removing() - lookup a file and get path info prior to removal * @work: work * @filepath: file path that is relative to share * @flags: lookup flags * @path: if lookup succeeded, return path info * @caseless: caseless filename lookup @@ -1298,20 +1236,19 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, * filesystem will have been gained. 
* Return: 0 if the file was found, otherwise error */ -int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath, - unsigned int flags, - struct path *path, bool caseless) +int ksmbd_vfs_kern_path_start_removing(struct ksmbd_work *work, char *filepath, + unsigned int flags, + struct path *path, bool caseless) { return __ksmbd_vfs_kern_path(work, filepath, flags, path, caseless, true); } -void ksmbd_vfs_kern_path_unlock(const struct path *path) +void ksmbd_vfs_kern_path_end_removing(const struct path *path) { - /* While lock is still held, ->d_parent is safe */ - inode_unlock(d_inode(path->dentry->d_parent)); + end_removing(path->dentry); mnt_drop_write(path->mnt); - path_put(path); + mntput(path->mnt); } struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index df6421b4590b..16ca29ee16e5 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -120,10 +120,10 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); -int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, - struct path *path, bool caseless); -void ksmbd_vfs_kern_path_unlock(const struct path *path); +int ksmbd_vfs_kern_path_start_removing(struct ksmbd_work *work, char *name, + unsigned int flags, + struct path *path, bool caseless); +void ksmbd_vfs_kern_path_end_removing(const struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, diff --git a/fs/splice.c b/fs/splice.c index f5094b6d00a0..d338fe56b50b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1498,7 +1498,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * For lack of a better implementation, implement vmsplice() to userspace - * as a simple copy of the pipes pages to the user iov. + * as a simple copy of the pipe's pages to the user iov. 
*/ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index cceae3b78698..82b687414e65 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -86,7 +86,7 @@ struct inode *squashfs_iget(struct super_block *sb, long long ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..7c66b96b59be 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: @@ -1183,11 +1184,14 @@ static inline bool get_active_super(struct super_block *sb) static const char *filesystems_freeze_ptr = "filesystems_freeze"; -static void filesystems_freeze_callback(struct super_block *sb, void *unused) +static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; + if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + return; + if (!get_active_super(sb)) return; @@ -1201,9 +1205,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused) deactivate_super(sb); } -void filesystems_freeze(void) +void filesystems_freeze(bool freeze_all) { - __iterate_supers(filesystems_freeze_callback, NULL, + void *freeze_all_ptr = NULL; + + if (freeze_all) + freeze_all_ptr = &freeze_all; + __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } diff --git a/fs/sync.c b/fs/sync.c index 2955cd4c77a3..431fc5f5be06 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -117,16 +117,17 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { int nowait = 0; + int wait = 1; /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); + iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); @@ -182,7 +183,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) if (!file->f_op->fsync) return -EINVAL; - if (!datasync && (inode->i_state & I_DIRTY_TIME)) + if (!datasync && (inode_state_read_once(inode) & I_DIRTY_TIME)) mark_inode_dirty_sync(inode); return file->f_op->fsync(file, start, end, datasync); } @@ -280,14 +281,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - int sync_mode = WB_SYNC_NONE; - if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) - sync_mode = WB_SYNC_ALL; - - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - sync_mode); + ret = filemap_fdatawrite_range(mapping, offset, + endbyte); + else + ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } diff --git a/fs/timerfd.c b/fs/timerfd.c index c68f28d9c426..9fcea7860ddf 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -393,9 +393,8 @@ static const struct file_operations 
timerfd_fops = { SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { - int ufd; - struct timerfd_ctx *ctx; - struct file *file; + struct timerfd_ctx *ctx __free(kfree) = NULL; + int ret; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); @@ -432,23 +431,13 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ctx->moffs = ktime_mono_to_real(0); - ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS); - if (ufd < 0) { - kfree(ctx); - return ufd; - } - - file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), - FMODE_NOWAIT); - if (IS_ERR(file)) { - put_unused_fd(ufd); - kfree(ctx); - return PTR_ERR(file); - } - - fd_install(ufd, file); - return ufd; + ret = FD_ADD(flags & TFD_SHARED_FCNTL_FLAGS, + anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), + FMODE_NOWAIT)); + if (ret >= 0) + retain_and_null_ptr(ctx); + return ret; } static int do_timerfd_settime(int ufd, int flags, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ca41ce8208c4..c3265b8804f5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1323,7 +1323,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ - if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + if (!datasync || (inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 46952a33c4e6..f453c37cee37 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -114,7 +114,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) inode = iget_locked(sb, inum); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ui = ubifs_inode(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a79d73f28aa7..7fae8002344a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1962,7 +1962,7 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (UDF_I(inode)->i_hidden != hidden_inode) { iput(inode); return ERR_PTR(-EFSCORRUPTED); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 8361c00e8fa6..e2b0a35de2a7 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -655,7 +655,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ufsi = UFS_I(inode); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 54c6cc7fe9c6..e6e74b384087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2111,9 +2111,7 @@ static void init_once_userfaultfd_ctx(void *mem) static int new_userfaultfd(int flags) { - struct userfaultfd_ctx *ctx; - struct file *file; - int fd; + struct userfaultfd_ctx *ctx __free(kfree) = NULL; VM_WARN_ON_ONCE(!current->mm); @@ -2135,26 +2133,18 @@ static int new_userfaultfd(int flags) atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; - fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (fd < 0) - goto err_out; + FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, + anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & 
UFFD_SHARED_FCNTL_FLAGS), + NULL)); + if (fdf.err) + return fdf.err; - /* Create a new inode so that the LSM can block the creation. */ - file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto err_out; - } /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file->f_mode |= FMODE_NOWAIT; - fd_install(fd, file); - return fd; -err_out: - kmem_cache_free(userfaultfd_ctx_cachep, ctx); - return fd; + fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; + retain_and_null_ptr(ctx); + return fd_publish(fdf); } static inline bool userfaultfd_syscall_allowed(int flags) diff --git a/fs/utimes.c b/fs/utimes.c index c7c7958e57b2..86f8ce8cd6b1 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -22,7 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; if (times) { if (!nsec_valid(times[0].tv_nsec) || @@ -66,7 +66,7 @@ retry_deleg: error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -76,6 +76,7 @@ retry_deleg: out: return error; } +EXPORT_SYMBOL_GPL(vfs_utimes); static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) diff --git a/fs/xattr.c b/fs/xattr.c index 8851a5ef34f5..32d445fb60aa 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -274,7 +274,7 @@ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, - int flags, struct inode **delegated_inode) + int flags, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -305,7 +305,7 @@ vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; const void *orig_value = value; int error; @@ -322,7 +322,7 @@ retry_deleg: flags, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -533,7 +533,7 @@ EXPORT_SYMBOL(__vfs_removexattr); int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, - struct inode **delegated_inode) + struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -567,7 +567,7 @@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; retry_deleg: @@ -576,7 +576,7 @@ retry_deleg: name, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index de840abc0bcd..57e47077c75a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -73,7 +73,8 @@ #define 
XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 -#define XFS_ERRTAG_MAX 46 +#define XFS_ERRTAG_FORCE_ZERO_RANGE 46 +#define XFS_ERRTAG_MAX 47 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ -XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ +XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) #endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c index b0791a71931c..b40f71f878b5 100644 --- a/fs/xfs/libxfs/xfs_zones.c +++ b/fs/xfs/libxfs/xfs_zones.c @@ -95,6 +95,7 @@ xfs_zone_validate_seq( case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_ACTIVE: return xfs_zone_validate_wp(zone, rtg, write_pointer); case BLK_ZONE_COND_FULL: return xfs_zone_validate_full(zone, rtg, write_pointer); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 2ef7742be7d3..7bfa37c99480 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1249,7 +1249,7 @@ xchk_irele( * hits do not clear DONTCACHE, so we must do it here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index a90a011c7e5f..4f7040c9ddf0 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1933,7 +1933,7 @@ xrep_inode_pptr( * Unlinked inodes that cannot be added to the directory tree will not * have a parent pointer. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return 0; /* Children of the superblock do not have parent pointers. */ diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 9c12cb844231..4e550a1d5353 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -152,11 +152,10 @@ xrep_orphanage_create( } /* Try to find the orphanage directory. */ - inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); + orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE)); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); - goto out_unlock_root; + goto out_dput_root; } /* @@ -167,10 +166,10 @@ xrep_orphanage_create( */ if (d_really_is_negative(orphanage_dentry)) { orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, - orphanage_dentry, 0750); + orphanage_dentry, 0750, NULL); error = PTR_ERR(orphanage_dentry); if (IS_ERR(orphanage_dentry)) - goto out_unlock_root; + goto out_dput_orphanage; } /* Not a directory? Bail out. 
*/ @@ -200,9 +199,7 @@ xrep_orphanage_create( sc->orphanage_ilock_flags = 0; out_dput_orphanage: - dput(orphanage_dentry); -out_unlock_root: - inode_unlock(VFS_I(sc->mp->m_rootip)); + end_creating(orphanage_dentry); out_dput_root: dput(root_dentry); out: diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 3b692c4acc1e..11d5de10fd56 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -915,7 +915,7 @@ xchk_pptr_looks_zapped( * Temporary files that cannot be linked into the directory tree do not * have attr forks because they cannot ever have parents. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return false; /* diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 5902398185a8..df629892462f 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -184,7 +184,7 @@ xrep_symlink_salvage_inline( sc->ip->i_disk_size == 1 && old_target[0] == '?') return 0; - nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes); memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index cdd13ed9c569..ed2e8c64b1a8 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -834,7 +834,7 @@ xfarray_sort_scan( si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); - next_pos = folio_pos(si->folio) + folio_size(si->folio); + next_pos = folio_next_pos(si->folio); si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) si->last_folio_idx--; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26f79815533..56a544638491 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -271,7 +271,7 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. */ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio), NULL); + folio_next_pos(folio), NULL); } /* @@ -742,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 06ca11731e43..2208a720ec3f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -514,7 +514,7 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. 
*/ - if (!(VFS_I(ip)->i_state & I_FREEING)) + if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING)) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2702fef2c90c..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -27,6 +27,8 @@ #include "xfs_file.h" #include "xfs_aops.h" #include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -674,8 +676,17 @@ xfs_file_dio_write_aligned( struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; ssize_t ret; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -693,7 +704,7 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: xfs_iunlock(ip, iolock); return ret; @@ -890,15 +901,7 @@ xfs_file_dio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* - * For always COW inodes we also must check the alignment of each - * individual iovec segment, as they could end up with different - * I/Os due to the way bio_iov_iter_get_pages works, and we'd - * then overwrite an already written block. - */ - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || - (xfs_is_always_cow_inode(ip) && - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from); @@ -1254,23 +1257,36 @@ xfs_falloc_zero_range( struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; - trace_xfs_zero_file_space(XFS_I(inode)); + trace_xfs_zero_file_space(ip); error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); - if (error) - return error; + /* + * Zero range implements a full zeroing mechanism but is only used in + * limited situations. It is more efficient to allocate unwritten + * extents than to perform zeroing here, so use an errortag to randomly + * force zeroing on DEBUG kernels for added test coverage. 
+ */ + if (XFS_TEST_ERROR(ip->i_mount, + XFS_ERRTAG_FORCE_ZERO_RANGE)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); + if (error) + return error; - len = round_up(offset + len, blksize) - round_down(offset, blksize); - offset = round_down(offset, blksize); - error = xfs_alloc_file_space(XFS_I(inode), offset, len); + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); + } if (error) return error; return xfs_falloc_setsize(file, new_size); diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index f19fce557354..5a3e3bf4e7cc 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -233,14 +233,11 @@ xfs_open_by_handle( xfs_fsop_handlereq_t *hreq) { const struct cred *cred = current_cred(); - int error; - int fd; int permflag; - struct file *filp; struct inode *inode; struct dentry *dentry; fmode_t fmode; - struct path path; + struct path path __free(path_put) = {}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -249,12 +246,11 @@ xfs_open_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); + path.dentry = dentry; /* Restrict xfs_open_by_handle to directories & regular files. */ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -EPERM; - goto out_dput; - } + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EPERM; #if BITS_PER_LONG != 32 hreq->oflags |= O_LARGEFILE; @@ -263,48 +259,30 @@ xfs_open_by_handle( permflag = hreq->oflags; fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -EPERM; - goto out_dput; - } + (fmode & FMODE_WRITE) && IS_APPEND(inode)) + return -EPERM; - if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EPERM; - goto out_dput; - } + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) + return -EPERM; /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -EISDIR; - goto out_dput; - } + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) + return -EISDIR; - fd = get_unused_fd_flags(0); - if (fd < 0) { - error = fd; - goto out_dput; - } + path.mnt = mntget(parfilp->f_path.mnt); - path.mnt = parfilp->f_path.mnt; - path.dentry = dentry; - filp = dentry_open(&path, hreq->oflags, cred); - dput(dentry); - if (IS_ERR(filp)) { - put_unused_fd(fd); - return PTR_ERR(filp); - } + FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred)); + if (fdf.err) + return fdf.err; if (S_ISREG(inode->i_mode)) { + struct file *filp = fd_prepare_file(fdf); + filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } - fd_install(fd, filp); - return fd; - - out_dput: - dput(dentry); - return error; + return fd_publish(fdf); } int diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 7c541fb373d5..3c1557fb1cf0 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -285,7 +285,7 @@ xfs_inode_mark_sick( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } @@ -309,7 +309,7 @@ xfs_inode_mark_corrupt( * is not the case here. 
*/ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e44040206851..f3fc4d21bfe1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -334,7 +334,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; - unsigned long state = inode->i_state; + unsigned long state = inode_state_read_once(inode); error = inode_init_always(mp->m_super, inode); @@ -345,7 +345,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; - inode->i_state = state; + inode_state_assign_raw(inode, state); mapping_set_folio_min_order(inode->i_mapping, M_IGEO(mp)->min_folio_order); return error; @@ -411,7 +411,7 @@ xfs_iget_recycle( ip->i_flags |= XFS_INEW; xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; + inode_state_assign_raw(inode, I_NEW); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36b39539e561..f1f88e48fe22 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next( next_ip->i_prev_unlinked = prev_agino; trace_xfs_iunlink_reload_next(next_ip); rele: - ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE)); if (xfs_is_quotacheck_running(mp) && next_ip) xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); xfs_irele(next_ip); @@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout( */ xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); - VFS_I(tmpfile)->i_state |= I_LINKABLE; + inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE); *wip = tmpfile; return 0; @@ -2330,7 +2330,7 @@ retry: * flag from the inode so it doesn't accidentally get misused in * future. */ - VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; + inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE); } out_commit: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 1bd411a1114c..2eb0c6011a2e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -113,9 +113,9 @@ xfs_inode_item_precommit( * to log the timestamps, or will clear already cleared fields in the * worst case. 
*/ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a6bb7ee7a27a..59eaad774371 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1408,10 +1408,8 @@ xfs_file_ioctl( trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); - sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &icw); - sb_end_write(mp->m_super); - return error; + guard(super_write)(mp->m_super); + return xfs_blockgc_free_space(mp, &icw); } case XFS_IOC_EXCHANGE_RANGE: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 490e12cb99be..04f39ea15898 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1758,6 +1758,8 @@ xfs_buffered_write_iomap_begin( struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, + iomap); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1823,21 +1825,41 @@ xfs_buffered_write_iomap_begin( } /* - * For zeroing, trim a delalloc extent that extends beyond the EOF - * block. If it starts beyond the EOF block, convert it to an + * For zeroing, trim extents that extend beyond the EOF block. If a + * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && - isnullstartblock(imap.br_startblock)) { + if (flags & IOMAP_ZERO) { xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + u64 end; - if (offset_fsb >= eof_fsb) + if (isnullstartblock(imap.br_startblock) && + offset_fsb >= eof_fsb) goto convert_delay; - if (end_fsb > eof_fsb) { + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) end_fsb = eof_fsb; - xfs_trim_extent(&imap, offset_fsb, - end_fsb - offset_fsb); + + /* + * Look up dirty folios for unwritten mappings within EOF. + * Providing this bypasses the flush iomap uses to trigger + * extent conversion when unwritten mappings have dirty + * pagecache in need of zeroing. + * + * Trim the mapping to the end pos of the lookup, which in turn + * was trimmed to the end of the batch if it became full before + * the end of the mapping. 
+ */ + if (imap.br_state == XFS_EXT_UNWRITTEN && + offset_fsb < eof_fsb) { + loff_t len = min(count, + XFS_FSB_TO_B(mp, imap.br_blockcount)); + + end = iomap_fill_dirty_folios(iter, offset, len); + end_fsb = min_t(xfs_fileoff_t, end_fsb, + XFS_B_TO_FSB(mp, end)); } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); } /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index caff0125faea..ad94fbf55014 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1420,7 +1420,7 @@ xfs_setup_inode( bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; - inode->i_state |= I_NEW; + inode_state_set_raw(inode, I_NEW); inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 36cda724da89..9d1ed9bb0bee 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - if ((inode->i_state & I_DIRTY_PAGES) || + if ((inode_state_read_once(inode) & I_DIRTY_PAGES) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&inode->i_dio_count)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1067ebb3b001..bc71aa9dcee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1693,7 +1693,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index ef7a931ebde5..98f65d99b776 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1204,6 +1204,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1234,12 +1235,35 @@ xfs_mount_zones( return -ENOMEM; xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. The default lower bound is 4MiB, but for zoned file + * systems we want to increase that, both to reduce seeks and, more + * importantly, so that workloads that write files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase it to the zone size, capped by the max + * extent length. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also gets applied to non-zoned files on the data device if + * there are any. On a typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases, enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. 
+ */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { - error = blkdev_report_zones(bt->bt_bdev, + error = blkdev_report_zones_cached(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); if (error < 0) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 90e2ad8ee5f4..c1e5e30e90a0 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = { static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_read_iomap_ops); + iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); + return 0; } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_read_iomap_ops); + iomap_bio_readahead(rac, &zonefs_read_iomap_ops); } /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70be0b3dda49..086a31269198 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -644,7 +644,7 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { WARN_ON_ONCE(inode->i_private != z); return inode; } @@ -683,7 +683,7 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; inode->i_ino = ino; |
