Diffstat (limited to 'fs'): 197 files changed, 2628 insertions, 1783 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index eb0b083da269..612a230bc012 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { - struct inode *inode; - - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_ALL, - .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, - /* absolute end, byte at end included */ - .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + - (vma->vm_end - vma->vm_start - 1), - }; - if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); - inode = file_inode(vma->vm_file); - filemap_fdatawrite_wbc(inode->i_mapping, &wbc); + filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping, + (loff_t)vma->vm_pgoff * PAGE_SIZE, + (loff_t)vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1)); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d0c77ec31b1d..8666c9c62258 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -422,7 +422,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; /* * initialize the inode with the stat info diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index be297e335468..1661a25f2772 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -112,7 +112,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; /* * initialize the inode with the stat info diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0210df8d3500..0bfc7d151dcd 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; pr_debug("affs_iget(%lu)\n", inode->i_ino); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f31359922e98..71c10a05cebe 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -140,7 +140,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } - cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL); + /* Allocate the cell name and the key name in one go. */ + cell->name = kmalloc(1 + namelen + 1 + + 4 + namelen + 1, GFP_KERNEL); if (!cell->name) { kfree(cell); return ERR_PTR(-ENOMEM); @@ -151,7 +153,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); - cell->name[i] = 0; + cell->name[i++] = 0; + + cell->key_desc = cell->name + i; + memcpy(cell->key_desc, "afs@", 4); + memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1); cell->net = net; refcount_set(&cell->ref, 1); @@ -229,7 +235,7 @@ error: * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. - * @excl: T if an error should be given if the cell name already exists. 
+ * @reason: The reason we're doing the lookup * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if @@ -239,7 +245,8 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; @@ -247,12 +254,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, enum afs_cell_state state; int ret, n; - _enter("%s,%s", name, vllist); + _enter("%s,%s,%u", name, vllist, reason); - if (!excl) { + if (reason != AFS_LOOKUP_CELL_PRELOAD) { cell = afs_find_cell(net, name, namesz, trace); - if (!IS_ERR(cell)) + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; goto wait_for_cell; + } } /* Assume we're probably going to create a cell and preallocate and @@ -298,26 +311,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_queue_new); +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } wait_for_cell: - _debug("wait_for_cell"); state = smp_load_acquire(&cell->state); /* vs error */ - if (state != AFS_CELL_ACTIVE && - state != AFS_CELL_DEAD) { + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); afs_see_cell(cell, afs_cell_trace_wait); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; })); + _debug("waited_for_cell %d %d", cell->state, cell->error); } +no_wait: /* Check the state obtained from the wait check. 
*/ + state = smp_load_acquire(&cell->state); /* vs error */ if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } _leave(" = %p [cell]", cell); return cell; @@ -325,7 +381,7 @@ wait_for_cell: cell_already_exists: _debug("cell exists"); cell = cursor; - if (excl) { + if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { afs_use_cell(cursor, trace); @@ -384,7 +440,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return -EINVAL; /* allocate a cell record for the root/workstation cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); @@ -660,33 +717,6 @@ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) } /* - * Allocate a key to use as a placeholder for anonymous user security. - */ -static int afs_alloc_anon_key(struct afs_cell *cell) -{ - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp; - - /* Create a key to represent an anonymous user. */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = tolower(*cp); - } while (*cp++); - - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) - return PTR_ERR(key); - - cell->anonymous_key = key; - - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); - return 0; -} - -/* * Activate a cell. 
*/ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) @@ -695,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) struct afs_cell *pcell; int ret; - if (!cell->anonymous_key) { - ret = afs_alloc_anon_key(cell); - if (ret < 0) - return ret; - } - ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -777,6 +801,7 @@ static bool afs_manage_cell(struct afs_cell *cell) switch (cell->state) { case AFS_CELL_SETTING_UP: goto set_up_cell; + case AFS_CELL_UNLOOKED: case AFS_CELL_ACTIVE: goto cell_is_active; case AFS_CELL_REMOVING: @@ -797,7 +822,7 @@ set_up_cell: goto remove_cell; } - afs_set_cell_state(cell, AFS_CELL_ACTIVE); + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); cell_is_active: if (afs_has_cell_expired(cell, &next_manage)) @@ -807,6 +832,8 @@ cell_is_active: ret = afs_update_cell(cell); if (ret < 0) cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); } if (next_manage < TIME64_MAX && cell->net->live) { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 89d36e3e5c79..f4e9e12373ac 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); - bool supports_ibulk; + bool supports_ibulk, isnew; long ret; int i; @@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) * callback counters. */ ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode, - afs_ilookup5_test_by_fid, &vp->fid); + afs_ilookup5_test_by_fid, &vp->fid, &isnew); if (!IS_ERR_OR_NULL(ti)) { vnode = AFS_FS_I(ti); vp->dv_before = vnode->status.data_version; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 8c6130789fde..aa56e8951e03 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -64,7 +64,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) vnode = AFS_FS_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); @@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry * dotted = true; } - cell = afs_lookup_cell(net, name, len, NULL, false, + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, afs_cell_trace_use_lookup_dynroot); if (IS_ERR(cell)) { ret = PTR_ERR(cell); @@ -258,7 +259,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry vnode = AFS_FS_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 1); @@ -383,7 +384,7 @@ struct inode *afs_dynroot_iget_root(struct super_block *sb) vnode = AFS_FS_I(inode); /* there shouldn't be an existing inode */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e1cb17b85791..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->netfs.inode.i_state & I_NEW) { + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { 
ret = afs_inode_init_from_status(op, vp, vnode); afs_op_set_error(op, ret); if (ret == 0) @@ -579,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* deal with an existing inode */ - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { _leave(" = %p", inode); return inode; } @@ -639,7 +639,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) _debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid); - BUG_ON(!(inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(inode) & I_NEW)); vnode = AFS_FS_I(inode); vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); @@ -748,7 +748,7 @@ void afs_evict_inode(struct inode *inode) if ((S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - (inode->i_state & I_DIRTY) && + (inode_state_read_once(inode) & I_DIRTY) && !sbi->dyn_root) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a45ae5c2ef8a..009064b8d661 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -343,6 +343,7 @@ extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, AFS_CELL_ACTIVE, AFS_CELL_REMOVING, AFS_CELL_DEAD, @@ -412,6 +413,7 @@ struct afs_cell { u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ + char *key_desc; /* Authentication key description */ }; /* @@ -1049,9 +1051,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 1ad048e6e164..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 40e879c8ca77..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true, + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/security.c b/fs/afs/security.c index 6a7744c9e2a2..55ddce94af03 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -16,6 +16,31 @@ static DEFINE_HASHTABLE(afs_permits_cache, 10); static 
DEFINE_SPINLOCK(afs_permits_lock); +static DEFINE_MUTEX(afs_key_lock); + +/* + * Allocate a key to use as a placeholder for anonymous user security. + */ +static int afs_alloc_anon_key(struct afs_cell *cell) +{ + struct key *key; + + mutex_lock(&afs_key_lock); + key = cell->anonymous_key; + if (!key) { + key = rxrpc_get_null_key(cell->key_desc); + if (!IS_ERR(key)) + cell->anonymous_key = key; + } + mutex_unlock(&afs_key_lock); + + if (IS_ERR(key)) + return PTR_ERR(key); + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} /* * get a key @@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock); struct key *afs_request_key(struct afs_cell *cell) { struct key *key; + int ret; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net(&key_type_rxrpc, cell->key_desc, cell->net->net, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell) return key; } + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ERR_PTR(ret); + } + /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); @@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) { struct key *key; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net_rcu(&key_type_rxrpc, - cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc, cell->net->net); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) } /* act as anonymous user */ + if (!cell->anonymous_key) + return NULL; /* Need to allocate */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); } else { @@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode, if (mask & MAY_NOT_BLOCK) { key = afs_request_key_rcu(vnode->volume->cell); - if (IS_ERR(key)) + if (IS_ERR_OR_NULL(key)) return -ECHILD; ret = -ECHILD; diff --git a/fs/afs/super.c b/fs/afs/super.c index da407f2d6f0d..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false, + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 709b4cdb723e..fc9676abd252 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) diff --git a/fs/backing-file.c b/fs/backing-file.c index 15a7f8031084..2a86bb6fcd13 
100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -227,12 +227,6 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; - /* - * Stacked filesystems don't support deferred completions, don't copy - * this property in case it is set by the issuer. - */ - flags &= ~IOCB_DIO_CALLER_COMP; - old_cred = override_creds(ctx->cred); if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8f430ff8e445..9fcfdd6b8189 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -307,7 +307,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; befs_ino = BEFS_I(inode); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1d41ce477df5..ce6f83234b67 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -42,7 +42,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { @@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; - inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + /* + * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that + * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode + * value. This means that, although bfs_write_inode() saves whole + * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX} + * bits), middle 7 bits of di->i_mode value can be garbage when these + * bits were not saved by bfs_write_inode(). + * Since we can't tell whether middle 7 bits are garbage, use only + * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being + * garbage) and reconstruct S_IFMT bits for Linux environment from + * di->i_vtype value. + */ + inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; @@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; + } else { + brelse(bh); + printf("Unknown vtype=%u %s:%08lx\n", + le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino); + goto error; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a839f960cd4a..a8b1d79e4af0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -837,8 +837,10 @@ out: inode_unlock(d_inode(root)); if (err) { - if (f) + if (f) { + exe_file_allow_write_access(f); filp_close(f, NULL); + } kfree(e); return err; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 755ec6dfd51c..b925da738afd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2228,6 +2228,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_account_cgroup_owner(wbc, folio, range_len); folio_unlock(folio); } + /* + * If the fs is already in error status, do not submit any writeback + * but immediately finish it. 
+ */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) { + btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); + return; + } btrfs_submit_bbio(bbio, 0); } @@ -2460,10 +2468,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7efd1f8a1912..fa82def46e39 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2854,12 +2854,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + u64 range_start; + u64 range_end; int ret; int ret2; if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0; + range_start = round_down(i_size_read(inode), root->fs_info->sectorsize); + range_end = round_up(end, root->fs_info->sectorsize); + + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start, + range_end - range_start); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..2d0e5086dad8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -177,8 +177,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return ret; } ret = paths_from_inode(inum, ipath); - if (ret < 0) + if (ret < 0) { + btrfs_put_root(local_root); goto err; + } /* * We deliberately ignore the bit ipath might have been too small to @@ -3884,7 +3886,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) ASSERT(ret != -ENOMEM); return ret; } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING))); } return 0; @@ -5361,7 +5363,7 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node; - ASSERT(inode->i_state & I_FREEING); + ASSERT(inode_state_read_once(inode) & I_FREEING); truncate_inode_pages_final(&inode->i_data); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); @@ -5799,7 +5801,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); @@ -5823,7 +5825,7 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5837,6 +5839,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; unlock_new_inode(&inode->vfs_inode); return inode; } @@ -6289,8 +6293,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) } /* - * This is a copy of file_update_time. We need this so we can return error on - * ENOSPC for updating the inode in the case of file write and mmap writes. 
+ * We need our own ->update_time so that we can return error on ENOSPC for + * updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { @@ -6788,8 +6792,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, } ret = btrfs_create_new_inode(trans, &new_inode_args); - if (!ret) + if (!ret) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; d_instantiate_new(dentry, inode); + } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -6873,7 +6880,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inode_inc_iversion(inode); inode_set_ctime_current(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); @@ -7480,7 +7486,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, u64 page_start = folio_pos(folio); u64 page_end = page_start + folio_size(folio) - 1; u64 cur; - int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING; /* * We have folio locked so no new ordered extent can be created on this @@ -8709,15 +8715,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, - struct writeback_control *wbc, bool snapshot, - bool in_reclaim_context) +static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, + bool snapshot, bool in_reclaim_context) { struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; - bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); @@ -8743,10 +8747,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); - if (full_flush) { - work = btrfs_alloc_delalloc_work(&inode->vfs_inode); + if (nr_to_write == NULL) { + work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { - iput(&inode->vfs_inode); + iput(tmp_inode); ret = -ENOMEM; goto out; } @@ -8754,9 +8758,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (ret || wbc->nr_to_write <= 0) + + if (ret || *nr_to_write <= 0) goto out; } cond_resched(); @@ -8782,29 +8788,17 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - - return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = nr, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + long 
*nr_to_write = nr == LONG_MAX ? NULL : &nr; struct btrfs_root *root; LIST_HEAD(splice); int ret; @@ -8816,13 +8810,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { - /* - * Reset nr_to_write here so we know that we're doing a full - * flush. - */ - if (nr == LONG_MAX) - wbc.nr_to_write = LONG_MAX; - root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); @@ -8831,9 +8818,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + ret = start_delalloc_inodes(root, nr_to_write, false, + in_reclaim_context); btrfs_put_root(root); - if (ret < 0 || wbc.nr_to_write <= 0) + if (ret < 0 || nr <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } @@ -9169,6 +9157,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } +/* + * NOTE: in case you are adding MAY_EXEC check for directories: + * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to + * elide calls here. + */ static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1175b8192cd7..31ad8580322a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1539,8 +1539,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst ASSERT(prealloc); /* Check the level of src and dst first */ - if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) { + kfree(prealloc); return -EINVAL; + } mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 651b11884f82..ba20d9286a34 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, &length, &bioc, NULL, NULL); if (ret < 0) { + bio_put(bio); btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); goto out; @@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_put_bioc(bioc); if (!rbio) { ret = -ENOMEM; + bio_put(bio); btrfs_bio_counter_dec(fs_info); goto out; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 621e0df097e3..30f3c3b849c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7122,7 +7122,7 @@ log_extents: * a power failure unless the log was synced as part of an fsync * against any other unrelated inode. */ - if (inode_only != LOG_INODE_EXISTS) + if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); @@ -7910,6 +7910,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + /* The inode has a new name (ref/extref), so make sure we log it. 
*/ + set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); + btrfs_init_log_ctx(&ctx, inode); ctx.logging_new_name = true; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba3..259e8b0496df 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2002,14 +2002,11 @@ out: static void update_dev_time(const char *device_path) { struct path path; - int ret; - ret = kern_path(device_path, LOOKUP_FOLLOW, &path); - if (ret) - return; - - inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); - path_put(&path); + if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) { + vfs_utimes(&path, NULL); + path_put(&path); + } } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0ea0df18a8e4..d1db7fa1fe58 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, if (!btrfs_dev_is_sequential(device, info->physical)) { up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; + info->capacity = device->zone_info->zone_size; return 0; } @@ -1522,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1529,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, map->num_stripes); - div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > i) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == i) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if (test_bit(0, active) != test_bit(i, active)) { @@ -1574,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1581,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes / map->sub_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; @@ -1594,26 
+1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, - map->num_stripes / map->sub_stripes); - div_u64_rem(stripe_nr, - (map->num_stripes / map->sub_stripes), - &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > (i / map->sub_stripes)) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == (i / map->sub_stripes)) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if ((i % map->sub_stripes) == 0) { @@ -1683,8 +1680,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { - /* Zone capacity is always zone size in emulation */ - cache->zone_capacity = cache->length; ret = calculate_alloc_pointer(cache, &last_alloc, new); if (ret) { btrfs_err(fs_info, @@ -1693,6 +1688,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; + cache->zone_capacity = cache->length; set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } diff --git a/fs/buffer.c b/fs/buffer.c index 6a8752f7bbed..17b8ce567cc3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -611,9 +611,9 @@ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, return err; ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 930fbd54d2c8..f678bab189d8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -26,7 +26,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) return; /* Only new inodes! 
*/ - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return; WARN_ON_ONCE(ci->netfs.cache); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index 7026e794813c..928746b92512 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -329,7 +329,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) out: kfree(cryptbuf); if (dir != parent) { - if ((dir->i_state & I_NEW)) + if ((inode_state_read_once(dir) & I_NEW)) discard_new_inode(dir); else iput(dir); @@ -438,7 +438,7 @@ out: fscrypt_fname_free_buffer(&_tname); out_inode: if (dir != fname->dir) { - if ((dir->i_state & I_NEW)) + if ((inode_state_read_once(dir) & I_NEW)) discard_new_inode(dir); else iput(dir); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 99b30f784ee2..983390069f73 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -740,7 +740,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode, vino.ino, ceph_ino(dir), dentry->d_name.name); ceph_dir_clear_ordered(dir); ceph_init_inode_acls(inode, as_ctx); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { /* * If it's not I_NEW, then someone created this before * we got here. Assume the server is aware of it at @@ -901,7 +901,7 @@ retry: new_inode = NULL; goto out_req; } - WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); + WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW)); spin_lock(&dentry->d_lock); di->flags |= CEPH_DENTRY_ASYNC_CREATE; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a6e260d9e420..37d3a2477c17 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -132,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, goto out_err; } - inode->i_state = 0; + inode_state_assign_raw(inode, 0); inode->i_mode = *mode; err = ceph_security_init_secctx(dentry, *mode, as_ctx); @@ -201,7 +201,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, doutc(cl, "on %llx=%llx.%llx got %p new %d\n", ceph_present_inode(inode), ceph_vinop(inode), inode, - !!(inode->i_state & I_NEW)); + !!(inode_state_read_once(inode) & I_NEW)); return inode; } @@ -228,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) goto err; } - if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { + if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", inode->i_mode); goto err; @@ -261,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) } } #endif - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ @@ -270,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) return inode; err: - if ((inode->i_state & I_NEW)) + if ((inode_state_read_once(inode) & I_NEW)) discard_new_inode(inode); else iput(inode); @@ -744,7 +744,7 @@ void ceph_evict_inode(struct inode *inode) netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_NETFS_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); @@ -1013,7 +1013,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, le64_to_cpu(info->version), ci->i_version); /* Once I_NEW is cleared, we can't change type or dev numbers */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_mode = mode; 
} else { if (inode_wrong_type(inode, mode)) { @@ -1090,7 +1090,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, #ifdef CONFIG_FS_ENCRYPTION if (iinfo->fscrypt_auth_len && - ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) { + ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) { kfree(ci->fscrypt_auth); ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; ci->fscrypt_auth = iinfo->fscrypt_auth; @@ -1692,13 +1692,13 @@ retry_lookup: pr_err_client(cl, "badness %p %llx.%llx\n", in, ceph_vinop(in)); req->r_target_inode = NULL; - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) discard_new_inode(in); else iput(in); goto done; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); } @@ -1898,11 +1898,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, pr_err_client(cl, "inode badness on %p got %d\n", in, rc); err = rc; - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } - } else if (in->i_state & I_NEW) { + } else if (inode_state_read_once(in) & I_NEW) { unlock_new_inode(in); } @@ -2114,7 +2114,7 @@ retry_lookup: pr_err_client(cl, "badness on %p %llx.%llx\n", in, ceph_vinop(in)); if (d_really_is_negative(dn)) { - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } @@ -2124,7 +2124,7 @@ retry_lookup: err = ret; goto next_item; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); if (d_really_is_negative(dn)) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ad0cf177e75a..f6bf24b5c683 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1149,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, const char *path = fsc->mount_options->server_path ? 
fsc->mount_options->server_path + 1 : ""; - err = __ceph_open_session(fsc->client, started); + err = __ceph_open_session(fsc->client); if (err < 0) goto out; diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 62a3d2565c26..70bb0579b40c 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -70,7 +70,7 @@ retry: if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cii = ITOC(inode); /* we still need to set i_ino for things like stat(2) */ inode->i_ino = hash; @@ -148,7 +148,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) /* we should never see newly created inodes because we intentionally * fail in the initialization callback */ - BUG_ON(inode->i_state & I_NEW); + BUG_ON(inode_state_read_once(inode) & I_NEW); return inode; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index ca54bf24b719..e54ebe402df7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -95,7 +95,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; switch (cramfs_inode->mode & S_IFMT) { diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 5dee7c498bc8..ed6e926226b5 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -333,8 +333,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, inode = mapping->host; *inode_ret = inode; - *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) + - (bh_offset(bh) >> inode->i_blkbits); + *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits; return true; } diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 3adbd7167055..5e939ea3ac28 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -945,7 +945,7 @@ static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4bd3918f50e3..40fa05688d3a 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -834,7 +834,7 @@ int fscrypt_drop_inode(struct inode *inode) * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ - if (inode->i_state & I_DIRTY_ALL) + if (inode_state_read(inode) & I_DIRTY_ALL) return 0; /* @@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) /* already zeroed? we're done. 
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); /* * invalidate the pages whose sharing state is to be changed @@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (ret < 0) return ret; - ret = iomap_iter_advance(iter, &length); + ret = iomap_iter_advance(iter, length); if (ret) return ret; - } while (length > 0); + } while ((length = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; @@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { done = iov_iter_zero(min(length, end - pos), iter); - return iomap_iter_advance(iomi, &done); + return iomap_iter_advance(iomi, done); } } @@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - length = xfer; - ret = iomap_iter_advance(iomi, &length); + ret = iomap_iter_advance(iomi, xfer); if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; + length = iomap_length(iomi); } dax_read_unlock(id); @@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) { - u64 length = PAGE_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (!(ret & VM_FAULT_ERROR)) + iter.status = iomap_iter_advance(&iter, PAGE_SIZE); } if (iomap_errp) @@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) { - u64 length = PMD_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (ret != VM_FAULT_FALLBACK) + iter.status = iomap_iter_advance(&iter, PMD_SIZE); } unlock_entry: @@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; - u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, dax_read_unlock(id); advance: - dest_len = len; - ret = iomap_iter_advance(it_src, &len); + ret = iomap_iter_advance(it_src, len); if (!ret) - ret = iomap_iter_advance(it_dest, &dest_len); + ret = iomap_iter_advance(it_dest, len); return ret; out_unlock: diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..9143fd502def 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __ro_after_init; +static struct kmem_cache *__dentry_cache __ro_after_init; +#define dentry_cache runtime_const_ptr(__dentry_cache) const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -794,7 +795,7 @@ void d_mark_dontcache(struct inode *inode) de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } - inode->i_state |= I_DONTCACHE; + inode_state_set(inode, I_DONTCACHE); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); @@ -1073,7 +1074,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode) spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no 
aliases left - if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { + if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { @@ -1980,14 +1981,8 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } @@ -2306,11 +2301,20 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; - if (d_unhashed(dentry)) - continue; if (dentry->d_name.hash_len != hashlen) continue; - if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)) + continue; + /* + * Check for the dentry being unhashed. + * + * As tempting as it is, we *can't* skip it because of a race window + * between us finding the dentry before it gets unhashed and loading + * the sequence counter after unhashing is finished. + * + * We can at least predict on it. + */ + if (unlikely(d_unhashed(dentry))) continue; *seqp = seq; return dentry; @@ -3222,9 +3226,10 @@ static void __init dcache_init(void) * but it is probably not worth it because of the cache nature * of the dcache. */ - dentry_cache = KMEM_CACHE_USERCOPY(dentry, + __dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); + runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 019a8b4eaaf9..49f56a598ecb 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -28,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) * inodes without pages but we deliberately won't in case * we need to reschedule to avoid softlockups. */ - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 1bdeaa6d5790..c2f4fb41b4e6 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -4,7 +4,7 @@ config ECRYPT_FS depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC - select CRYPTO_MD5 + select CRYPTO_LIB_MD5 help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/filesystems/ecryptfs.rst> to learn more about diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 69536cacdea8..260f8a4938b0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -9,7 +9,6 @@ * Michael C. 
Thompson <mcthomps@us.ibm.com> */ -#include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/fs.h> #include <linux/mount.h> @@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -/** - * ecryptfs_calculate_md5 - calculates the md5 of @src - * @dst: Pointer to 16 bytes of allocated memory - * @crypt_stat: Pointer to crypt_stat struct for the current inode - * @src: Data to be md5'd - * @len: Length of @src - * - * Uses the allocated crypto context that crypt_stat references to - * generate the MD5 sum of the contents of src. - */ -static int ecryptfs_calculate_md5(char *dst, - struct ecryptfs_crypt_stat *crypt_stat, - char *src, int len) -{ - int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out; - } -out: - return rc; -} - static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, char *cipher_name, char *chaining_modifier) @@ -104,13 +77,10 @@ out: * * Generate the initialization vector from the given root IV and page * offset. - * - * Returns zero on success; non-zero on error. */ -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; char src[ECRYPTFS_MAX_IV_BYTES + 16]; @@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, ecryptfs_printk(KERN_DEBUG, "source:\n"); ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); } - rc = ecryptfs_calculate_md5(dst, crypt_stat, src, - (crypt_stat->iv_bytes + 16)); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating IV for a page\n"); - goto out; - } + md5(src, crypt_stat->iv_bytes + 16, dst); memcpy(iv, dst, crypt_stat->iv_bytes); if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); } -out: - return rc; } /** @@ -151,29 +113,14 @@ out: * * Initialize the crypt_stat structure. 
*/ -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) { - struct crypto_shash *tfm; - int rc; - - tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); - if (IS_ERR(tfm)) { - rc = PTR_ERR(tfm); - ecryptfs_printk(KERN_ERR, "Error attempting to " - "allocate crypto context; rc = [%d]\n", - rc); - return rc; - } - memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); INIT_LIST_HEAD(&crypt_stat->keysig_list); mutex_init(&crypt_stat->keysig_list_mutex); mutex_init(&crypt_stat->cs_mutex); mutex_init(&crypt_stat->cs_tfm_mutex); - crypt_stat->hash_tfm = tfm; crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; - - return 0; } /** @@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) struct ecryptfs_key_sig *key_sig, *key_sig_tmp; crypto_free_skcipher(crypt_stat->tfm); - crypto_free_shash(crypt_stat->hash_tfm); list_for_each_entry_safe(key_sig, key_sig_tmp, &crypt_stat->keysig_list, crypt_stat_list) { list_del(&key_sig->crypt_stat_list); @@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, int rc; extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size)); - rc = ecryptfs_derive_iv(extent_iv, crypt_stat, - (extent_base + extent_offset)); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " - "extent [0x%.16llx]; rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - goto out; - } + ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset); sg_init_table(&src_sg, 1); sg_init_table(&dst_sg, 1); @@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) */ int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); BUG_ON(crypt_stat->iv_bytes <= 0); if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { - rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Session key not valid; " "cannot generate root IV\n"); - goto out; - } - rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, - crypt_stat->key_size); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating root IV\n"); - goto out; - } - memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); -out: - if (rc) { memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + return -EINVAL; } - return rc; + md5(crypt_stat->key, crypt_stat->key_size, dst); + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); + return 0; } static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 9e6ab0b41337..62a2ea7f59ed 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -14,6 +14,7 @@ #ifndef ECRYPTFS_KERNEL_H #define ECRYPTFS_KERNEL_H +#include <crypto/md5.h> #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> @@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key) + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 -#define ECRYPTFS_DEFAULT_HASH "md5" -#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key 
*key) * ECRYPTFS_MAX_IV_BYTES */ #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ -#define MD5_DIGEST_SIZE 16 -#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE #define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ @@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat { unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct crypto_skcipher *tfm; - struct crypto_shash *hash_tfm; /* Crypto context for generating - * the initialization vectors */ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; @@ -558,7 +553,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_rotate_iv(unsigned char *iv); -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); @@ -693,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, char *data, size_t max_packet_size); int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, struct ecryptfs_mount_crypt_stat *mount_crypt_stat); -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset); +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ed1394da8d6b..fc6d37419753 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -95,7 +95,7 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, iput(lower_inode); return ERR_PTR(-EACCES); } - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) iput(lower_inode); return inode; @@ -106,7 +106,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode, { struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); - if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + if (!IS_ERR(inode) && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; @@ -364,7 +364,7 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, } } - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); return d_splice_alias(inode, dentry); } @@ -903,11 +903,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap, struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) { - rc = ecryptfs_init_crypt_stat(crypt_stat); - if (rc) - return rc; - } + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 7f9f68c00ef6..bbf8603242fa 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -11,7 +11,6 @@ * Trevor S. 
Highland <trevor.highland@gmail.com> */ -#include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/string.h> #include <linux/pagemap.h> @@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct crypto_skcipher *skcipher_tfm; struct skcipher_request *skcipher_req; char iv[ECRYPTFS_MAX_IV_BYTES]; - char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - struct crypto_shash *hash_tfm; - struct shash_desc *hash_desc; + char hash[MD5_DIGEST_SIZE]; }; /* @@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "password tokens\n", __func__); goto out_free_unlock; } - s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0); - if (IS_ERR(s->hash_tfm)) { - rc = PTR_ERR(s->hash_tfm); - printk(KERN_ERR "%s: Error attempting to " - "allocate hash crypto context; rc = [%d]\n", - __func__, rc); - goto out_free_unlock; - } - - s->hash_desc = kmalloc(sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); - if (!s->hash_desc) { - rc = -ENOMEM; - goto out_release_free_unlock; - } - s->hash_desc->tfm = s->hash_tfm; - - rc = crypto_shash_digest(s->hash_desc, - (u8 *)s->auth_tok->token.password.session_key_encryption_key, - s->auth_tok->token.password.session_key_encryption_key_bytes, - s->hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out_release_free_unlock; - } + md5(s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes, + s->hash); for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { s->block_aligned_filename[s->j] = - s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; - if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) - == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { - rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash, - ECRYPTFS_TAG_70_DIGEST_SIZE, - s->tmp_hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; " - "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; - } - memcpy(s->hash, s->tmp_hash, - ECRYPTFS_TAG_70_DIGEST_SIZE); - } + s->hash[s->j % MD5_DIGEST_SIZE]; + if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1)) + md5(s->hash, MD5_DIGEST_SIZE, s->hash); if (s->block_aligned_filename[s->j] == '\0') s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; } @@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert filename memory to scatterlist; rc = [%d]. " "block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, s->dst_sg, 2); @@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert encrypted filename memory to scatterlist; " "rc = [%d]. block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } /* The characters in the first block effectively do the job * of the IV here, so we just use 0's for the IV. 
Note the @@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, rc, s->auth_tok->token.password.session_key_encryption_key, mount_crypt_stat->global_default_fn_cipher_key_bytes); - goto out_release_free_unlock; + goto out_free_unlock; } skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg, s->block_aligned_filename_size, s->iv); @@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt filename; " "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; + goto out_free_unlock; } s->i += s->block_aligned_filename_size; (*packet_size) = s->i; (*remaining_bytes) -= (*packet_size); -out_release_free_unlock: - crypto_free_shash(s->hash_tfm); out_free_unlock: kfree_sensitive(s->block_aligned_filename); out_unlock: @@ -850,7 +808,6 @@ out: key_put(auth_tok_key); } skcipher_request_free(s->skcipher_req); - kfree_sensitive(s->hash_desc); kfree(s); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 16ea14dd2c62..c12dc680f8fe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -12,6 +12,7 @@ #include <linux/dcache.h> #include <linux/file.h> +#include <linux/fips.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> @@ -454,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out; } + if (fips_enabled) { + rc = -EINVAL; + err = "eCryptfs support is disabled due to FIPS"; + goto out; + } + s = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(s)) { rc = PTR_ERR(s); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e7b7f426fecf..3bc21d677564 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; - if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); - goto out; - } + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); mutex_init(&inode_info->lower_file_mutex); atomic_set(&inode_info->lower_file_count, 0); inode_info->lower_file = NULL; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1f4d8ce56667..6de97565d5f7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = { .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, .parameters = efivarfs_parameters, + .fs_flags = FS_POWER_FREEZE, }; static __init int efivarfs_init(void) diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 462619e59766..28407578f83a 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -62,7 +62,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) inode = iget_locked(super, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; in = INODE_INFO(inode); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8ca29962a3dd..bb13c4cb8455 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio) { trace_erofs_read_folio(folio, true); - return iomap_read_folio(folio, &erofs_iomap_ops); + iomap_bio_read_folio(folio, &erofs_iomap_ops); + return 0; } static void erofs_readahead(struct readahead_control *rac) @@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac) 
trace_erofs_readahead(rac->mapping->host, readahead_index(rac), readahead_count(rac), true); - return iomap_readahead(rac, &erofs_iomap_ops); + iomap_bio_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c index b4bfe14229f9..e38d93bb2104 100644 --- a/fs/erofs/decompressor_zstd.c +++ b/fs/erofs/decompressor_zstd.c @@ -172,7 +172,6 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, dctx.bounce = strm->bounce; do { - dctx.avail_out = out_buf.size - out_buf.pos; dctx.inbuf_sz = in_buf.size; dctx.inbuf_pos = in_buf.pos; err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst, @@ -188,14 +187,18 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, in_buf.pos = dctx.inbuf_pos; zerr = zstd_decompress_stream(stream, &out_buf, &in_buf); - if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) { + dctx.avail_out = out_buf.size - out_buf.pos; + if (zstd_is_error(zerr) || + ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 && + !(rq->inputsize + in_buf.size - in_buf.pos))))) { erofs_err(sb, "failed to decompress in[%u] out[%u]: %s", rq->inputsize, rq->outputsize, - zerr ? zstd_get_error_name(zerr) : "unexpected end of stream"); + zstd_is_error(zerr) ? zstd_get_error_name(zerr) : + "unexpected end of stream"); err = -EFSCORRUPTED; break; } - } while (rq->outputsize || out_buf.pos < out_buf.size); + } while (rq->outputsize + dctx.avail_out); if (dctx.kout) kunmap_local(dctx.kout); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index cb780c095d28..bce98c845a18 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,7 +295,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { int err = erofs_fill_inode(inode); if (err) { diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 7f9592856bf7..74d451f732c7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ - sb_min_blocksize(sb, 512); + if (!sb_min_blocksize(sb, 512)) { + exfat_err(sb, "unable to set blocksize"); + return -EINVAL; + } /* read boot sector */ sbi->boot_bh = sb_bread(sb, 0); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e10c376843d7..dbfe9098a124 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1398,7 +1398,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = EXT2_I(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e99306a8f47c..57dc1ff91df2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -202,8 +202,7 @@ void ext4_evict_inode(struct inode *inode) * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. 
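Several hunks below (ext4's mpage_prepare_extent_to_map(), f2fs's f2fs_write_cache_pages(), gfs2's gfs2_write_cache_jdata()) replace the open-coded sync-mode check with wbc_to_tag(). The helper itself is not visible in this diff; judging from the removed lines it is presumably a thin wrapper along these lines (a sketch, not the actual definition):

/* Presumed shape of wbc_to_tag(), inferred from the code it replaces. */
static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc)
{
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		return PAGECACHE_TAG_TOWRITE;
	return PAGECACHE_TAG_DIRTY;
}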
*/ - if (!list_empty_careful(&inode->i_io_list)) - inode_io_list_del(inode); + inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any @@ -425,7 +424,7 @@ void ext4_check_map_extents_env(struct inode *inode) if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; @@ -2619,10 +2618,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; @@ -3473,7 +3469,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; - return inode->i_state & I_DIRTY_DATASYNC; + return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, @@ -4552,7 +4548,7 @@ int ext4_truncate(struct inode *inode) * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ - if (!(inode->i_state & (I_NEW|I_FREEING))) + if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); @@ -5210,7 +5206,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); @@ -5549,7 +5545,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 82d5e7501455..5fd54adf0c88 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!sbi->s_journal || is_bad_inode(inode)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; @@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 775aa4f63aa3..8bf4feda42b0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = 
PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) @@ -4222,7 +4219,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; - if ((inode->i_state & I_DIRTY_DATASYNC) || + if ((inode_state_read_once(inode) & I_DIRTY_DATASYNC) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8c4eafe9ffac..f1cda1900658 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -569,7 +569,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (is_meta_ino(sbi, ino)) { f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b882771e4699..af40282a6948 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -844,7 +844,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } else { if (file) @@ -1057,7 +1057,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto put_out_dir; spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; + inode_state_clear(whiteout, I_LINKABLE); spin_unlock(&whiteout->i_lock); iput(whiteout); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index db7afb806411..47489d48f2b9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1798,7 +1798,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { + if ((!inode_unhashed(inode) && inode_state_read(inode) & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ __iget(inode); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9648ed097816..9cfe20a3daaf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1595,8 +1595,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc, setup(sb); /* flavour-specific stuff that needs options */ + error = -EINVAL; + if (!sb_min_blocksize(sb, 512)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } error = -EIO; - sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); diff --git a/fs/file.c b/fs/file.c index 28743b742e3c..3f56890068aa 100644 --- a/fs/file.c +++ b/fs/file.c @@ -641,6 +641,34 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); +/* + * Install a file pointer in the fd array while it is being resized. + * + * We need to make sure our update to the array does not get lost as the resizing + * thread can be copying the content as we modify it. + * + * We have two ways to do it: + * - go off CPU waiting for resize_in_progress to clear + * - take the spin lock + * + * The latter is trivial to implement and saves us from having to might_sleep() + * for debugging purposes. + * + * This is moved out of line from fd_install() to convince gcc to optimize that + * routine better. 
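The fd_install() change just below moves the resize-in-progress branch into a noinline helper so the hot path stays compact. A simplified userspace model of the same publish-while-resizing pattern (pthreads and C11 atomics standing in for RCU and the files_struct lock; all names here are made up) looks like:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct table {
	void *_Atomic *slots;		/* currently published slot array */
	atomic_bool resize_in_progress;
	pthread_mutex_t lock;
};

/* Kept out of line so the common path below compiles to a short sequence. */
__attribute__((noinline))
static void install_slowpath(struct table *t, size_t idx, void *obj)
{
	/* Taking the lock means a concurrent resizer cannot lose this store
	 * while it copies the old array into the new one. */
	pthread_mutex_lock(&t->lock);
	atomic_store_explicit(&t->slots[idx], obj, memory_order_release);
	pthread_mutex_unlock(&t->lock);
}

static void install(struct table *t, size_t idx, void *obj)
{
	if (atomic_load_explicit(&t->resize_in_progress,
				 memory_order_acquire)) {
		install_slowpath(t, idx, obj);
		return;
	}
	/* No resize running: a release store into the published array is
	 * enough for readers that load the slot with acquire semantics. */
	atomic_store_explicit(&t->slots[idx], obj, memory_order_release);
}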
+ */ +static void noinline fd_install_slowpath(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in @@ -658,14 +686,9 @@ void fd_install(unsigned int fd, struct file *file) return; rcu_read_lock_sched(); - if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ diff --git a/fs/file_attr.c b/fs/file_attr.c index 1dcec88c0680..4c4916632f11 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -316,7 +316,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp) err = put_user(fa.flags, argp); return err; } -EXPORT_SYMBOL(ioctl_getflags); int ioctl_setflags(struct file *file, unsigned int __user *argp) { @@ -337,7 +336,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { @@ -350,7 +348,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp) return err; } -EXPORT_SYMBOL(ioctl_fsgetxattr); int ioctl_fssetxattr(struct file *file, void __user *argp) { @@ -369,7 +366,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_fssetxattr); SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 20600e9ea202..21fc94b98209 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -258,7 +258,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) ip = iget_locked(sbp, ino); if (!ip) return ERR_PTR(-ENOMEM); - if (!(ip->i_state & I_NEW)) + if (!(inode_state_read_once(ip) & I_NEW)) return ip; vip = VXFS_INO(ip); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037fe..6800886c4d10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include <linux/sched/sysctl.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -32,11 +33,6 @@ #include "internal.h" /* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - -/* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { @@ -121,7 +117,7 @@ static bool inode_io_list_move_locked(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); list_move(&inode->i_io_list, head); @@ -200,6 +196,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " 
+ "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -212,8 +221,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -304,9 +314,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); if (wb != &wb->bdi->wb) list_move(&inode->i_io_list, &wb->b_attached); else @@ -408,7 +418,7 @@ static bool inode_do_switch_wbs(struct inode *inode, * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) + if (unlikely(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -451,7 +461,7 @@ static bool inode_do_switch_wbs(struct inode *inode, if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; - if (inode->i_state & I_DIRTY_ALL) { + if (inode_state_read(inode) & I_DIRTY_ALL) { /* * We need to keep b_dirty list sorted by * dirtied_time_when. However properly sorting the @@ -476,10 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode, switched = true; skip_switch: /* - * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * Paired with an acquire fence in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ - smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + smp_wmb(); + inode_state_clear(inode, I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); @@ -600,12 +611,12 @@ static bool inode_prepare_wbs_switch(struct inode *inode, /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } - inode->i_state |= I_WB_SWITCH; + inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); @@ -635,7 +646,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ - if (inode->i_state & I_WB_SWITCH) + if (inode_state_read_once(inode) & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ @@ -807,9 +818,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. 
+ * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) @@ -1236,9 +1247,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -1348,10 +1359,17 @@ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; + /* + * FIXME: ext4 can call here from ext4_evict_inode() after evict() already + * unlinked the inode. + */ + if (list_empty_careful(&inode->i_io_list)) + return; + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); @@ -1409,13 +1427,13 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); /* * When the inode is being freed just don't bother with dirty list * tracking. Flush worker will ignore this inode anyway and it will * trigger assertions in inode_io_list_move_locked(). */ - if (inode->i_state & I_FREEING) { + if (inode_state_read(inode) & I_FREEING) { list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); return; @@ -1449,9 +1467,9 @@ static void inode_sync_complete(struct inode *inode) { assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC; + inode_state_clear(inode, I_SYNC); /* If inode is clean an unused, put it into LRU now... */ - inode_add_lru(inode); + inode_lru_list_add(inode); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_SYNC); } @@ -1493,7 +1511,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - inode->i_state |= I_SYNC_QUEUED; + inode_state_set(inode, I_SYNC_QUEUED); spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; @@ -1579,14 +1597,14 @@ void inode_wait_for_writeback(struct inode *inode) assert_spin_locked(&inode->i_lock); - if (!(inode->i_state & I_SYNC)) + if (!(inode_state_read(inode) & I_SYNC)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ - if (!(inode->i_state & I_SYNC)) + if (!(inode_state_read(inode) & I_SYNC)) break; spin_unlock(&inode->i_lock); schedule(); @@ -1612,7 +1630,7 @@ static void inode_sleep_on_writeback(struct inode *inode) wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. 
*/ - sleep = !!(inode->i_state & I_SYNC); + sleep = !!(inode_state_read(inode) & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); @@ -1631,7 +1649,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc, unsigned long dirtied_before) { - if (inode->i_state & I_FREEING) + if (inode_state_read(inode) & I_FREEING) return; /* @@ -1639,7 +1657,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ - if ((inode->i_state & I_DIRTY) && + if ((inode_state_read(inode) & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; @@ -1650,7 +1668,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * is odd for clean inodes, it can happen for some * filesystems so handle that gracefully. */ - if (inode->i_state & I_DIRTY_ALL) + if (inode_state_read(inode) & I_DIRTY_ALL) redirty_tail_locked(inode, wb); else inode_cgwb_move_to_attached(inode, wb); @@ -1676,17 +1694,17 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, */ redirty_tail_locked(inode, wb); } - } else if (inode->i_state & I_DIRTY) { + } else if (inode_state_read(inode) & I_DIRTY) { /* * Filesystems can dirty the inode during writeback operations, * such as delayed allocation during submission or metadata * updates after data IO completion. */ redirty_tail_locked(inode, wb); - } else if (inode->i_state & I_DIRTY_TIME) { + } else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); } else { /* The inode is clean. Remove from writeback lists. */ inode_cgwb_move_to_attached(inode, wb); @@ -1712,7 +1730,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; - WARN_ON(!(inode->i_state & I_SYNC)); + WARN_ON(!(inode_state_read_once(inode) & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); @@ -1736,7 +1754,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * mark_inode_dirty_sync() to notify the filesystem about it and to * change I_DIRTY_TIME into I_DIRTY_SYNC. */ - if ((inode->i_state & I_DIRTY_TIME) && + if ((inode_state_read_once(inode) & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { @@ -1751,8 +1769,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); - dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~dirty; + dirty = inode_state_read(inode) & I_DIRTY; + inode_state_clear(inode, dirty); /* * Paired with smp_mb() in __mark_inode_dirty(). 
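Most of the writeback hunks in this file are mechanical conversions from open-coded inode->i_state accesses to inode_state_read()/inode_state_read_once()/inode_state_set()/inode_state_clear(). The kernel's actual helper definitions are not part of this diff; as a rough userspace analogue of the split between unlocked snapshots and locked updates (C11 atomics, hypothetical names), the pattern is:

#include <pthread.h>
#include <stdatomic.h>

struct obj {
	_Atomic unsigned long state;	/* writers also hold o->lock */
	pthread_mutex_t lock;
};

/* Unlocked snapshot, analogous to inode_state_read_once(): callers accept
 * that the value may already be stale by the time they test it. */
static inline unsigned long obj_state_read_once(struct obj *o)
{
	return atomic_load_explicit(&o->state, memory_order_relaxed);
}

/* The remaining helpers are meant to be called with o->lock held, mirroring
 * inode_state_read()/inode_state_set()/inode_state_clear() under i_lock. */
static inline unsigned long obj_state_read(struct obj *o)
{
	return atomic_load_explicit(&o->state, memory_order_relaxed);
}

static inline void obj_state_set(struct obj *o, unsigned long flags)
{
	atomic_fetch_or_explicit(&o->state, flags, memory_order_relaxed);
}

static inline void obj_state_clear(struct obj *o, unsigned long flags)
{
	atomic_fetch_and_explicit(&o->state, ~flags, memory_order_relaxed);
}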
This allows @@ -1768,10 +1786,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { - if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_NETFS_WB; + inode_state_set(inode, I_DIRTY_PAGES); + else if (unlikely(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + if (!(inode_state_read(inode) & I_DIRTY_PAGES)) { + inode_state_clear(inode, I_PINNING_NETFS_WB); wbc->unpinned_netfs_wb = true; dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } @@ -1807,11 +1825,11 @@ static int writeback_single_inode(struct inode *inode, spin_lock(&inode->i_lock); if (!icount_read(inode)) - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + WARN_ON(!(inode_state_read(inode) & (I_WILL_FREE | I_FREEING))); else - WARN_ON(inode->i_state & I_WILL_FREE); + WARN_ON(inode_state_read(inode) & I_WILL_FREE); - if (inode->i_state & I_SYNC) { + if (inode_state_read(inode) & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. For WB_SYNC_ALL, we @@ -1822,7 +1840,7 @@ static int writeback_single_inode(struct inode *inode, goto out; inode_wait_for_writeback(inode); } - WARN_ON(inode->i_state & I_SYNC); + WARN_ON(inode_state_read(inode) & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * @@ -1830,11 +1848,11 @@ static int writeback_single_inode(struct inode *inode, * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. */ - if (!(inode->i_state & I_DIRTY_ALL) && + if (!(inode_state_read(inode) & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; - inode->i_state |= I_SYNC; + inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); @@ -1847,18 +1865,18 @@ static int writeback_single_inode(struct inode *inode, * If the inode is freeing, its i_io_list shoudn't be updated * as it can be finally deleted at this moment. */ - if (!(inode->i_state & I_FREEING)) { + if (!(inode_state_read(inode) & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists. 
*/ - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read(inode) & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED)) { - if ((inode->i_state & I_DIRTY)) + else if (!(inode_state_read(inode) & I_SYNC_QUEUED)) { + if ((inode_state_read(inode) & I_DIRTY)) redirty_tail_locked(inode, wb); - else if (inode->i_state & I_DIRTY_TIME) { + else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, @@ -1874,8 +1892,8 @@ out: return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1893,16 +1911,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) - pages = LONG_MAX; - else { - pages = min(wb->avg_write_bandwidth / 2, - global_wb_domain.dirty_limit / DIRTY_SCOPE); - pages = min(pages, work->nr_pages); - pages = round_down(pages + MIN_WRITEBACK_PAGES, - MIN_WRITEBACK_PAGES); - } + return LONG_MAX; - return pages; + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -1967,12 +1982,12 @@ static long writeback_sb_inodes(struct super_block *sb, * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } - if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + if ((inode_state_read(inode) & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to @@ -1994,17 +2009,17 @@ static long writeback_sb_inodes(struct super_block *sb, * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ - if (inode->i_state & I_SYNC) { + if (inode_state_read(inode) & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } - inode->i_state |= I_SYNC; + inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -2014,6 +2029,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress. 
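Together with wb_wait_for_completion_cb() earlier in this file, the progress wake-up above gives waiters something to report: the waiter records when it started waiting, logs once it has been blocked longer than the hung-task timeout, and writeback_sb_inodes() wakes the waitqueue periodically so that log reflects progress. A userspace analogue of that "wait, but complain if it drags on" pattern (pthreads, hypothetical names) is:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int remaining;			/* outstanding work items */
};

static void wait_with_progress(struct completion *c, long warn_after_secs)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	pthread_mutex_lock(&c->lock);
	while (c->remaining > 0) {
		struct timespec deadline;

		/* Default condvars use CLOCK_REALTIME for the deadline. */
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += 1;	/* re-check roughly once a second */
		pthread_cond_timedwait(&c->cond, &c->lock, &deadline);

		clock_gettime(CLOCK_MONOTONIC, &now);
		if (now.tv_sec - start.tv_sec > warn_after_secs)
			fprintf(stderr, "still waiting after %ld seconds\n",
				(long)(now.tv_sec - start.tv_sec));
	}
	pthread_mutex_unlock(&c->lock);
}

Workers decrement remaining and signal the condvar when they finish a batch, which is the analogue of the wake_up_all() added above.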
*/ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; @@ -2039,7 +2060,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read(inode) & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); @@ -2545,10 +2566,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) * We tell ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - if (inode->i_state & I_DIRTY_TIME) { - inode->i_state &= ~I_DIRTY_TIME; + if (inode_state_read(inode) & I_DIRTY_TIME) { + inode_state_clear(inode, I_DIRTY_TIME); flags |= I_DIRTY_TIME; } spin_unlock(&inode->i_lock); @@ -2585,16 +2606,16 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if ((inode_state_read_once(inode) & flags) == flags) return; spin_lock(&inode->i_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + if ((inode_state_read(inode) & flags) != flags) { + const int was_dirty = inode_state_read(inode) & I_DIRTY; inode_attach_wb(inode, NULL); - inode->i_state |= flags; + inode_state_set(inode, flags); /* * Grab inode's wb early because it requires dropping i_lock and we @@ -2613,7 +2634,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * the inode it will place it on the appropriate superblock * list, based upon its state. 
*/ - if (inode->i_state & I_SYNC_QUEUED) + if (inode_state_read(inode) & I_SYNC_QUEUED) goto out_unlock; /* @@ -2624,7 +2645,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode_unhashed(inode)) goto out_unlock; } - if (inode->i_state & I_FREEING) + if (inode_state_read(inode) & I_FREEING) goto out_unlock; /* @@ -2639,7 +2660,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (dirtytime) inode->dirtied_time_when = jiffies; - if (inode->i_state & I_DIRTY) + if (inode_state_read(inode) & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; @@ -2736,7 +2757,7 @@ static void wait_sb_inodes(struct super_block *sb) spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ecaec0fea3a1..316922d5dd13 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = fc->blkbits; + blkbits = inode->i_sb->s_blocksize_bits; stat->blksize = 1 << blkbits; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f1ef77a0be05..7bcb650a9f26 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -834,23 +834,142 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, return 0; } +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + +struct fuse_fill_read_data { + struct file *file; + + /* Fields below are used if sending the read request asynchronously */ + struct fuse_conn *fc; + struct fuse_io_args *ia; + unsigned int nr_bytes; +}; + +/* forward declarations */ +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write); +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async); + +static int fuse_handle_readahead(struct folio *folio, + struct readahead_control *rac, + struct fuse_fill_read_data *data, loff_t pos, + size_t len) +{ + struct fuse_io_args *ia = data->ia; + size_t off = offset_in_folio(folio, pos); + struct fuse_conn *fc = data->fc; + struct fuse_args_pages *ap; + unsigned int nr_pages; + + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, + false)) { + fuse_send_readpages(ia, data->file, data->nr_bytes, + fc->async_read); + data->nr_bytes = 0; + data->ia = NULL; + ia = NULL; + } + if (!ia) { + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. 
+ */ + return -EAGAIN; + + nr_pages = min(fc->max_pages, readahead_count(rac)); + data->ia = fuse_io_alloc(NULL, nr_pages); + if (!data->ia) + return -ENOMEM; + ia = data->ia; + } + folio_get(folio); + ap = &ia->ap; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = off; + ap->descs[ap->num_folios].length = len; + data->nr_bytes += len; + ap->num_folios++; + + return 0; +} + +static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, + size_t len) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + struct folio *folio = ctx->cur_folio; + loff_t pos = iter->pos; + size_t off = offset_in_folio(folio, pos); + struct file *file = data->file; + int ret; + + if (ctx->rac) { + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); + } else { + /* + * for non-readahead read requests, do reads synchronously + * since it's not guaranteed that the server can handle + * out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); + } + return ret; +} + +static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + + if (data->ia) + fuse_send_readpages(data->ia, data->file, data->nr_bytes, + data->fc->async_read); +} + +static const struct iomap_read_ops fuse_iomap_read_ops = { + .read_folio_range = fuse_iomap_read_folio_range_async, + .submit_read = fuse_iomap_read_submit, +}; + static int fuse_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; - int err; + struct fuse_fill_read_data data = { + .file = file, + }; + struct iomap_read_folio_ctx ctx = { + .cur_folio = folio, + .ops = &fuse_iomap_read_ops, + .read_ctx = &data, - err = -EIO; - if (fuse_is_bad(inode)) - goto out; + }; - err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); - if (!err) - folio_mark_uptodate(folio); + if (fuse_is_bad(inode)) { + folio_unlock(folio); + return -EIO; + } + iomap_read_folio(&fuse_iomap_ops, &ctx); fuse_invalidate_atime(inode); - out: - folio_unlock(folio); - return err; + return 0; } static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, @@ -887,7 +1006,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { - folio_end_read(ap->folios[i], !err); + iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, + ap->descs[i].length, err); folio_put(ap->folios[i]); } if (ia->ff) @@ -897,7 +1017,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, - unsigned int count) + unsigned int count, bool async) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; @@ -919,7 +1039,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, fuse_read_args_fill(ia, file, pos, count, FUSE_READ); ia->read.attr_ver = fuse_get_attr_version(fm->fc); - if (fm->fc->async_read) { + if (async) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); @@ -936,81 +1056,20 @@ static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int max_pages, nr_pages; - struct folio *folio = NULL; + struct fuse_fill_read_data data = { + .file = rac->file, + 
.fc = fc, + }; + struct iomap_read_folio_ctx ctx = { + .ops = &fuse_iomap_read_ops, + .rac = rac, + .read_ctx = &data + }; if (fuse_is_bad(inode)) return; - max_pages = min_t(unsigned int, fc->max_pages, - fc->max_read / PAGE_SIZE); - - /* - * This is only accurate the first time through, since readahead_folio() - * doesn't update readahead_count() from the previous folio until the - * next call. Grab nr_pages here so we know how many pages we're going - * to have to process. This means that we will exit here with - * readahead_count() == folio_nr_pages(last_folio), but we will have - * consumed all of the folios, and read_pages() will call - * readahead_folio() again which will clean up the rac. - */ - nr_pages = readahead_count(rac); - - while (nr_pages) { - struct fuse_io_args *ia; - struct fuse_args_pages *ap; - unsigned cur_pages = min(max_pages, nr_pages); - unsigned int pages = 0; - - if (fc->num_background >= fc->congestion_threshold && - rac->ra->async_size >= readahead_count(rac)) - /* - * Congested and only async pages left, so skip the - * rest. - */ - break; - - ia = fuse_io_alloc(NULL, cur_pages); - if (!ia) - break; - ap = &ia->ap; - - while (pages < cur_pages) { - unsigned int folio_pages; - - /* - * This returns a folio with a ref held on it. - * The ref needs to be held until the request is - * completed, since the splice case (see - * fuse_try_move_page()) drops the ref after it's - * replaced in the page cache. - */ - if (!folio) - folio = __readahead_folio(rac); - - folio_pages = folio_nr_pages(folio); - if (folio_pages > cur_pages - pages) { - /* - * Large folios belonging to fuse will never - * have more pages than max_pages. - */ - WARN_ON(!pages); - break; - } - - ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].length = folio_size(folio); - ap->num_folios++; - pages += folio_pages; - folio = NULL; - } - fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); - nr_pages -= pages; - } - if (folio) { - folio_end_read(folio, false); - folio_put(folio); - } + iomap_readahead(&fuse_iomap_ops, &ctx); } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1397,20 +1456,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = { .read_folio_range = fuse_iomap_read_folio_range, }; -static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - iomap->type = IOMAP_MAPPED; - iomap->length = length; - iomap->offset = offset; - return 0; -} - -static const struct iomap_ops fuse_iomap_ops = { - .iomap_begin = fuse_iomap_begin, -}; - static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1834,7 +1879,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ - iomap_finish_folio_write(inode, ap->folios[i], 1); + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -2047,7 +2093,7 @@ struct fuse_fill_wb_data { struct fuse_file *ff; unsigned int max_folios; /* - * nr_bytes won't overflow since fuse_writepage_need_send() caps + * nr_bytes won't overflow since fuse_folios_need_send() caps * wb requests to never exceed fc->max_pages (which has an upper bound * of U16_MAX). 
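fuse_folios_need_send() generalises the old fuse_writepage_need_send() so the same batching decision serves both reads (capped by max_read) and writes (capped by max_write), while the pages-array growth check moves out to the writeback caller. Stripped of the fuse types, the decision reduces to three independent limits (a hypothetical standalone version, not the kernel function):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

/* Decide whether the folio at (pos, len) can be appended to the batch that
 * currently ends at prev_end and holds cur_bytes, or whether the batch must
 * be sent first: page-count cap, byte cap (max_read or max_write depending
 * on direction), and contiguity. */
static bool need_send(uint64_t pos, size_t len, uint64_t prev_end,
		      size_t cur_bytes, size_t max_pages, size_t max_bytes)
{
	size_t bytes = cur_bytes + len;

	if ((bytes + PAGE_SIZE - 1) / PAGE_SIZE > max_pages)
		return true;		/* would exceed the request page cap */
	if (bytes > max_bytes)
		return true;		/* would exceed max_read/max_write */
	if (prev_end != pos)
		return true;		/* discontiguous: start a new request */
	return false;
}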
*/ @@ -2092,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode, spin_unlock(&fi->lock); } -static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, - unsigned len, struct fuse_args_pages *ap, - struct fuse_fill_wb_data *data) +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write) { struct folio *prev_folio; struct fuse_folio_desc prev_desc; - unsigned bytes = data->nr_bytes + len; + unsigned bytes = cur_bytes + len; loff_t prev_pos; + size_t max_bytes = write ? fc->max_write : fc->max_read; WARN_ON(!ap->num_folios); @@ -2107,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) return true; - /* Reached max write bytes */ - if (bytes > fc->max_write) + if (bytes > max_bytes) return true; /* Discontinuity */ @@ -2118,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, if (prev_pos != pos) return true; - /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_folios == data->max_folios && - !fuse_pages_realloc(data, fc->max_pages)) - return true; - return false; } @@ -2146,10 +2187,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, return -EIO; } - if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { - fuse_writepages_send(inode, data); - data->wpa = NULL; - data->nr_bytes = 0; + if (wpa) { + bool send = fuse_folios_need_send(fc, pos, len, ap, + data->nr_bytes, true); + + if (!send) { + /* + * Need to grow the pages array? If so, did the + * expansion fail? + */ + send = (ap->num_folios == data->max_folios) && + !fuse_pages_realloc(data, fc->max_pages); + } + + if (send) { + fuse_writepages_send(inode, data); + data->wpa = NULL; + data->nr_bytes = 0; + } } if (data->wpa == NULL) { @@ -2161,7 +2216,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, ap = &wpa->ia.ap; } - iomap_start_folio_write(inode, folio, 1); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c2f2a48156d6..f616c1991fed 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -981,14 +981,6 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; - - /* - * This is a workaround until fuse uses iomap for reads. - * For fuseblk servers, this represents the blocksize passed in at - * mount time and for regular fuse servers, this is equivalent to - * inode->i_blkbits. 
- */ - u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d1babf56f254..1a397be53f49 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -160,7 +160,7 @@ static void fuse_evict_inode(struct inode *inode) struct fuse_inode *fi = get_fuse_inode(inode); /* Will write inode on close/munmap and in all other dirtiers */ - WARN_ON(inode->i_state & I_DIRTY_INODE); + WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE); if (FUSE_IS_DAX(inode)) dax_break_layout_final(inode); @@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (attr->blksize) fi->cached_i_blkbits = ilog2(attr->blksize); else - fi->cached_i_blkbits = fc->blkbits; + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; /* * Don't set the sticky bit in i_mode, unless we want the VFS @@ -505,7 +505,7 @@ retry: if (!inode) return NULL; - if ((inode->i_state & I_NEW)) { + if ((inode_state_read_once(inode) & I_NEW)) { inode->i_flags |= S_NOATIME; if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; @@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; - /* - * This is a workaround until fuse hooks into iomap for reads. - * Use PAGE_SIZE for the blocksize else if the writeback cache - * is enabled, buffered writes go through iomap and a read may - * overwrite partially written data if blocksize < PAGE_SIZE - */ - fc->blkbits = sb->s_blocksize_bits; - if (ctx->blksize != PAGE_SIZE && - !sb_set_blocksize(sb, PAGE_SIZE)) - goto err; #endif fc->sync_fs = 1; } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6bc7c97b017d..b2f6486fe1d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) sprintf(buff, "%d", i); fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); - if (!fs->mqs_kobj) { + if (!fsvq->kobj) { ret = -ENOMEM; goto out_del; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..e2b1c860664d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -311,10 +311,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) @@ -424,11 +421,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int error = 0; if (!gfs2_is_jdata(ip) || (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { - error = iomap_read_folio(folio, &gfs2_iomap_ops); + iomap_bio_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_read_folio(ip, folio); } else { @@ -503,7 +500,7 @@ static void gfs2_readahead(struct readahead_control *rac) else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); else - iomap_readahead(rac, &gfs2_iomap_ops); + iomap_bio_readahead(rac, &gfs2_iomap_ops); } /** diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index bc67fa058c84..ee92f5910ae1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ 
-744,7 +744,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode_state_read_once(inode) & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b677c0e6b9ab..c9712235e7a0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { - wait_on_inode(&ip->i_inode); + wait_on_new_inode(&ip->i_inode); if (is_bad_inode(&ip->i_inode)) { iput(&ip->i_inode); ip = NULL; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0c0a80b3baca..c94e42b0c94d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -394,7 +394,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); struct inode *inode = &ip->i_inode; - bool is_new = inode->i_state & I_NEW; + bool is_new = inode_state_read_once(inode) & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) { gfs2_consist_inode(ip); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8a7ed80d9f2d..890c87e3e365 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -127,7 +127,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; int extra_flags = 0; @@ -924,7 +924,7 @@ fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(&d_gh); if (!IS_ERR_OR_NULL(inode)) { - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) iget_failed(inode); else iput(inode); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index aa15183f9a16..889682f051ea 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1751,7 +1751,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) && !need_resched()) { spin_unlock(&inode->i_lock); continue; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 22e62fe7448b..54c20d01c342 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -42,7 +42,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->inode = iget_locked(sb, id); if (!tree->inode) goto free_tree; - BUG_ON(!(tree->inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(tree->inode) & I_NEW)); { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9cd449913dc8..81ad93e6312f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -412,7 +412,7 @@ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_ return NULL; } inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); - if (inode && (inode->i_state & I_NEW)) + if (inode && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 16bc4abc67e0..54e85e25a259 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -65,7 +65,7 @@ struct inode *hfsplus_iget(struct super_block *sb, 
unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; atomic_set(&HFSPLUS_I(inode)->opencnt, 0); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1e1acf5775ab..51d26aa2b93e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -581,7 +581,7 @@ static struct inode *hostfs_iget(struct super_block *sb, char *name) if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { unlock_new_inode(inode); } else { spin_lock(&inode->i_lock); @@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hostfs_fs_info *fsi = fc->s_fs_info; struct fs_parse_result result; - char *host_root; + char *host_root, *tmp_root; int opt; opt = fs_parse(fc, hostfs_param_specs, param, &result); @@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_hostfs: host_root = param->string; if (!*host_root) - host_root = ""; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + break; + tmp_root = kasprintf(GFP_KERNEL, "%s%s", + fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; break; } @@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) static int hostfs_parse_monolithic(struct fs_context *fc, void *data) { struct hostfs_fs_info *fsi = fc->s_fs_info; - char *host_root = (char *)data; + char *tmp_root, *host_root = (char *)data; /* NULL is printed as '(null)' by printf(): avoid that. 
*/ if (host_root == NULL) - host_root = ""; + return 0; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; - + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; return 0; } @@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc) if (!fsi) return -ENOMEM; + fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino); + if (!fsi->host_root_path) { + kfree(fsi); + return -ENOMEM; + } fc->s_fs_info = fsi; fc->ops = &hostfs_context_ops; return 0; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 49dd585c2b17..ceb50b2dc91a 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in result = ERR_PTR(-ENOMEM); goto bail1; } - if (result->i_state & I_NEW) { + if (inode_state_read_once(result) & I_NEW) { hpfs_init_inode(result); if (de->directory) hpfs_read_inode(result); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 34008442ee26..93d528f4f4f2 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -196,7 +196,7 @@ void hpfs_write_inode(struct inode *i) parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); if (parent) { hpfs_inode->i_dirty = 0; - if (parent->i_state & I_NEW) { + if (inode_state_read_once(parent) & I_NEW) { hpfs_init_inode(parent); hpfs_read_inode(parent); unlock_new_inode(parent); diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..cc8265cfe80e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -233,7 +233,7 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; - inode->i_state = 0; + inode_state_assign_raw(inode, 0); atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; @@ -471,7 +471,7 @@ EXPORT_SYMBOL(set_nlink); void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { - WARN_ON(!(inode->i_state & I_LINKABLE)); + WARN_ON(!(inode_state_read_once(inode) & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } @@ -530,9 +530,48 @@ void ihold(struct inode *inode) } EXPORT_SYMBOL(ihold); -static void __inode_add_lru(struct inode *inode, bool rotate) +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit) +{ + void *bit_address; + + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); +} +EXPORT_SYMBOL(inode_bit_waitqueue); + +void wait_on_new_inode(struct inode *inode) +{ + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + spin_lock(&inode->i_lock); + if (!(inode_state_read(inode) & I_NEW)) { + spin_unlock(&inode->i_lock); + return; + } + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + if (!(inode_state_read(inode) & I_NEW)) + break; + spin_unlock(&inode->i_lock); + schedule(); + spin_lock(&inode->i_lock); + } + finish_wait(wq_head, &wqe.wq_entry); + WARN_ON(inode_state_read(inode) & I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(wait_on_new_inode); + +static void __inode_lru_list_add(struct inode *inode, bool rotate) { - if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + lockdep_assert_held(&inode->i_lock); + + if 
(inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (icount_read(inode)) return; @@ -544,32 +583,22 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) - inode->i_state |= I_REFERENCED; -} - -struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, - struct inode *inode, u32 bit) -{ - void *bit_address; - - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + inode_state_set(inode, I_REFERENCED); } -EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). - * - * Needs inode->i_lock held. */ -void inode_add_lru(struct inode *inode) +void inode_lru_list_add(struct inode *inode) { - __inode_add_lru(inode, false); + __inode_lru_list_add(inode, false); } static void inode_lru_list_del(struct inode *inode) { + if (list_empty(&inode->i_lru)) + return; + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -577,15 +606,15 @@ static void inode_lru_list_del(struct inode *inode) static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); - WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); - inode->i_state |= I_LRU_ISOLATING; + WARN_ON(inode_state_read(inode) & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode_state_set(inode, I_LRU_ISOLATING); } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); - inode->i_state &= ~I_LRU_ISOLATING; + WARN_ON(!(inode_state_read(inode) & I_LRU_ISOLATING)); + inode_state_clear(inode, I_LRU_ISOLATING); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); @@ -597,7 +626,7 @@ static void inode_wait_for_lru_isolating(struct inode *inode) struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); - if (!(inode->i_state & I_LRU_ISOLATING)) + if (!(inode_state_read(inode) & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); @@ -607,14 +636,14 @@ static void inode_wait_for_lru_isolating(struct inode *inode) * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. 
*/ - if (!(inode->i_state & I_LRU_ISOLATING)) + if (!(inode_state_read(inode) & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); - WARN_ON(inode->i_state & I_LRU_ISOLATING); + WARN_ON(inode_state_read(inode) & I_LRU_ISOLATING); } /** @@ -761,11 +790,11 @@ void clear_inode(struct inode *inode) */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); - BUG_ON(!(inode->i_state & I_FREEING)); - BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); + BUG_ON(inode_state_read_once(inode) & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ - inode->i_state = I_FREEING | I_CLEAR; + inode_state_assign_raw(inode, I_FREEING | I_CLEAR); } EXPORT_SYMBOL(clear_inode); @@ -786,12 +815,10 @@ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; - BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_io_list)) - inode_io_list_del(inode); - + inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); @@ -829,7 +856,7 @@ static void evict(struct inode *inode) * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR)); destroy_inode(inode); } @@ -879,12 +906,12 @@ again: spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } - inode->i_state |= I_FREEING; + inode_state_set(inode, I_FREEING); inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); @@ -938,7 +965,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * sync, or the last page cache deletion will requeue them. 
*/ if (icount_read(inode) || - (inode->i_state & ~I_REFERENCED) || + (inode_state_read(inode) & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); @@ -947,8 +974,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, } /* Recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; + if (inode_state_read(inode) & I_REFERENCED) { + inode_state_clear(inode, I_REFERENCED); spin_unlock(&inode->i_lock); return LRU_ROTATE; } @@ -975,8 +1002,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_RETRY; } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_set(inode, I_FREEING); list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); @@ -1008,7 +1035,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, bool is_inode_hash_locked) + void *data, bool is_inode_hash_locked, + bool *isnew) { struct inode *inode = NULL; @@ -1025,16 +1053,17 @@ repeat: if (!test(inode, data)) continue; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } - if (unlikely(inode->i_state & I_CREATING)) { + if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); + *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; @@ -1049,7 +1078,7 @@ repeat: */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, - bool is_inode_hash_locked) + bool is_inode_hash_locked, bool *isnew) { struct inode *inode = NULL; @@ -1066,16 +1095,17 @@ repeat: if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } - if (unlikely(inode->i_state & I_CREATING)) { + if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); + *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; @@ -1180,14 +1210,8 @@ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. 
- */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } @@ -1197,14 +1221,8 @@ void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); @@ -1260,6 +1278,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set + * @isnew: pointer to a bool which will indicate whether I_NEW is set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a @@ -1278,12 +1297,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + bool isnew; might_sleep(); again: spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data, true); + old = find_inode(inode->i_sb, head, test, data, true, &isnew); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. @@ -1292,7 +1312,8 @@ again: spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; - wait_on_inode(old); + if (unlikely(isnew)) + wait_on_new_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; @@ -1310,7 +1331,7 @@ again: * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; + inode_state_set(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); @@ -1383,15 +1404,17 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + bool isnew; might_sleep(); again: - inode = find_inode(sb, head, test, data, false); + inode = find_inode(sb, head, test, data, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1426,15 +1449,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + bool isnew; might_sleep(); again: - inode = find_inode_fast(sb, head, ino, false); + inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1448,11 +1473,11 @@ again: spin_lock(&inode_hash_lock); /* We released the lock, so.. 
*/ - old = find_inode_fast(sb, head, ino, true); + old = find_inode_fast(sb, head, ino, true, &isnew); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); - inode->i_state = I_NEW; + inode_state_assign(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); @@ -1474,7 +1499,8 @@ again: if (IS_ERR(old)) return NULL; inode = old; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1545,7 +1571,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { @@ -1578,13 +1604,13 @@ EXPORT_SYMBOL(igrab); * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) + int (*test)(struct inode *, void *), void *data, bool *isnew) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data, true); + inode = find_inode(sb, head, test, data, true, isnew); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; @@ -1612,13 +1638,15 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + bool isnew; might_sleep(); again: - inode = ilookup5_nowait(sb, hashval, test, data); + inode = ilookup5_nowait(sb, hashval, test, data, &isnew); if (inode) { - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1640,16 +1668,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + bool isnew; might_sleep(); again: - inode = find_inode_fast(sb, head, ino, false); + inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1741,7 +1771,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } @@ -1780,7 +1810,7 @@ struct inode *find_inode_by_ino_rcu(struct super_block *sb, hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) + !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; @@ -1792,6 +1822,7 @@ int insert_inode_locked(struct inode *inode) struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + bool isnew; might_sleep(); @@ -1804,7 +1835,7 @@ int insert_inode_locked(struct inode *inode) if (old->i_sb != sb) continue; spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } @@ -1812,21 
+1843,23 @@ int insert_inode_locked(struct inode *inode) } if (likely(!old)) { spin_lock(&inode->i_lock); - inode->i_state |= I_NEW | I_CREATING; + inode_state_set(inode, I_NEW | I_CREATING); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } - if (unlikely(old->i_state & I_CREATING)) { + if (unlikely(inode_state_read(old) & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); + isnew = !!(inode_state_read(old) & I_NEW); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); - wait_on_inode(old); + if (isnew) + wait_on_new_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; @@ -1843,7 +1876,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, might_sleep(); - inode->i_state |= I_CREATING; + inode_state_set_raw(inode, I_CREATING); old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { @@ -1875,10 +1908,10 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; - unsigned long state; int drop; - WARN_ON(inode->i_state & I_NEW); + WARN_ON(inode_state_read(inode) & I_NEW); + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); if (op->drop_inode) drop = op->drop_inode(inode); @@ -1886,29 +1919,33 @@ static void iput_final(struct inode *inode) drop = inode_generic_drop(inode); if (!drop && - !(inode->i_state & I_DONTCACHE) && + !(inode_state_read(inode) & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { - __inode_add_lru(inode, true); + __inode_lru_list_add(inode, true); spin_unlock(&inode->i_lock); return; } - state = inode->i_state; - if (!drop) { - WRITE_ONCE(inode->i_state, state | I_WILL_FREE); + /* + * Re-check ->i_count in case the ->drop_inode() hooks played games. + * Note we only execute this if the verdict was to drop the inode. 
+ */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + + if (drop) { + inode_state_set(inode, I_FREEING); + } else { + inode_state_set(inode, I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); - state = inode->i_state; - WARN_ON(state & I_NEW); - state &= ~I_WILL_FREE; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_replace(inode, I_WILL_FREE, I_FREEING); } - WRITE_ONCE(inode->i_state, state | I_FREEING); - if (!list_empty(&inode->i_lru)) - inode_lru_list_del(inode); + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); @@ -1931,7 +1968,7 @@ void iput(struct inode *inode) retry: lockdep_assert_not_held(&inode->i_lock); - VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + VFS_BUG_ON_INODE(inode_state_read_once(inode) & I_CLEAR, inode); /* * Note this assert is technically racy as if the count is bogusly * equal to one, then two CPUs racing to further drop it can both @@ -1942,14 +1979,14 @@ retry: if (atomic_add_unless(&inode->i_count, -1, 1)) return; - if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + if ((inode_state_read_once(inode) & I_DIRTY_TIME) && inode->i_nlink) { trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } spin_lock(&inode->i_lock); - if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + if (unlikely((inode_state_read(inode) & I_DIRTY_TIME) && inode->i_nlink)) { spin_unlock(&inode->i_lock); goto retry; } @@ -1967,6 +2004,18 @@ retry: } EXPORT_SYMBOL(iput); +/** + * iput_not_last - put an inode assuming this is not the last reference + * @inode: inode to put + */ +void iput_not_last(struct inode *inode) +{ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + + WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); +} +EXPORT_SYMBOL(iput_not_last); + #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file @@ -2310,42 +2359,40 @@ out: } EXPORT_SYMBOL(current_time); -static int inode_needs_update_time(struct inode *inode) +static int file_update_time_flags(struct file *file, unsigned int flags) { + struct inode *inode = file_inode(file); struct timespec64 now, ts; - int sync_it = 0; + int sync_mode = 0; + int ret = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + if (unlikely(file->f_mode & FMODE_NOCMTIME)) + return 0; now = current_time(inode); ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_MTIME; - + sync_mode |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_CTIME; - + sync_mode |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) - sync_it |= S_VERSION; + sync_mode |= S_VERSION; - return sync_it; -} - -static int __file_update_time(struct file *file, int sync_mode) -{ - int ret = 0; - struct inode *inode = file_inode(file); + if (!sync_mode) + return 0; - /* try to update time settings */ - if (!mnt_get_write_access_file(file)) { - ret = inode_update_time(inode, sync_mode); - mnt_put_write_access_file(file); - } + if (flags & IOCB_NOWAIT) + return -EAGAIN; + if (mnt_get_write_access_file(file)) + return 0; + ret = inode_update_time(inode, sync_mode); + mnt_put_write_access_file(file); return ret; } @@ -2365,14 +2412,7 @@ static int __file_update_time(struct file *file, int sync_mode) */ int file_update_time(struct file *file) { - int ret; - struct inode *inode = file_inode(file); - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - - return __file_update_time(file, 
ret); + return file_update_time_flags(file, 0); } EXPORT_SYMBOL(file_update_time); @@ -2394,7 +2434,6 @@ EXPORT_SYMBOL(file_update_time); static int file_modified_flags(struct file *file, int flags) { int ret; - struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. @@ -2403,17 +2442,7 @@ static int file_modified_flags(struct file *file, int flags) ret = file_remove_privs_flags(file, flags); if (ret) return ret; - - if (unlikely(file->f_mode & FMODE_NOCMTIME)) - return 0; - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - if (flags & IOCB_NOWAIT) - return -EAGAIN; - - return __file_update_time(file, ret); + return file_update_time_flags(file, flags); } /** @@ -2970,7 +2999,7 @@ void dump_inode(struct inode *inode, const char *reason) pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); + inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index f7e1c8534c46..a572b8808524 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -14,5 +14,6 @@ iomap-y += trace.o \ iomap-$(CONFIG_BLOCK) += direct-io.o \ ioend.o \ fiemap.o \ - seek.o + seek.o \ + bio.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c new file mode 100644 index 000000000000..fc045f2e4c45 --- /dev/null +++ b/fs/iomap/bio.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2016-2023 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/pagemap.h> +#include "internal.h" +#include "trace.h" + +static void iomap_read_end_io(struct bio *bio) +{ + int error = blk_status_to_errno(bio->bi_status); + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); + bio_put(bio); +} + +static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +{ + struct bio *bio = ctx->read_ctx; + + if (bio) + submit_bio(bio); +} + +static int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen) +{ + struct folio *folio = ctx->cur_folio; + const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; + size_t poff = offset_in_folio(folio, pos); + loff_t length = iomap_length(iter); + sector_t sector; + struct bio *bio = ctx->read_ctx; + + sector = iomap_sector(iomap, pos); + if (!bio || bio_end_sector(bio) != sector || + !bio_add_folio(bio, folio, plen, poff)) { + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); + gfp_t orig_gfp = gfp; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); + + if (bio) + submit_bio(bio); + + if (ctx->rac) /* same as readahead_gfp_mask */ + gfp |= __GFP_NORETRY | __GFP_NOWARN; + bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp); + /* + * If the bio_alloc fails, try it again for a single page to + * avoid having to deal with partial page reads. This emulates + * what do_mpage_read_folio does. 
+ */ + if (!bio) + bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); + if (ctx->rac) + bio->bi_opf |= REQ_RAHEAD; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = iomap_read_end_io; + bio_add_folio_nofail(bio, folio, plen, poff); + ctx->read_ctx = bio; + } + return 0; +} + +const struct iomap_read_ops iomap_bio_read_ops = { + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, +}; +EXPORT_SYMBOL_GPL(iomap_bio_read_ops); + +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct bio_vec bvec; + struct bio bio; + + bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); + bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); + return submit_bio_wait(&bio); +} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8b847a1e27f1..f68fc6ac15e0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -8,6 +8,7 @@ #include <linux/writeback.h> #include <linux/swap.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -37,10 +38,28 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio, return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, - unsigned int block) +/* + * Find the next uptodate block in the folio. end_blk is inclusive. + * If no uptodate block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_uptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + + return find_next_bit(ifs->state, end_blk + 1, start_blk); +} + +/* + * Find the next non-uptodate block in the folio. end_blk is inclusive. + * If no non-uptodate block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_nonuptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { - return test_bit(block, ifs->state); + struct iomap_folio_state *ifs = folio->private; + + return find_next_zero_bit(ifs->state, end_blk + 1, start_blk); } static bool ifs_set_range_uptodate(struct folio *folio, @@ -75,13 +94,34 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, folio_mark_uptodate(folio); } -static inline bool ifs_block_is_dirty(struct folio *folio, - struct iomap_folio_state *ifs, int block) +/* + * Find the next dirty block in the folio. end_blk is inclusive. + * If no dirty block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_dirty_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int blks = i_blocks_per_folio(inode, folio); + + return find_next_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; +} + +/* + * Find the next clean block in the folio. end_blk is inclusive. + * If no clean block is found, this will return end_blk + 1. 
+ */ +static unsigned ifs_next_clean_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + unsigned int blks = i_blocks_per_folio(inode, folio); - return test_bit(block + blks_per_folio, ifs->state); + return find_next_zero_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; } static unsigned ifs_find_dirty_range(struct folio *folio, @@ -92,18 +132,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio, offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, - i_blocks_per_folio(inode, folio)); - unsigned nblks = 1; + i_blocks_per_folio(inode, folio)) - 1; + unsigned nblks; - while (!ifs_block_is_dirty(folio, ifs, start_blk)) - if (++start_blk == end_blk) - return 0; - - while (start_blk + nblks < end_blk) { - if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) - break; - nblks++; - } + start_blk = ifs_next_dirty_block(folio, start_blk, end_blk); + if (start_blk > end_blk) + return 0; + if (start_blk == end_blk) + nblks = 1; + else + nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) - + start_blk; *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; @@ -218,6 +257,22 @@ static void ifs_free(struct folio *folio) } /* + * Calculate how many bytes to truncate based off the number of blocks to + * truncate and the end position to start truncating from. + */ +static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits, + unsigned blocks_truncated) +{ + unsigned block_size = 1 << block_bits; + unsigned block_offset = end_pos & (block_size - 1); + + if (!block_offset) + return blocks_truncated << block_bits; + + return ((blocks_truncated - 1) << block_bits) + block_offset; +} + +/* * Calculate the range inside the folio that we actually need to read. */ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, @@ -240,24 +295,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * to avoid reading in already uptodate ranges. 
*/ if (ifs) { - unsigned int i; - - /* move forward for each leading block marked uptodate */ - for (i = first; i <= last; i++) { - if (!ifs_block_is_uptodate(ifs, i)) - break; - *pos += block_size; - poff += block_size; - plen -= block_size; - first++; + unsigned int next, blocks_skipped; + + next = ifs_next_nonuptodate_block(folio, first, last); + blocks_skipped = next - first; + + if (blocks_skipped) { + unsigned long block_offset = *pos & (block_size - 1); + unsigned bytes_skipped = + (blocks_skipped << block_bits) - block_offset; + + *pos += bytes_skipped; + poff += bytes_skipped; + plen -= bytes_skipped; } + first = next; /* truncate len if we find any trailing uptodate block(s) */ - while (++i <= last) { - if (ifs_block_is_uptodate(ifs, i)) { - plen -= (last - i + 1) * block_size; - last = i - 1; - break; + if (++next <= last) { + next = ifs_next_uptodate_block(folio, next, last); + if (next <= last) { + plen -= iomap_bytes_to_truncate(*pos + plen, + block_bits, last - next + 1); + last = next - 1; } } } @@ -271,7 +331,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) - plen -= (last - end) * block_size; + plen -= iomap_bytes_to_truncate(*pos + plen, block_bits, + last - end); } *offp = poff; @@ -320,9 +381,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, return 0; } -#ifdef CONFIG_BLOCK -static void iomap_finish_folio_read(struct folio *folio, size_t off, - size_t len, int error) +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; @@ -342,169 +402,201 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (finished) folio_end_read(folio, uptodate); } +EXPORT_SYMBOL_GPL(iomap_finish_folio_read); -static void iomap_read_end_io(struct bio *bio) +static void iomap_read_init(struct folio *folio) { - int error = blk_status_to_errno(bio->bi_status); - struct folio_iter fi; + struct iomap_folio_state *ifs = folio->private; - bio_for_each_folio_all(fi, bio) - iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); - bio_put(bio); + if (ifs) { + size_t len = folio_size(folio); + + /* + * ifs->read_bytes_pending is used to track how many bytes are + * read in asynchronously by the IO helper. We need to track + * this so that we can know when the IO helper has finished + * reading in all the necessary ranges of the folio and can end + * the read. + * + * Increase ->read_bytes_pending by the folio size to start, and + * add a +1 bias. We'll subtract the bias and any uptodate / + * zeroed ranges that did not require IO in iomap_read_end() + * after we're done processing the folio. + * + * We do this because otherwise, we would have to increment + * ifs->read_bytes_pending every time a range in the folio needs + * to be read in, which can get expensive since the spinlock + * needs to be held whenever modifying ifs->read_bytes_pending. + * + * We add the bias to ensure the read has not been ended on the + * folio when iomap_read_end() is called, even if the IO helper + * has already finished reading in the entire folio. 
+ */ + spin_lock_irq(&ifs->state_lock); + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + ifs->read_bytes_pending = len + 1; + spin_unlock_irq(&ifs->state_lock); + } } -struct iomap_readpage_ctx { - struct folio *cur_folio; - bool cur_folio_in_bio; - struct bio *bio; - struct readahead_control *rac; -}; +/* + * This ends IO if no bytes were submitted to an IO helper. + * + * Otherwise, this calibrates ifs->read_bytes_pending to represent only the + * submitted bytes (see comment in iomap_read_init()). If all bytes submitted + * have already been completed by the IO helper, then this will end the read. + * Else the IO helper will end the read after all submitted ranges have been + * read. + */ +static void iomap_read_end(struct folio *folio, size_t bytes_submitted) +{ + struct iomap_folio_state *ifs = folio->private; -static int iomap_readpage_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) + if (ifs) { + bool end_read, uptodate; + + spin_lock_irq(&ifs->state_lock); + if (!ifs->read_bytes_pending) { + WARN_ON_ONCE(bytes_submitted); + spin_unlock_irq(&ifs->state_lock); + folio_unlock(folio); + return; + } + + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. The +1 accounts for + * the bias we added in iomap_read_init(). + */ + ifs->read_bytes_pending -= + (folio_size(folio) + 1 - bytes_submitted); + + /* + * If !ifs->read_bytes_pending, this means all pending reads by + * the IO helper have already completed, which means we need to + * end the folio read here. If ifs->read_bytes_pending != 0, + * the IO helper will end the folio read. + */ + end_read = !ifs->read_bytes_pending; + if (end_read) + uptodate = ifs_is_fully_uptodate(folio, ifs); + spin_unlock_irq(&ifs->state_lock); + if (end_read) + folio_end_read(folio, uptodate); + } else if (!bytes_submitted) { + /* + * If there were no bytes submitted, this means we are + * responsible for unlocking the folio here, since no IO helper + * has taken ownership of it. If there were bytes submitted, + * then the IO helper will end the read via + * iomap_finish_folio_read(). 
+ */ + folio_unlock(folio); + } +} + +static int iomap_read_folio_iter(struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted) { const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos; loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; - struct iomap_folio_state *ifs; size_t poff, plen; - sector_t sector; + loff_t pos_diff; int ret; if (iomap->type == IOMAP_INLINE) { ret = iomap_read_inline_data(iter, folio); if (ret) return ret; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } - /* zero post-eof blocks as the page may be mapped */ - ifs = ifs_alloc(iter->inode, folio, iter->flags); - iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); - if (plen == 0) - goto done; + ifs_alloc(iter->inode, folio, iter->flags); - if (iomap_block_needs_zeroing(iter, pos)) { - folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, poff, plen); - goto done; - } + length = min_t(loff_t, length, + folio_size(folio) - offset_in_folio(folio, pos)); + while (length) { + iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, + &plen); - ctx->cur_folio_in_bio = true; - if (ifs) { - spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += plen; - spin_unlock_irq(&ifs->state_lock); - } + pos_diff = pos - iter->pos; + if (WARN_ON_ONCE(pos_diff + plen > length)) + return -EIO; - sector = iomap_sector(iomap, pos); - if (!ctx->bio || - bio_end_sector(ctx->bio) != sector || - !bio_add_folio(ctx->bio, folio, plen, poff)) { - gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - gfp_t orig_gfp = gfp; - unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - - if (ctx->bio) - submit_bio(ctx->bio); - - if (ctx->rac) /* same as readahead_gfp_mask */ - gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), - REQ_OP_READ, gfp); - /* - * If the bio_alloc fails, try it again for a single page to - * avoid having to deal with partial page reads. This emulates - * what do_mpage_read_folio does. - */ - if (!ctx->bio) { - ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, - orig_gfp); - } - if (ctx->rac) - ctx->bio->bi_opf |= REQ_RAHEAD; - ctx->bio->bi_iter.bi_sector = sector; - ctx->bio->bi_end_io = iomap_read_end_io; - bio_add_folio_nofail(ctx->bio, folio, plen, poff); - } + ret = iomap_iter_advance(iter, pos_diff); + if (ret) + return ret; -done: - /* - * Move the caller beyond our range so that it keeps making progress. - * For that, we have to include any leading non-uptodate ranges, but - * we can skip trailing ones as they will be handled in the next - * iteration. 
- */ - length = pos - iter->pos + plen; - return iomap_iter_advance(iter, &length); -} + if (plen == 0) + return 0; -static int iomap_read_folio_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) -{ - int ret; + /* zero post-eof blocks as the page may be mapped */ + if (iomap_block_needs_zeroing(iter, pos)) { + folio_zero_range(folio, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); + } else { + if (!*bytes_submitted) + iomap_read_init(folio); + ret = ctx->ops->read_folio_range(iter, ctx, plen); + if (ret) + return ret; + *bytes_submitted += plen; + } - while (iomap_length(iter)) { - ret = iomap_readpage_iter(iter, ctx); + ret = iomap_iter_advance(iter, plen); if (ret) return ret; + length -= pos_diff + plen; + pos = iter->pos; } - return 0; } -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) +void iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct folio *folio = ctx->cur_folio; struct iomap_iter iter = { .inode = folio->mapping->host, .pos = folio_pos(folio), .len = folio_size(folio), }; - struct iomap_readpage_ctx ctx = { - .cur_folio = folio, - }; + size_t bytes_submitted = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, ctx, + &bytes_submitted); - if (ctx.bio) { - submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_folio_in_bio); - } else { - WARN_ON_ONCE(ctx.cur_folio_in_bio); - folio_unlock(folio); - } + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - /* - * Just like mpage_readahead and block_read_full_folio, we always - * return 0 and just set the folio error flag on errors. This - * should be cleaned up throughout the stack eventually. - */ - return 0; + iomap_read_end(folio, bytes_submitted); } EXPORT_SYMBOL_GPL(iomap_read_folio); static int iomap_readahead_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) + struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted) { int ret; while (iomap_length(iter)) { if (ctx->cur_folio && offset_in_folio(ctx->cur_folio, iter->pos) == 0) { - if (!ctx->cur_folio_in_bio) - folio_unlock(ctx->cur_folio); + iomap_read_end(ctx->cur_folio, *cur_bytes_submitted); ctx->cur_folio = NULL; } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); - ctx->cur_folio_in_bio = false; + if (WARN_ON_ONCE(!ctx->cur_folio)) + return -EINVAL; + *cur_bytes_submitted = 0; } - ret = iomap_readpage_iter(iter, ctx); + ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted); if (ret) return ret; } @@ -514,8 +606,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter, /** * iomap_readahead - Attempt to read pages from a file. - * @rac: Describes the pages to be read. * @ops: The operations vector for the filesystem. + * @ctx: The ctx used for issuing readahead. * * This function is for filesystems to call to implement their readahead * address_space operation. @@ -527,51 +619,30 @@ static int iomap_readahead_iter(struct iomap_iter *iter, * function is called with memalloc_nofs set, so allocations will not cause * the filesystem to be reentered. 
*/ -void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct readahead_control *rac = ctx->rac; struct iomap_iter iter = { .inode = rac->mapping->host, .pos = readahead_pos(rac), .len = readahead_length(rac), }; - struct iomap_readpage_ctx ctx = { - .rac = rac, - }; + size_t cur_bytes_submitted; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.status = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, ctx, + &cur_bytes_submitted); - if (ctx.bio) - submit_bio(ctx.bio); - if (ctx.cur_folio) { - if (!ctx.cur_folio_in_bio) - folio_unlock(ctx.cur_folio); - } -} -EXPORT_SYMBOL_GPL(iomap_readahead); - -static int iomap_read_folio_range(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct bio_vec bvec; - struct bio bio; + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); - bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); - return submit_bio_wait(&bio); + if (ctx->cur_folio) + iomap_read_end(ctx->cur_folio, cur_bytes_submitted); } -#else -static int iomap_read_folio_range(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - WARN_ON_ONCE(1); - return -EIO; -} -#endif /* CONFIG_BLOCK */ +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a folio are @@ -584,7 +655,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned first, last, i; + unsigned first, last; if (!ifs) return false; @@ -596,10 +667,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) first = from >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits; - for (i = first; i <= last; i++) - if (!ifs_block_is_uptodate(ifs, i)) - return false; - return true; + return ifs_next_nonuptodate_block(folio, first, last) > last; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); @@ -723,9 +791,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, if (plen == 0) break; - if (!(iter->flags & IOMAP_UNSHARE) && - (from <= poff || from >= poff + plen) && - (to <= poff || to >= poff + plen)) + /* + * If the read range will be entirely overwritten by the write, + * we can skip having to zero/read it in. 
+ */ + if (!(iter->flags & IOMAP_UNSHARE) && from <= poff && + to >= poff + plen) continue; if (iomap_block_needs_zeroing(iter, block_start)) { @@ -742,7 +813,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, status = write_ops->read_folio_range(iter, folio, block_start, plen); else - status = iomap_read_folio_range(iter, + status = iomap_bio_read_folio_range_sync(iter, folio, block_start, plen); if (status) return status; @@ -761,6 +832,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + if (iter->fbatch) { + struct folio *folio = folio_batch_next(iter->fbatch); + + if (!folio) + return NULL; + + /* + * The folio mapping generally shouldn't have changed based on + * fs locks, but be consistent with filemap lookup and retry + * the iter if it does. + */ + folio_lock(folio); + if (unlikely(folio->mapping != iter->inode->i_mapping)) { + iter->iomap.flags |= IOMAP_F_STALE; + folio_unlock(folio); + return NULL; + } + + folio_get(folio); + return folio; + } + if (write_ops && write_ops->get_folio) return write_ops->get_folio(iter, pos, len); return iomap_get_folio(iter, pos, len); @@ -815,15 +908,14 @@ static int iomap_write_begin(struct iomap_iter *iter, size_t *poffset, u64 *plen) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; + loff_t pos; u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); struct folio *folio; int status = 0; len = min_not_zero(len, *plen); - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); - if (srcmap != &iter->iomap) - BUG_ON(pos + len > srcmap->offset + srcmap->length); + *foliop = NULL; + *plen = 0; if (fatal_signal_pending(current)) return -EINTR; @@ -833,6 +925,15 @@ static int iomap_write_begin(struct iomap_iter *iter, return PTR_ERR(folio); /* + * No folio means we're done with a batch. We still have range to + * process so return and let the caller iterate and refill the batch. + */ + if (!folio) { + WARN_ON_ONCE(!iter->fbatch); + return 0; + } + + /* * Now we have a locked folio, before we do anything with it we need to * check that the iomap we have cached is not stale. The inode extent * mapping can change due to concurrent IO in flight (e.g. @@ -852,6 +953,22 @@ static int iomap_write_begin(struct iomap_iter *iter, } } + /* + * The folios in a batch may not be contiguous. If we've skipped + * forward, advance the iter to the pos of the current folio. If the + * folio starts beyond the end of the mapping, it may have been trimmed + * since the lookup for whatever reason. Return a NULL folio to + * terminate the op. 
+ */ + if (folio_pos(folio) > iter->pos) { + len = min_t(u64, folio_pos(folio) - iter->pos, + iomap_length(iter)); + status = iomap_iter_advance(iter, len); + len = iomap_length(iter); + if (status || !len) + goto out_unlock; + } + pos = iomap_trim_folio_range(iter, folio, poffset, &len); if (srcmap->type == IOMAP_INLINE) @@ -1041,7 +1158,7 @@ retry: } } else { total_written += written; - iomap_iter_advance(iter, &written); + iomap_iter_advance(iter, written); } } while (iov_iter_count(i) && iomap_length(iter)); @@ -1082,7 +1199,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { - unsigned int first_blk, last_blk, i; + unsigned int first_blk, last_blk; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; @@ -1101,10 +1218,11 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, folio_pos(folio) + folio_size(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; - for (i = first_blk; i <= last_blk; i++) { - if (!ifs_block_is_dirty(folio, ifs, i)) - punch(inode, folio_pos(folio) + (i << blkbits), - 1 << blkbits, iomap); + while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk)) + <= last_blk) { + punch(inode, folio_pos(folio) + (first_blk << blkbits), + 1 << blkbits, iomap); + first_blk++; } } @@ -1310,7 +1428,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter, int status; if (!iomap_want_unshare_iter(iter)) - return iomap_iter_advance(iter, &bytes); + return iomap_iter_advance(iter, bytes); do { struct folio *folio; @@ -1334,10 +1452,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter, balance_dirty_pages_ratelimited(iter->inode->i_mapping); - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); return status; } @@ -1398,6 +1516,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (iter->iomap.flags & IOMAP_F_STALE) break; + /* a NULL folio means we're done with a folio batch */ + if (!folio) { + status = iomap_iter_advance_full(iter); + break; + } + /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); @@ -1412,16 +1536,36 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (WARN_ON_ONCE(!ret)) return -EIO; - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; return status; } +loff_t +iomap_fill_dirty_folios( + struct iomap_iter *iter, + loff_t offset, + loff_t length) +{ + struct address_space *mapping = iter->inode->i_mapping; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + length - 1) >> PAGE_SHIFT; + + iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL); + if (!iter->fbatch) + return offset + length; + folio_batch_init(iter->fbatch); + + filemap_get_folios_dirty(mapping, &start, end, iter->fbatch); + return (start << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); + int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, @@ -1435,46 +1579,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, .private = private, }; struct 
address_space *mapping = inode->i_mapping; - unsigned int blocksize = i_blocksize(inode); - unsigned int off = pos & (blocksize - 1); - loff_t plen = min_t(loff_t, len, blocksize - off); int ret; bool range_dirty; /* - * Zero range can skip mappings that are zero on disk so long as - * pagecache is clean. If pagecache was dirty prior to zero range, the - * mapping converts on writeback completion and so must be zeroed. - * - * The simplest way to deal with this across a range is to flush - * pagecache and process the updated mappings. To avoid excessive - * flushing on partial eof zeroing, special case it to zero the - * unaligned start portion if already dirty in pagecache. - */ - if (off && - filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { - iter.len = plen; - while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_zero_iter(&iter, did_zero, - write_ops); - - iter.len = len - (iter.pos - pos); - if (ret || !iter.len) - return ret; - } - - /* * To avoid an unconditional flush, check pagecache state and only flush * if dirty and the fs returns a mapping that might convert on * writeback. */ - range_dirty = filemap_range_needs_writeback(inode->i_mapping, - iter.pos, iter.pos + iter.len - 1); + range_dirty = filemap_range_needs_writeback(mapping, iter.pos, + iter.pos + iter.len - 1); while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (srcmap->type == IOMAP_HOLE || - srcmap->type == IOMAP_UNWRITTEN) { + if (WARN_ON_ONCE(iter.fbatch && + srcmap->type != IOMAP_UNWRITTEN)) + return -EIO; + + if (!iter.fbatch && + (srcmap->type == IOMAP_HOLE || + srcmap->type == IOMAP_UNWRITTEN)) { s64 status; if (range_dirty) { @@ -1526,7 +1650,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, @@ -1559,16 +1683,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len) +static void iomap_writeback_init(struct inode *inode, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + if (ifs) { + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set this to the folio size. After processing the folio for + * writeback in iomap_writeback_folio(), we'll subtract any + * ranges not written back. + * + * We do this because otherwise, we would have to atomically + * increment ifs->write_bytes_pending every time a range in the + * folio needs to be written back. + */ + atomic_set(&ifs->write_bytes_pending, folio_size(folio)); + } } -EXPORT_SYMBOL_GPL(iomap_start_folio_write); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) @@ -1585,7 +1718,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write); static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, u32 rlen, u64 end_pos, - bool *wb_pending) + size_t *bytes_submitted) { do { ssize_t ret; @@ -1599,11 +1732,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, pos += ret; /* - * Holes are not be written back by ->writeback_range, so track + * Holes are not written back by ->writeback_range, so track * if we did handle anything that is not a hole here. 
*/ if (wpc->iomap.type != IOMAP_HOLE) - *wb_pending = true; + *bytes_submitted += ret; } while (rlen); return 0; @@ -1674,7 +1807,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - bool wb_pending = false; + size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1694,14 +1827,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) iomap_set_range_dirty(folio, 0, end_pos - pos); } - /* - * Keep the I/O completion handler from clearing the writeback - * bit until we have submitted all blocks by adding a bias to - * ifs->write_bytes_pending, which is dropped after submitting - * all blocks. - */ - WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - iomap_start_folio_write(inode, folio, 1); + iomap_writeback_init(inode, folio); } /* @@ -1716,13 +1842,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, - &wb_pending); + &bytes_submitted); if (error) break; pos += rlen; } - if (wb_pending) + if (bytes_submitted) wpc->nr_folios++; /* @@ -1740,12 +1866,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); - } else { - if (!wb_pending) - folio_end_writeback(folio); + /* + * Subtract any bytes that were initially accounted to + * write_bytes_pending but skipped for writeback. + */ + size_t bytes_not_submitted = folio_size(folio) - + bytes_submitted; + + if (bytes_not_submitted) + iomap_finish_folio_write(inode, folio, + bytes_not_submitted); + } else if (!bytes_submitted) { + folio_end_writeback(folio); } + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 5d5d63efbd57..8e273408453a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -16,21 +16,13 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_NO_INVALIDATE (1U << 25) -#define IOMAP_DIO_CALLER_COMP (1U << 26) -#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_NO_INVALIDATE (1U << 26) +#define IOMAP_DIO_COMP_WORK (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) #define IOMAP_DIO_NEED_SYNC (1U << 29) #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) -/* - * Used for sub block zeroing in iomap_dio_zero() - */ -#define IOMAP_ZERO_PAGE_SIZE (SZ_64K) -#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE)) -static struct page *zero_page; - struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; @@ -140,11 +132,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); -static ssize_t iomap_dio_deferred_complete(void *data) -{ - return iomap_dio_complete(data); -} - static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -179,33 +166,33 @@ static void iomap_dio_done(struct iomap_dio *dio) WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { - WRITE_ONCE(iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } else 
if (dio->flags & IOMAP_DIO_CALLER_COMP) { - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then - * schedule our completion that way to avoid an async punt to a - * workqueue. - */ - /* only polled IO cares about private cleared */ - iocb->private = dio; - iocb->dio_complete = iomap_dio_deferred_complete; + return; + } - /* - * Invoke ->ki_complete() directly. We've assigned our - * dio_complete callback handler, and since the issuer set - * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will - * notice ->dio_complete being set and will defer calling that - * handler until it can be done from a safe task context. - * - * Note that the 'res' being passed in here is not important - * for this case. The actual completion value of the request - * will be gotten from dio_complete when that is run by the - * issuer. - */ - iocb->ki_complete(iocb, 0); - } else { + /* + * Always run error completions in user context. These are not + * performance critical and some code relies on taking sleeping locks + * for error handling. + */ + if (dio->error) + dio->flags |= IOMAP_DIO_COMP_WORK; + + /* + * Never invalidate pages from this context to avoid deadlocks with + * buffered I/O completions when called from the ioend workqueue, + * or avoid sleeping when called directly from ->bi_end_io. + * Tough luck if you hit the tiny race with someone dirtying the range + * right between this check and the actual completion. + */ + if ((dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_COMP_WORK)) { + if (dio->iocb->ki_filp->f_mapping->nrpages) + dio->flags |= IOMAP_DIO_COMP_WORK; + else + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + + if (dio->flags & IOMAP_DIO_COMP_WORK) { struct inode *inode = file_inode(iocb->ki_filp); /* @@ -216,7 +203,11 @@ static void iomap_dio_done(struct iomap_dio *dio) */ INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + return; } + + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); } void iomap_dio_bio_end_io(struct bio *bio) @@ -252,16 +243,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) /* * Try to avoid another context switch for the completion given * that we are already called from the ioend completion - * workqueue, but never invalidate pages from this thread to - * avoid deadlocks with buffered I/O completions. Tough luck if - * you hit the tiny race with someone dirtying the range now - * between this check and the actual completion. + * workqueue. */ - if (!dio->iocb->ki_filp->f_mapping->nrpages) { - dio->flags |= IOMAP_DIO_INLINE_COMP; - dio->flags |= IOMAP_DIO_NO_INVALIDATE; - } - dio->flags &= ~IOMAP_DIO_CALLER_COMP; + dio->flags &= ~IOMAP_DIO_COMP_WORK; iomap_dio_done(dio); } @@ -285,42 +269,36 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, { struct inode *inode = file_inode(dio->iocb->ki_filp); struct bio *bio; + struct folio *zero_folio = largest_zero_folio(); + int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio)); if (!len) return 0; + /* - * Max block size supported is 64k + * This limit shall never be reached as most filesystems have a + * maximum blocksize of 64k. 
 */ - if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE)) + if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS)) return -EINVAL; - bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); + bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - __bio_add_page(bio, zero_page, len, 0); + while (len > 0) { + unsigned int io_len = min(len, folio_size(zero_folio)); + + bio_add_folio_nofail(bio, zero_folio, io_len, 0); + len -= io_len; + } iomap_dio_submit_bio(iter, dio, bio, pos); - return 0; -} -/* - * Use a FUA write if we need datasync semantics and this is a pure data I/O - * that doesn't require any metadata updates (including after I/O completion - * such as unwritten extent conversion) and the underlying device either - * doesn't have a volatile write cache or supports FUA. - * This allows us to avoid cache flushes on I/O completion. - */ -static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, - struct iomap_dio *dio) -{ - if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) - return false; - if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) - return false; - return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); + return 0; } static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) @@ -336,12 +314,39 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) int nr_pages, ret = 0; u64 copied = 0; size_t orig_count; + unsigned int alignment; + + /* + * File systems that write out of place and always allocate new blocks + * need each bio to be block aligned as that's the unit of allocation. + */ + if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED) + alignment = fs_block_size; + else + alignment = bdev_logical_block_size(iomap->bdev); - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) + if ((pos | length) & (alignment - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { - bio_opf |= REQ_OP_WRITE; + bool need_completion_work = true; + + switch (iomap->type) { + case IOMAP_MAPPED: + /* + * Directly mapped I/O does not inherently need to do + * work at I/O completion time. But there are various + * cases below where this will get set again. + */ + need_completion_work = false; + break; + case IOMAP_UNWRITTEN: + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + default: + break; + } if (iomap->flags & IOMAP_F_ATOMIC_BIO) { /* @@ -354,35 +359,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio_opf |= REQ_ATOMIC; } - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } - - if (iomap->flags & IOMAP_F_SHARED) + if (iomap->flags & IOMAP_F_SHARED) { + /* + * Unsharing needs to update metadata at I/O + * completion time. + */ + need_completion_work = true; dio->flags |= IOMAP_DIO_COW; + } - if (iomap->flags & IOMAP_F_NEW) + if (iomap->flags & IOMAP_F_NEW) { + /* + * Newly allocated blocks might need recording in + * metadata at I/O completion time.
+ */ + need_completion_work = true; need_zeroout = true; - else if (iomap->type == IOMAP_MAPPED && - iomap_dio_can_use_fua(iomap, dio)) - bio_opf |= REQ_FUA; + } - if (!(bio_opf & REQ_FUA)) - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + /* + * Use a FUA write if we need datasync semantics and this is a + * pure overwrite that doesn't require any metadata updates. + * + * This allows us to avoid cache flushes on I/O completion. + */ + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) { + if (!need_completion_work && + !(iomap->flags & IOMAP_F_DIRTY) && + (!bdev_write_cache(iomap->bdev) || + bdev_fua(iomap->bdev))) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } /* - * We can only do deferred completion for pure overwrites that + * We can only do inline completion for pure overwrites that * don't require additional I/O at completion time. * - * This rules out writes that need zeroing or extent conversion, - * extend the file size, or issue metadata I/O or cache flushes - * during completion processing. + * This rules out writes that need zeroing or metadata updates to + * convert unwritten or shared extents. + * + * Writes that extend i_size are also not supported, but this is + * handled in __iomap_dio_rw(). */ - if (need_zeroout || (pos >= i_size_read(inode)) || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && - !(bio_opf & REQ_FUA))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; + if (need_completion_work) + dio->flags |= IOMAP_DIO_COMP_WORK; + + bio_opf |= REQ_OP_WRITE; } else { bio_opf |= REQ_OP_READ; } @@ -403,7 +427,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. */ - if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) + if (dio->flags & IOMAP_DIO_COMP_WORK) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -434,7 +458,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_end_io = iomap_dio_bio_end_io; ret = bio_iov_iter_get_pages(bio, dio->submit.iter, - bdev_logical_block_size(iomap->bdev) - 1); + alignment - 1); if (unlikely(ret)) { /* * We have to stop part way through an IO.
We must fall @@ -496,7 +520,7 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return iomap_iter_advance(iter, &copied); + return iomap_iter_advance(iter, copied); return ret; } @@ -507,7 +531,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) dio->size += length; if (!length) return -EFAULT; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) @@ -542,7 +566,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) dio->size += copied; if (!copied) return -EFAULT; - return iomap_iter_advance(iomi, &copied); + return iomap_iter_advance(iomi, copied); } static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) @@ -639,10 +663,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iov_iter_rw(iter) == READ) { - /* reads can always complete inline */ - dio->flags |= IOMAP_DIO_INLINE_COMP; + if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) + dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -656,15 +680,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; - /* - * Flag as supporting deferred completions, if the issuer - * groks it. This can avoid a workqueue punt for writes. - * We may later clear this flag if we need to do other IO - * as part of this IO completion. - */ - if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) - dio->flags |= IOMAP_DIO_CALLER_COMP; - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -694,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* + * i_size updates must happen from process context. + */ + if (iomi.pos + iomi.len > dio->i_size) + dio->flags |= IOMAP_DIO_COMP_WORK; + + /* * Try to invalidate cache pages for the range we are writing. * If this invalidation fails, let the caller fall back to * buffered I/O. @@ -717,12 +738,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } goto out_free_dio; } + } - if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - goto out_free_dio; - } + if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) - goto out_free_dio; } inode_dio_begin(inode); @@ -765,9 +786,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * If all the writes we issued were already written through to the * media, we don't need to flush the cache on IO completion. Clear the * sync flag for this case. + * + * Otherwise clear the inline completion flag if any sync work is + * needed, as that needs to be performed from process context.
*/ if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags |= IOMAP_DIO_COMP_WORK; /* * We are about to drop our additional submission reference, which @@ -825,15 +851,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); - -static int __init iomap_dio_init(void) -{ - zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - IOMAP_ZERO_PAGE_ORDER); - - if (!zero_page) - return -ENOMEM; - - return 0; -} -fs_initcall(iomap_dio_init); diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index d05cb3aed96e..3a4e4aad2bd1 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -6,4 +6,16 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); +#ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len); +#else +static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + WARN_ON_ONCE(1); + return -EIO; +} +#endif /* CONFIG_BLOCK */ + #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..86f44922ed3b 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -194,8 +194,6 @@ new_ioend: if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; - iomap_start_folio_write(wpc->inode, folio, map_len); - /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index cef77ca0c20b..8692e5e41c6d 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -8,22 +8,24 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { + if (iter->fbatch) { + folio_batch_release(iter->fbatch); + kfree(iter->fbatch); + iter->fbatch = NULL; + } + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); } -/* - * Advance the current iterator position and output the length remaining for the - * current mapping. 
- */ -int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +/* Advance the current iterator position and decrement the remaining length */ +int iomap_iter_advance(struct iomap_iter *iter, u64 count) { - if (WARN_ON_ONCE(*count > iomap_length(iter))) + if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; - iter->pos += *count; - iter->len -= *count; - *count = iomap_length(iter); + iter->pos += count; + iter->len -= count; return 0; } diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 56db2dd4b10d..6cbc587c93da 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } } @@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter, switch (iter->iomap.type) { case IOMAP_HOLE: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; default: *hole_pos = iter->pos; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index a61c1dae4742..532787277b16 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_DIO_STRINGS \ - {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ - {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ - {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \ + {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6f0e6b19383c..b7cbe126faf3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); + if (!opt->blocksize) { + printk(KERN_ERR + "ISOFS: unable to set blocksize\n"); + goto out_freesbi; + } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; @@ -1515,7 +1520,7 @@ struct inode *__isofs_iget(struct super_block *sb, if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d175cccb7c55..764bba8ba999 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -265,7 +265,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; f = JFFS2_INODE_INFO(inode); @@ -373,7 +373,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) { struct iattr iattr; - if (!(inode->i_state & I_DIRTY_DATASYNC)) { + if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { jffs2_dbg(2, "%s(): not 
calling setattr() for ino #%lu\n", __func__, inode->i_ino); return; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2a4a288b821c..87ad042221e7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -26,8 +26,8 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; inode_lock(inode); - if (!(inode->i_state & I_DIRTY_ALL) || - (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + if (!(inode_state_read_once(inode) & I_DIRTY_ALL) || + (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); inode_unlock(inode); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 21f3d029da7d..4709762713ef 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -29,7 +29,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ret = diRead(inode); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 7840a03e5bcb..c16578af3a77 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1287,7 +1287,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern */ - if (tblk->u.ip->i_state & I_SYNC) + if (inode_state_read_once(tblk->u.ip) & I_SYNC) tblk->xflag &= ~COMMIT_LAZY; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 457f91c412d4..a36aaee98dce 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -251,7 +251,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); - if (inode && (inode->i_state & I_NEW)) + if (inode && (inode_state_read_once(inode) & I_NEW)) kernfs_init_inode(kn, inode); return inode; diff --git a/fs/libfs.c b/fs/libfs.c index 4bb4d8a313e7..1661dcb7d983 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1543,9 +1543,9 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); @@ -1665,7 +1665,7 @@ struct inode *alloc_anon_inode(struct super_block *s) * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ - inode->i_state = I_DIRTY; + inode_state_assign_raw(inode, I_DIRTY); /* * Historically anonymous inodes don't have a type at all and * userspace has come to rely on this. 
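Many of the per-filesystem hunks above and below are a mechanical conversion from direct inode->i_state access to accessor helpers: inode_state_read_once() for lockless snapshots, inode_state_read()/inode_state_set()/inode_state_clear() under inode->i_lock, and inode_state_assign_raw() for inodes not yet visible to other threads. The helper definitions live outside fs/ and are not shown in this diff; the sketch below is only a guess at their likely shape, inferred from how the call sites use them, and the exact field type and lockdep annotations are assumptions:

/* Hypothetical sketch, not taken from this diff. */
static inline unsigned long inode_state_read_once(struct inode *inode)
{
	/* Lockless, possibly stale snapshot of the state flags. */
	return READ_ONCE(inode->i_state);
}

static inline unsigned long inode_state_read(struct inode *inode)
{
	/* Stable read; caller is expected to hold inode->i_lock. */
	lockdep_assert_held(&inode->i_lock);
	return inode->i_state;
}

static inline void inode_state_set(struct inode *inode, unsigned long flags)
{
	lockdep_assert_held(&inode->i_lock);
	WRITE_ONCE(inode->i_state, inode->i_state | flags);
}

static inline void inode_state_clear(struct inode *inode, unsigned long flags)
{
	lockdep_assert_held(&inode->i_lock);
	WRITE_ONCE(inode->i_state, inode->i_state & ~flags);
}

static inline void inode_state_assign_raw(struct inode *inode, unsigned long state)
{
	/* For inodes nobody else can see yet, e.g. in alloc_anon_inode(). */
	inode->i_state = state;
}

Read this way, the conversion does not change behavior at the call sites; it mainly makes the locking expectation of each i_state access explicit, with unlocked peeks (iget I_NEW tests, fsync dirty checks) going through the _read_once variant and locked read-modify-write sequences keeping inode->i_lock.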
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 32db676127a9..f220d0e4aedf 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -589,7 +589,7 @@ struct inode *minix_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; if (INODE_VERSION(inode) == MINIX_V1) diff --git a/fs/namei.c b/fs/namei.c index 7377020a2cba..eaeb9278ffee 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -282,7 +282,7 @@ void putname(struct filename *name) return; refcnt = atomic_read(&name->refcnt); - if (refcnt != 1) { + if (unlikely(refcnt != 1)) { if (WARN_ON_ONCE(!refcnt)) return; @@ -290,7 +290,7 @@ void putname(struct filename *name) return; } - if (name->name != name->iname) { + if (unlikely(name->name != name->iname)) { __putname(name->name); kfree(name); } else @@ -540,10 +540,13 @@ static inline int do_inode_permission(struct mnt_idmap *idmap, * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ @@ -574,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap, if (unlikely(retval)) return retval; - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { /* * Nobody gets write access to an immutable file. */ @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap, } EXPORT_SYMBOL(inode_permission); +/* + * lookup_inode_permission_may_exec - Check traversal right for given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant the inode is a directory + * + * Since majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. 
+ */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); +} + /** * path_get - get a reference to a path * @path: path to get the reference to @@ -746,7 +785,8 @@ static void leave_rcu(struct nameidata *nd) static void terminate_walk(struct nameidata *nd) { - drop_links(nd); + if (unlikely(nd->depth)) + drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); @@ -843,7 +883,7 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; @@ -878,7 +918,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { @@ -951,8 +991,8 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->state & ND_ROOT_PRESET)) - if (!(nd->flags & LOOKUP_IS_SCOPED)) + if (likely(!(nd->state & ND_ROOT_PRESET))) + if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) @@ -1034,7 +1074,7 @@ static int nd_jump_root(struct nameidata *nd) } if (!nd->root.mnt) { int error = set_root(nd); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) { @@ -1632,13 +1672,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; + if (likely(!d_managed(dentry))) + return 0; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; - if (!try_to_unlazy_next(nd, dentry)) + if (unlikely(!try_to_unlazy_next(nd, dentry))) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); @@ -1823,7 +1865,7 @@ again: return dentry; } -static struct dentry *lookup_slow(const struct qstr *name, +static noinline struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { @@ -1855,7 +1897,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, int err, mask; mask = nd->flags & LOOKUP_RCU ? 
MAY_NOT_BLOCK : 0; - err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); if (likely(!err)) return 0; @@ -1870,7 +1912,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, if (err != -ECHILD) // hard error return err; - return inode_permission(idmap, nd->inode, MAY_EXEC); + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } static int reserve_stack(struct nameidata *nd, struct path *link) @@ -1901,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; -static const char *pick_link(struct nameidata *nd, struct path *link, +static noinline const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link); + int error; + + if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + } else { + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); @@ -1981,14 +2033,15 @@ all_done: // pure jump * * NOTE: dentry must be what nd->next_seq had been sampled from. */ -static const char *step_into(struct nameidata *nd, int flags, +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; - int err = handle_mounts(nd, dentry, &path); + int err; - if (err < 0) + err = handle_mounts(nd, dentry, &path); + if (unlikely(err < 0)) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || @@ -2010,15 +2063,32 @@ static const char *step_into(struct nameidata *nd, int flags, nd->seq = nd->next_seq; return NULL; } - if (nd->flags & LOOKUP_RCU) { - /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return pick_link(nd, &path, inode, flags); +} + +static __always_inline const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry) +{ + /* + * In the common case we are in rcu-walk and traversing over a non-mounted on + * directory (as opposed to e.g., a symlink). + * + * We can handle that and negative entries with the checks below. 
+ */ + if (likely((nd->flags & LOOKUP_RCU) && + !d_managed(dentry) && !d_is_symlink(dentry))) { + struct inode *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); - } else { - if (path.mnt == nd->path.mnt) - mntget(path.mnt); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + nd->path.dentry = dentry; + /* nd->path.mnt is retained on purpose */ + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; } - return pick_link(nd, &path, inode, flags); + return step_into_slowpath(nd, flags, dentry); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) @@ -2101,7 +2171,7 @@ static const char *handle_dots(struct nameidata *nd, int type) if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) @@ -2131,7 +2201,7 @@ static const char *handle_dots(struct nameidata *nd, int type) return NULL; } -static const char *walk_component(struct nameidata *nd, int flags) +static __always_inline const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* @@ -2140,7 +2210,7 @@ static const char *walk_component(struct nameidata *nd, int flags) * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return handle_dots(nd, nd->last_type); } @@ -2152,7 +2222,7 @@ static const char *walk_component(struct nameidata *nd, int flags) if (IS_ERR(dentry)) return ERR_CAST(dentry); } - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return step_into(nd, flags, dentry); } @@ -2505,7 +2575,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ - if (!depth) { + if (likely(!depth)) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; @@ -2543,10 +2613,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ - if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) return ERR_PTR(-EAGAIN); - if (!*s) + if (unlikely(!*s)) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); @@ -2560,7 +2630,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (nd->state & ND_ROOT_PRESET) { + if (unlikely(nd->state & ND_ROOT_PRESET)) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2579,7 +2649,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ - if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); @@ -2632,7 +2702,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } /* For scoped-lookups we need to set the root to the dirfd as well. 
*/ - if (flags & LOOKUP_IS_SCOPED) { + if (unlikely(flags & LOOKUP_IS_SCOPED)) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; @@ -4036,7 +4106,7 @@ int vfs_tmpfile(struct mnt_idmap *idmap, inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); @@ -4931,7 +5001,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; @@ -4941,9 +5011,9 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, error = dir->i_op->link(old_dentry, dir, new_dentry); } - if (!error && (inode->i_state & I_LINKABLE)) { + if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_LINKABLE; + inode_state_clear(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } inode_unlock(inode); diff --git a/fs/namespace.c b/fs/namespace.c index 25289b869be1..d7afac807ac3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) -{ - struct ns_common *ns; - - if (!node) - return NULL; - ns = rb_entry(node, struct ns_common, ns_tree_node.ns_node); - return container_of(ns, struct mnt_namespace, ns); -} - static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ @@ -151,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns) kfree(ns); } } -DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, + if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { @@ -1345,26 +1336,12 @@ static void delayed_mntput(struct work_struct *unused) } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); -static void mntput_no_expire(struct mount *mnt) +static void noinline mntput_no_expire_slowpath(struct mount *mnt) { LIST_HEAD(list); int count; - rcu_read_lock(); - if (likely(READ_ONCE(mnt->mnt_ns))) { - /* - * Since we don't do lock_mount_hash() here, - * ->mnt_ns can change under us. However, if it's - * non-NULL, then there's a reference that won't - * be dropped until after an RCU delay done after - * turning ->mnt_ns NULL. So if we observe it - * non-NULL under rcu_read_lock(), the reference - * we are dropping is not the final one. - */ - mnt_add_count(mnt, -1); - rcu_read_unlock(); - return; - } + VFS_BUG_ON(mnt->mnt_ns); lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab @@ -1415,6 +1392,26 @@ static void mntput_no_expire(struct mount *mnt) cleanup_mnt(mnt); } +static void mntput_no_expire(struct mount *mnt) +{ + rcu_read_lock(); + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. 
So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); + rcu_read_unlock(); + return; + } + mntput_no_expire_slowpath(mnt); +} + void mntput(struct vfsmount *mnt) { if (mnt) { @@ -5455,11 +5452,11 @@ static int statmount_string(struct kstatmount *s, u64 flag) ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: - sm->mnt_uidmap = start; + offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: - sm->mnt_gidmap = start; + offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: @@ -5737,7 +5734,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->spare != 0) + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5754,16 +5751,14 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq { struct mnt_namespace *mnt_ns; - if (kreq->mnt_ns_id && kreq->spare) - return ERR_PTR(-EINVAL); - - if (kreq->mnt_ns_id) - return lookup_mnt_ns(kreq->mnt_ns_id); - - if (kreq->spare) { + if (kreq->mnt_ns_id) { + mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); + if (!mnt_ns) + return ERR_PTR(-ENOENT); + } else if (kreq->mnt_ns_fd) { struct ns_common *ns; - CLASS(fd, f)(kreq->spare); + CLASS(fd, f)(kreq->mnt_ns_fd); if (fd_empty(f)) return ERR_PTR(-EBADF); @@ -5775,11 +5770,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); + refcount_inc(&mnt_ns->passive); } else { mnt_ns = current->nsproxy->mnt_ns; + refcount_inc(&mnt_ns->passive); } - refcount_inc(&mnt_ns->passive); return mnt_ns; } @@ -5802,8 +5798,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + if (IS_ERR(ns)) + return PTR_ERR(ns); if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) @@ -5913,8 +5909,8 @@ static void __free_klistmount_free(const struct klistmount *kls) static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { - u64 last_mnt_id = kreq->param; + struct mnt_namespace *ns; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. 
*/ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5928,9 +5924,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req * if (!kls->kmnt_ids) return -ENOMEM; - kls->ns = grab_requested_mnt_ns(kreq); - if (!kls->ns) - return -ENOENT; + ns = grab_requested_mnt_ns(kreq); + if (IS_ERR(ns)) + return PTR_ERR(ns); + kls->ns = ns; kls->mnt_parent_id = kreq->mnt_id; return 0; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 486166460e17..3b97bc35de77 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -147,10 +147,10 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) if (!fscache_cookie_valid(cookie)) return true; - if (!(inode->i_state & I_PINNING_NETFS_WB)) { + if (!(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) { spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_NETFS_WB)) { - inode->i_state |= I_PINNING_NETFS_WB; + if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + inode_state_set(inode, I_PINNING_NETFS_WB); need_use = true; } spin_unlock(&inode->i_lock); @@ -192,7 +192,7 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux) { struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); - if (inode->i_state & I_PINNING_NETFS_WB) { + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) { loff_t i_size = i_size_read(inode); fscache_unuse_cookie(cookie, aux, &i_size); } diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 5c0dc4efc792..8e6264f62a8f 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -36,12 +36,12 @@ void netfs_single_mark_inode_dirty(struct inode *inode) mark_inode_dirty(inode); - if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) { + if (caching && !(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) { bool need_use = false; spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_NETFS_WB)) { - inode->i_state |= I_PINNING_NETFS_WB; + if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + inode_state_set(inode, I_PINNING_NETFS_WB); need_use = true; } spin_unlock(&inode->i_lock); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4e3dcc157a83..54699299d5b1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -338,6 +338,14 @@ again: /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; + if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) { + if (clp->cl_xprtsec.cert_serial != + data->xprtsec.cert_serial) + continue; + if (clp->cl_xprtsec.privkey_serial != + data->xprtsec.privkey_serial) + continue; + } refcount_inc(&clp->cl_count); return clp; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 46d9c65d50f8..ea9f6ca8f30f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2268,11 +2268,12 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { - file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); - if (error) + if (!error) { + file->f_mode |= FMODE_CREATED; + return finish_open(file, dentry, NULL); + } else if (error != -EEXIST || open_flags & O_EXCL) return error; - return finish_open(file, dentry, NULL); } if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18b57c7c2f97..f76fe406937a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -475,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) goto out_no_inode; } - if (inode->i_state & I_NEW) { + if 
(inode_state_read_once(inode) & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; @@ -718,6 +718,8 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct nfs_fattr *fattr; loff_t oldsize = i_size_read(inode); int error = 0; + kuid_t task_uid = current_fsuid(); + kuid_t owner_uid = inode->i_uid; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -739,9 +741,11 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); if (attr->ia_valid & ATTR_MTIME_SET) { - nfs_set_timestamps_to_ts(inode, attr); - attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + if (uid_eq(task_uid, owner_uid)) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| ATTR_ATIME|ATTR_ATIME_SET); + } } else { nfs_update_timestamps(inode, attr->ia_valid); attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); @@ -751,10 +755,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { if (attr->ia_valid & ATTR_ATIME_SET) { - spin_lock(&inode->i_lock); - nfs_set_timestamps_to_ts(inode, attr); - spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + if (uid_eq(task_uid, owner_uid)) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } } else { nfs_update_delegated_atime(inode); attr->ia_valid &= ~ATTR_ATIME; diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 2c0455e91571..656976b4f42c 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -42,10 +42,9 @@ struct nfs_local_kiocb { /* Begin mostly DIO-specific members */ size_t end_len; short int end_iter_index; - short int n_iters; + atomic_t n_iters; bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; - loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; - struct iov_iter iters[NFSLOCAL_MAX_IOS]; + struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned; /* End mostly DIO-specific members */ }; @@ -314,7 +313,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, init_sync_kiocb(&iocb->kiocb, file); iocb->hdr = hdr; + iocb->kiocb.ki_pos = hdr->args.offset; iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->kiocb.ki_complete = NULL; iocb->aio_complete_work = NULL; iocb->end_iter_index = -1; @@ -388,13 +389,24 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, return true; } +static void +nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec, + unsigned int nvecs, unsigned long total, + size_t start, size_t len) +{ + iov_iter_bvec(iter, rw, bvec, nvecs, total); + if (start) + iov_iter_advance(iter, start); + iov_iter_truncate(iter, len); +} + /* * Setup as many as 3 iov_iter based on extents described by @local_dio. * Returns the number of iov_iter that were setup. */ static int nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, - unsigned int nvecs, size_t len, + unsigned int nvecs, unsigned long total, struct nfs_local_dio *local_dio) { int n_iters = 0; @@ -402,39 +414,17 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, /* Setup misaligned start? 
*/ if (local_dio->start_len) { - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - iters[n_iters].count = local_dio->start_len; - iocb->offset[n_iters] = iocb->hdr->args.offset; - iocb->iter_is_dio_aligned[n_iters] = false; + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, 0, local_dio->start_len); ++n_iters; } - /* Setup misaligned end? - * If so, the end is purposely setup to be issued using buffered IO - * before the middle (which will use DIO, if DIO-aligned, with AIO). - * This creates problems if/when the end results in a partial write. - * So must save index and length of end to handle this corner case. - */ - if (local_dio->end_len) { - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - iocb->offset[n_iters] = local_dio->end_offset; - iov_iter_advance(&iters[n_iters], - local_dio->start_len + local_dio->middle_len); - iocb->iter_is_dio_aligned[n_iters] = false; - /* Save index and length of end */ - iocb->end_iter_index = n_iters; - iocb->end_len = local_dio->end_len; - ++n_iters; - } - - /* Setup DIO-aligned middle to be issued last, to allow for - * DIO with AIO completion (see nfs_local_call_{read,write}). + /* + * Setup DIO-aligned middle, if there is no misaligned end (below) + * then AIO completion is used, see nfs_local_call_{read,write} */ - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - if (local_dio->start_len) - iov_iter_advance(&iters[n_iters], local_dio->start_len); - iters[n_iters].count -= local_dio->end_len; - iocb->offset[n_iters] = local_dio->middle_offset; + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs, + total, local_dio->start_len, local_dio->middle_len); iocb->iter_is_dio_aligned[n_iters] = nfs_iov_iter_aligned_bvec(&iters[n_iters], @@ -442,12 +432,22 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { trace_nfs_local_dio_misaligned(iocb->hdr->inode, - iocb->hdr->args.offset, len, local_dio); + local_dio->start_len, local_dio->middle_len, local_dio); return 0; /* no DIO-aligned IO possible */ } + iocb->end_iter_index = n_iters; ++n_iters; - iocb->n_iters = n_iters; + /* Setup misaligned end? */ + if (local_dio->end_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, local_dio->start_len + + local_dio->middle_len, local_dio->end_len); + iocb->end_iter_index = n_iters; + ++n_iters; + } + + atomic_set(&iocb->n_iters, n_iters); return n_iters; } @@ -473,18 +473,26 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) } len = hdr->args.count - total; + /* + * For each iocb, iocb->n_iters is always at least 1 and we always + * end io after first nfs_local_pgio_done call unless misaligned DIO. 
+ */ + atomic_set(&iocb->n_iters, 1); + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { struct nfs_local_dio local_dio; if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && - nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) { + /* Ensure DIO WRITE's IO on stable storage upon completion */ + if (rw == ITER_SOURCE) + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; return; /* is DIO-aligned */ + } } /* Use buffered IO */ - iocb->offset[0] = hdr->args.offset; iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); - iocb->n_iters = 1; } static void @@ -504,9 +512,11 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, hdr->task.tk_start = ktime_get(); } -static void -nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) +static bool +nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force) { + struct nfs_pgio_header *hdr = iocb->hdr; + /* Must handle partial completions */ if (status >= 0) { hdr->res.count += status; @@ -517,6 +527,12 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; } + + if (force) + return true; + + BUG_ON(atomic_read(&iocb->n_iters) <= 0); + return atomic_dec_and_test(&iocb->n_iters); } static void @@ -547,11 +563,11 @@ static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) queue_work(nfsiod_workqueue, &iocb->work); } -static void -nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_read_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + long status = hdr->task.tk_status; if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ @@ -564,20 +580,27 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) */ hdr->res.replen = 0; - if (hdr->res.count != hdr->args.count || - hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) + /* nfs_readpage_result() handles short read */ + + if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; dprintk("%s: read %ld bytes eof %d.\n", __func__, status > 0 ? 
status : 0, hdr->res.eof); } +static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_read_done(iocb); + nfs_local_pgio_release(iocb); +} + static void nfs_local_read_aio_complete_work(struct work_struct *work) { struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_pgio_release(iocb); + nfs_local_read_iocb_done(iocb); } static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) @@ -585,8 +608,10 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_pgio_done(iocb->hdr, ret); - nfs_local_read_done(iocb, ret); + /* AIO completion of DIO read should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -596,32 +621,36 @@ static void nfs_local_call_read(struct work_struct *work) container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; const struct cred *save_cred; + bool force_done = false; ssize_t status; + int n_iters; save_cred = override_creds(filp->f_cred); - for (int i = 0; i < iocb->n_iters ; i++) { + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_pos = iocb->offset[i]; status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - nfs_local_pgio_done(iocb->hdr, status); - if (iocb->hdr->task.tk_status) + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial read */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_read_iocb_done(iocb); break; + } } } revert_creds(save_cred); - - if (status != -EIOCBQUEUED) { - nfs_local_read_done(iocb, status); - nfs_local_pgio_release(iocb); - } } static int @@ -736,11 +765,10 @@ static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb) fattr->du.nfs3.used = stat.blocks << 9; } -static void -nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_write_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - struct inode *inode = hdr->inode; + long status = hdr->task.tk_status; dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? 
status : 0); @@ -759,10 +787,17 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; /* record -ENOSPC in terms of nfs_local_pgio_done */ - nfs_local_pgio_done(hdr, status); + (void) nfs_local_pgio_done(iocb, status, true); } if (hdr->task.tk_status < 0) - nfs_reset_boot_verifier(inode); + nfs_reset_boot_verifier(hdr->inode); +} + +static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_write_done(iocb); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -770,8 +805,7 @@ static void nfs_local_write_aio_complete_work(struct work_struct *work) struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); + nfs_local_write_iocb_done(iocb); } static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) @@ -779,8 +813,10 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_pgio_done(iocb->hdr, ret); - nfs_local_write_done(iocb, ret); + /* AIO completion of DIO write should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -791,63 +827,40 @@ static void nfs_local_call_write(struct work_struct *work) struct file *filp = iocb->kiocb.ki_filp; unsigned long old_flags = current->flags; const struct cred *save_cred; + bool force_done = false; ssize_t status; + int n_iters; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; save_cred = override_creds(filp->f_cred); file_start_write(filp); - for (int i = 0; i < iocb->n_iters ; i++) { + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } -retry: - iocb->kiocb.ki_pos = iocb->offset[i]; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - if (unlikely(status >= 0 && status < iocb->iters[i].count)) { - /* partial write */ - if (i == iocb->end_iter_index) { - /* Must not account partial end, otherwise, due - * to end being issued before middle: the partial - * write accounting in nfs_local_write_done() - * would incorrectly advance hdr->args.offset - */ - status = 0; - } else { - /* Partial write at start or buffered middle, - * exit early. - */ - nfs_local_pgio_done(iocb->hdr, status); - break; - } - } else if (unlikely(status == -ENOTBLK && - (iocb->kiocb.ki_flags & IOCB_DIRECT))) { - /* VFS will return -ENOTBLK if DIO WRITE fails to - * invalidate the page cache. Retry using buffered IO. 
- */ - iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_complete = NULL; - iocb->aio_complete_work = NULL; - goto retry; - } - nfs_local_pgio_done(iocb->hdr, status); - if (iocb->hdr->task.tk_status) + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial write */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_write_iocb_done(iocb); break; + } } } file_end_write(filp); revert_creds(save_cred); current->flags = old_flags; - - if (status != -EIOCBQUEUED) { - nfs_local_write_done(iocb, status); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); - } } static int diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 0d7310c1ee0c..5d97c1d38bb6 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -2,6 +2,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> +#include <net/handshake.h> #include "internal.h" #include "nfs3_fs.h" #include "netns.h" @@ -98,7 +99,11 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_clp->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, }; @@ -111,9 +116,14 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_clp->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) cl_init.nconnect = mds_clp->cl_nconnect; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 5998d6bd8a4f..3a4baed993c9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -11,6 +11,7 @@ #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/handshake.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -983,7 +984,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_srv->nfs_client->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -992,9 +997,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) { cl_init.nconnect = mds_clp->cl_nconnect; cl_init.max_connect = NFS_MAX_TRANSPORTS; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 411776718494..93c6ce04332b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4715,16 +4715,19 @@ static int _nfs4_proc_lookupp(struct inode *inode, }; unsigned short task_flags = 0; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + if (server->flags & NFS_MOUNT_SOFTREVAL) task_flags |= RPC_TASK_TIMEOUT; + if (server->caps & NFS_CAP_MOVEABLE) + task_flags |= RPC_TASK_MOVEABLE; args.bitmask = 
nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, task_flags); + status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a3135b5af7ee..f157d43d1312 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -317,7 +317,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); /* Notify pnfs_destroy_layout_final() that we're done */ - if (inode->i_state & (I_FREEING | I_CLEAR)) + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7b32afb29782..9976cc16b689 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -809,8 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -834,27 +837,28 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, NULL); continue; } - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = get_v3_ds_connect(mds_srv, - &da->da_addr, - da->da_addrlen, da->da_transport, - timeo, retrans); + + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, + ds_proto, timeo, retrans); if (IS_ERR(clp)) continue; clp->cl_rpcclient->cl_softerr = 0; @@ -880,7 +884,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -908,12 +915,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - 
mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) { + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) { struct sockaddr *addr = (struct sockaddr *)&da->da_addr; struct sockaddr_in *sin = @@ -944,7 +947,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; xprt_args.servername = servername; } - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /** @@ -958,15 +964,14 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (xprtdata.cred) put_cred(xprtdata.cred); } else { - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = nfs4_set_ds_client(mds_srv, - &da->da_addr, - da->da_addrlen, - da->da_transport, timeo, - retrans, minor_version); + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = nfs4_set_ds_client(mds_srv, &da->da_addr, + da->da_addrlen, ds_proto, + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -977,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = ERR_PTR(-EIO); continue; } - } } diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 545148d42dcc..ea6e6168092b 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -189,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, return p; kobject_put(&p->kobject); + kobject_put(&p->nfs_net_kobj); } return NULL; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e466cf52d7d7..7f7e6bb23a90 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -988,10 +988,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_nf) + if (u->read.rd_nf) { + trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, + u->read.rd_offset, u->read.rd_length); nfsd_file_put(u->read.rd_nf); - trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, - u->read.rd_offset, u->read.rd_length); + } } static __be32 @@ -2892,10 +2893,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) rqstp->rq_lease_breaker = (void **)&cstate->clp; - trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt); + trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; + if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) { + /* If there are still more operations to process, + * stop here and report NFS4ERR_RESOURCE. 
*/ + if (cstate->minorversion == 0 && + args->client_opcnt > resp->opcnt) { + op->status = nfserr_resource; + goto encode_op; + } + } + /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -2972,7 +2983,7 @@ encode_op: status = op->status; } - trace_nfsd_compound_status(args->opcnt, resp->opcnt, + trace_nfsd_compound_status(args->client_opcnt, resp->opcnt, status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 81fa7cc6c77b..8a6960500217 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1542,7 +1542,8 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); - WARN_ON(!list_empty(&stid->sc_cp_list)); + if (!list_empty(&stid->sc_cp_list)) + nfs4_free_cpntf_statelist(stid->sc_client->net, stid); kmem_cache_free(stateid_slab, stid); } @@ -3486,7 +3487,20 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; - dprintk("--> %s slot %p\n", __func__, slot); + /* + * RFC 5661 Section 2.10.6.1.2: + * + * Any time SEQUENCE ... returns an error ... [t]he replier MUST NOT + * modify the reply cache entry for the slot whenever an error is + * returned from SEQUENCE ... + * + * Because nfsd4_store_cache_entry is called only by + * nfsd4_sequence_done(), nfsd4_store_cache_entry() is called only + * when a SEQUENCE operation was part of the COMPOUND. + * nfs41_check_op_ordering() ensures SEQUENCE is the first op. + */ + if (resp->opcnt == 1 && resp->cstate.status != nfs_ok) + return; slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; @@ -3902,6 +3916,7 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs ca->headerpadsz = 0; ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); + ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); @@ -4348,6 +4363,36 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, return true; } +/* + * Note that the response is constructed here both for the case + * of a new SEQUENCE request and for a replayed SEQUENCE request. + * We do not cache SEQUENCE responses as SEQUENCE is idempotent. 
+ */ +static void nfsd4_construct_sequence_response(struct nfsd4_session *session, + struct nfsd4_sequence *seq) +{ + struct nfs4_client *clp = session->se_client; + + seq->maxslots_response = max(session->se_target_maxslots, + seq->maxslots); + seq->target_maxslots = session->se_target_maxslots; + + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -4397,6 +4442,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("%s: slotid %d\n", __func__, seq->slotid); trace_nfsd_slot_seqid_sequence(clp, seq, slot); + + nfsd4_construct_sequence_response(session, seq); + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; @@ -4494,23 +4542,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } out: - seq->maxslots = max(session->se_target_maxslots, seq->maxslots); - seq->target_maxslots = session->se_target_maxslots; - - switch (clp->cl_cb_state) { - case NFSD4_CB_DOWN: - seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; - break; - case NFSD4_CB_FAULT: - seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; - break; - default: - seq->status_flags = 0; - } - if (!list_empty(&clp->cl_revoked)) - seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; - if (atomic_read(&clp->cl_admin_revoked)) - seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c0a3c6a7c8bb..67bb9c0b9fcb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2488,8 +2488,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) return false; - if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0) + if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0) return false; + argp->opcnt = min_t(u32, argp->client_opcnt, + NFSD_MAX_OPS_PER_COMPOUND); if (argp->opcnt > ARRAY_SIZE(argp->iops)) { argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops)); @@ -2628,10 +2630,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, __be32 *p; __be32 pathlen; int pathlen_offset; - int strlen, count=0; char *str, *end, *next; - - dprintk("nfsd4_encode_components(%s)\n", components); + int count = 0; pathlen_offset = xdr->buf->len; p = xdr_reserve_space(xdr, 4); @@ -2658,9 +2658,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, for (; *end && (*end != sep); end++) /* find sep or end of string */; - strlen = end - str; - if (strlen) { - if (xdr_stream_encode_opaque(xdr, str, strlen) < 0) + if (end > str) { + if (xdr_stream_encode_opaque(xdr, str, end - str) < 0) return nfserr_resource; count++; } else @@ -2939,6 +2938,12 @@ struct nfsd4_fattr_args { typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args); +static __be32 nfsd4_encode_fattr4__inval(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfserr_inval; +} + static 
__be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) { @@ -3560,6 +3565,8 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop, [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support, + [FATTR4_TIME_DELEG_ACCESS] = nfsd4_encode_fattr4__inval, + [FATTR4_TIME_DELEG_MODIFY] = nfsd4_encode_fattr4__inval, [FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments, }; @@ -5066,7 +5073,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; /* Note slotid's are numbered from zero: */ /* sr_highest_slotid */ - nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots_response - 1); if (nfserr != nfs_ok) return nfserr; /* sr_target_highest_slotid */ @@ -5918,8 +5925,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) */ warn_on_nonidempotent_op(op); xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT); - } - if (so) { + } else if (so) { int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index ea87b42894dd..b752433c3c2c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -57,6 +57,9 @@ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 200 + struct nfsd_genl_rqstp { struct sockaddr rq_daddr; struct sockaddr rq_saddr; @@ -455,6 +458,7 @@ enum { #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ FATTR4_WORD2_MODE_UMASK | \ + FATTR4_WORD2_CLONE_BLKSIZE | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT | \ FATTR4_WORD2_TIME_DELEG_ACCESS | \ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3eb724ec9566..ed85dd43da18 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -269,9 +269,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, dentry); } - fhp->fh_dentry = dentry; - fhp->fh_export = exp; - switch (fhp->fh_maxsize) { case NFS4_FHSIZE: if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) @@ -293,6 +290,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, goto out; } + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; out: exp_put(exp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9cb20d4aeab1..cf4062ac092a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1159,7 +1159,7 @@ static int wait_for_concurrent_writes(struct file *file) dprintk("nfsd: write resume %d\n", task_pid_nr(current)); } - if (inode->i_state & I_DIRTY) { + if (inode_state_read_once(inode) & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); err = vfs_fsync(file, 0); } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d4b48602b2b0..1ce8e12ae335 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -574,8 +574,9 @@ struct nfsd4_sequence { struct nfs4_sessionid sessionid; /* request/response */ u32 seqid; /* request/response */ u32 slotid; /* request/response */ - u32 maxslots; /* request/response */ + u32 maxslots; /* request */ u32 cachethis; /* request */ + u32 maxslots_response; /* response */ u32 target_maxslots; /* response */ u32 status_flags; /* response */ }; @@ -903,6 +904,7 @@ struct nfsd4_compoundargs { char * tag; u32 taglen; u32 minorversion; + u32 client_opcnt; u32 opcnt; bool splice_ok; struct nfsd4_op *ops; diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 
bcc7d76269ac..4bbdc832d7f2 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -1148,7 +1148,7 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; - if (!(cpfile->i_state & I_NEW)) + if (!(inode_state_read_once(cpfile) & I_NEW)) goto out; err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index c664daba56ae..674380837ab9 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -506,7 +506,7 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; - if (!(dat->i_state & I_NEW)) + if (!(inode_state_read_once(dat) & I_NEW)) goto out; err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index c4cd4a4dedd0..99eb8a59009e 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -188,7 +188,7 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); if (unlikely(!ifile)) return -ENOMEM; - if (!(ifile->i_state & I_NEW)) + if (!(inode_state_read_once(ifile) & I_NEW)) goto out; err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 87ddde159f0c..51bde45d5865 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -365,7 +365,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) failed_after_creation: clear_nlink(inode); - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); iput(inode); /* * raw_inode will be deleted through @@ -562,7 +562,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (!inode->i_nlink) { iput(inode); return ERR_PTR(-ESTALE); @@ -591,7 +591,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = nilfs_init_gcinode(inode); @@ -631,7 +631,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode) nilfs_iget_set, &args); if (unlikely(!btnc_inode)) return -ENOMEM; - if (btnc_inode->i_state & I_NEW) { + if (inode_state_read_once(btnc_inode) & I_NEW) { nilfs_init_btnc_inode(btnc_inode); unlock_new_inode(btnc_inode); } @@ -686,7 +686,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) nilfs_iget_set, &args); if (unlikely(!s_inode)) return ERR_PTR(-ENOMEM); - if (!(s_inode->i_state & I_NEW)) + if (!(inode_state_read_once(s_inode) & I_NEW)) return inode; NILFS_I(s_inode)->i_flags = 0; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f15ca6fc400d..deee16bc9d4e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2768,7 +2768,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (sci->sc_task) { wake_up(&sci->sc_wait_daemon); - kthread_stop(sci->sc_task); + if (kthread_stop(sci->sc_task)) { + spin_lock(&sci->sc_state_lock); + sci->sc_task = NULL; + timer_shutdown_sync(&sci->sc_timer); + spin_unlock(&sci->sc_state_lock); + } } spin_lock(&sci->sc_state_lock); diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 330f269abedf..83f93337c01b 100644 --- a/fs/nilfs2/sufile.c +++ 
b/fs/nilfs2/sufile.c @@ -1226,7 +1226,7 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; - if (!(sufile->i_state & I_NEW)) + if (!(inode_state_read_once(sufile) & I_NEW)) goto out; err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 46bfc543f946..d27ff5e5f165 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,7 +52,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 3959f23c487a..08266adc42ba 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -537,7 +537,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, return ERR_PTR(-ENOMEM); /* If this is a freshly allocated inode, need to read it now. */ - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) inode = ntfs_read_mft(inode, name, ref); else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { /* diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 92a6149da9c1..619ff03b15d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2487,7 +2487,7 @@ update: * which hasn't been populated yet, so clear the refresh flag * and let the caller handle it. */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { status = 0; if (lockres) ocfs2_complete_lock_res_refresh(lockres, 0); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index fcc89856ab95..78f81950c9ee 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -152,8 +152,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, mlog_errno(PTR_ERR(inode)); goto bail; } - trace_ocfs2_iget5_locked(inode->i_state); - if (inode->i_state & I_NEW) { + trace_ocfs2_iget5_locked(inode_state_read_once(inode)); + if (inode_state_read_once(inode) & I_NEW) { rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } @@ -1290,6 +1290,8 @@ static void ocfs2_clear_inode(struct inode *inode) void ocfs2_evict_inode(struct inode *inode) { + write_inode_now(inode, 1); + if (!inode->i_nlink || (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); @@ -1299,27 +1301,6 @@ void ocfs2_evict_inode(struct inode *inode) ocfs2_clear_inode(inode); } -/* Called under inode_lock, with no more references on the - * struct inode, so it's safe here to check the flags field - * and to manipulate i_nlink without any other locks. */ -int ocfs2_drop_inode(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, - inode->i_nlink, oi->ip_flags); - - assert_spin_locked(&inode->i_lock); - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode->i_lock); - write_inode_now(inode, 1); - spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_NEW); - inode->i_state &= ~I_WILL_FREE; - - return 1; -} - /* * This is called from our getattr. 
*/ diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index accf03d4765e..07bd838e7843 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -116,7 +116,6 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) } void ocfs2_evict_inode(struct inode *inode); -int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5f58ff2175f..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -902,15 +902,8 @@ bail: static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 54ed1495de9a..4b32fb5658ad 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1569,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); -DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); - TRACE_EVENT(ocfs2_inode_revalidate, TP_PROTO(void *inode, unsigned long long ino, unsigned int flags), diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 53daa4482406..2c7ba1480f7a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -129,7 +129,7 @@ static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, .free_inode = ocfs2_free_inode, - .drop_inode = ocfs2_drop_inode, + .drop_inode = inode_just_drop, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 135c49c5d848..db80af312678 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -212,7 +212,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; bh = omfs_bread(inode->i_sb, ino); diff --git a/fs/open.c b/fs/open.c index 3d64372ecc67..e3c737ede4e4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -940,7 +940,7 @@ static int do_dentry_open(struct file *f, } error = security_file_open(f); - if (error) + if (unlikely(error)) goto cleanup_all; /* @@ -950,11 +950,11 @@ static int do_dentry_open(struct file *f, * pseudo file, this call will not change the mode. 
*/ error = fsnotify_open_perm_and_set_mode(f); - if (error) + if (unlikely(error)) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); - if (error) + if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 26ecda0e4d19..fb8d84bdedfb 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -236,7 +236,7 @@ found: mutex_unlock(&op_mutex); if (IS_ERR(inode)) return ERR_CAST(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { simple_inode_init_ts(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a01400cd41fd..d7275990ffa4 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -878,7 +878,9 @@ int orangefs_update_time(struct inode *inode, int flags) gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - flags = generic_update_time(inode, flags); + + flags = inode_update_timestamps(inode, flags); + memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; @@ -1041,7 +1043,7 @@ struct inode *orangefs_iget(struct super_block *sb, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 0fdceb00ca07..9ab1119ebd28 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -247,7 +247,7 @@ again: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); @@ -281,13 +281,13 @@ again2: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); goto again2; } - if (inode->i_state & I_DIRTY_PAGES) { + if (inode_state_read(inode) & I_DIRTY_PAGES) { ret = 0; goto out_unlock; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index a5e9ddf3023b..83b955a1d55c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -686,7 +686,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, goto out_drop_write; spin_lock(&inode->i_lock); - inode->i_state |= I_CREATING; + inode_state_set(inode, I_CREATING); spin_unlock(&inode->i_lock); inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index e11f310ce092..1af700668089 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -1152,7 +1152,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) if (!trap) return ERR_PTR(-ENOMEM); - if (!(trap->i_state & I_NEW)) { + if (!(inode_state_read_once(trap) & I_NEW)) { /* Conflicting layer roots? 
*/ iput(trap); return ERR_PTR(-ELOOP); @@ -1243,7 +1243,7 @@ struct inode *ovl_get_inode(struct super_block *sb, inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_err; - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { /* * Verify that the underlying files stored in the inode * match those in the dentry. @@ -1303,7 +1303,7 @@ struct inode *ovl_get_inode(struct super_block *sb, if (upperdentry) ovl_check_protattr(inode, upperdentry); - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); out: return inode; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f76672f2e686..312eb4529caf 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1019,8 +1019,8 @@ bool ovl_inuse_trylock(struct dentry *dentry) bool locked = false; spin_lock(&inode->i_lock); - if (!(inode->i_state & I_OVL_INUSE)) { - inode->i_state |= I_OVL_INUSE; + if (!(inode_state_read(inode) & I_OVL_INUSE)) { + inode_state_set(inode, I_OVL_INUSE); locked = true; } spin_unlock(&inode->i_lock); @@ -1034,8 +1034,8 @@ void ovl_inuse_unlock(struct dentry *dentry) struct inode *inode = d_inode(dentry); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_OVL_INUSE)); - inode->i_state &= ~I_OVL_INUSE; + WARN_ON(!(inode_state_read(inode) & I_OVL_INUSE)); + inode_state_clear(inode, I_OVL_INUSE); spin_unlock(&inode->i_lock); } } @@ -1046,7 +1046,7 @@ bool ovl_is_inuse(struct dentry *dentry) bool inuse; spin_lock(&inode->i_lock); - inuse = (inode->i_state & I_OVL_INUSE); + inuse = (inode_state_read(inode) & I_OVL_INUSE); spin_unlock(&inode->i_lock); return inuse; @@ -1234,9 +1234,9 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work, goto err; if (trap) goto err_unlock; - if (work && work->d_parent != workdir) + if (work && (work->d_parent != workdir || d_unhashed(work))) goto err_unlock; - if (upper && upper->d_parent != upperdir) + if (upper && (upper->d_parent != upperdir || d_unhashed(upper))) goto err_unlock; return 0; diff --git a/fs/pipe.c b/fs/pipe.c index 42fead1efe52..2d0fed2ecbfd 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -908,7 +908,7 @@ static struct inode * get_pipe_inode(void) * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. */ - inode->i_state = I_DIRTY; + inode_state_assign_raw(inode, I_DIRTY); inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 176281112273..501889856461 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -698,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde) } } +static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) +{ + rb_erase(&pde->subdir_node, &parent->subdir); + RB_CLEAR_NODE(&pde->subdir_node); +} + /* * Remove a /proc entry and free it if it's not currently in use. 
*/ @@ -720,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { - rb_erase(&de->subdir_node, &parent->subdir); + pde_erase(de, parent); if (S_ISDIR(de->mode)) parent->nlink--; } @@ -764,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) root->parent->name, root->name); return -EINVAL; } - rb_erase(&root->subdir_node, &parent->subdir); + pde_erase(root, parent); de = root; while (1) { @@ -776,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) next->parent->name, next->name); return -EINVAL; } - rb_erase(&next->subdir_node, &de->subdir); + pde_erase(next, de); de = next; continue; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e399e2dd3a12..31d78da203ea 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -290,7 +290,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; qnx4_inode = qnx4_raw_inode(inode); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 3310d1ad4d0e..88d285005083 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -521,7 +521,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = QNX6_I(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6c4a6ee1fa2b..376739f6420e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1033,7 +1033,7 @@ static int add_dquot_ref(struct super_block *sb, int type) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0addcc849ff2..360b00854115 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -302,7 +302,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) if (!i) return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) + if (!(inode_state_read_once(i) & I_NEW)) return i; /* precalculate the data offset */ diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index b8ac7b7faf61..e3ea6fe7edb4 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -16,6 +16,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); static void cfids_laundromat_worker(struct work_struct *work); +static void close_cached_dir_locked(struct cached_fid *cfid); struct cached_dir_dentry { struct list_head entry; @@ -388,11 +389,11 @@ out: * lease. Release one here, and the second below. 
*/ cfid->has_lease = false; - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir_locked(cfid); } spin_unlock(&cfids->cfid_list_lock); - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir(cfid); } else { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); } @@ -438,12 +439,14 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, static void smb2_close_cached_fid(struct kref *ref) +__releases(&cfid->cfids->cfid_list_lock) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); int rc; - spin_lock(&cfid->cfids->cfid_list_lock); + lockdep_assert_held(&cfid->cfids->cfid_list_lock); + if (cfid->on_list) { list_del(&cfid->entry); cfid->on_list = false; @@ -478,15 +481,49 @@ void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir_locked(cfid); } spin_unlock(&cfid->cfids->cfid_list_lock); close_cached_dir(cfid); } - +/** + * close_cached_dir - drop a reference of a cached dir + * + * The release function will be called with cfid_list_lock held to remove the + * cached dirs from the list before any other thread can take another @cfid + * ref. Must not be called with cfid_list_lock held; use + * close_cached_dir_locked() instead. + * + * @cfid: cached dir + */ void close_cached_dir(struct cached_fid *cfid) { + lockdep_assert_not_held(&cfid->cfids->cfid_list_lock); + kref_put_lock(&cfid->refcount, smb2_close_cached_fid, &cfid->cfids->cfid_list_lock); +} + +/** + * close_cached_dir_locked - put a reference of a cached dir with + * cfid_list_lock held + * + * Calling close_cached_dir() with cfid_list_lock held can deadlock + * if the invariant of refcount >= 2 is false. + * + * This function is used in paths that hold cfid_list_lock and expect at least + * two references. If that invariant is violated, it WARNs and returns without + * dropping a reference; the final put must still go through + * close_cached_dir(). + * + * @cfid: cached dir + */ +static void close_cached_dir_locked(struct cached_fid *cfid) +{ + lockdep_assert_held(&cfid->cfids->cfid_list_lock); + + if (WARN_ON(kref_read(&cfid->refcount) < 2)) + return; + kref_put(&cfid->refcount, smb2_close_cached_fid); } @@ -596,7 +633,7 @@ cached_dir_offload_close(struct work_struct *work) WARN_ON(cfid->on_list); - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir(cfid); cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cached_close); } @@ -762,7 +799,7 @@ static void cfids_laundromat_worker(struct work_struct *work) * Drop the ref-count from above, either the lease-ref (if there * was one) or the extra one acquired. */ - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir(cfid); } queue_delayed_work(cfid_put_wq, &cfids->laundromat_work, dir_cache_timeout * HZ); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 4f959f1e08d2..1d4958ff4d4a 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -173,7 +173,7 @@ module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); module_param(enable_gcm_256, bool, 0644); -MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0"); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. 
Default: y/Y/1"); module_param(require_gcm_256, bool, 0644); MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); @@ -500,7 +500,7 @@ cifs_evict_inode(struct inode *inode) { netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_NETFS_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index fb1813cbe0eb..3528c365a452 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -616,6 +616,8 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, extern struct TCP_Server_Info * cifs_find_tcp_session(struct smb3_fs_context *ctx); +struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal); + void __cifs_put_smb_ses(struct cifs_ses *ses); extern struct cifs_ses * diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7da194f29fef..dcc50a2bfa4b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1363,6 +1363,14 @@ do_retry: if (rdata->result == -ENODATA) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + trace_smb3_read_err(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->subreq.len - rdata->subreq.transferred, + rdata->result); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && @@ -1374,6 +1382,13 @@ do_retry: } if (rdata->got_bytes) __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); + trace_smb3_read_done(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, @@ -1445,6 +1460,13 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[1].iov_base = (char *)smb + 4; rdata->iov[1].iov_len = get_rfc1002_length(smb); + trace_smb3_read_enter(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.netfid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start, rdata->subreq.len); + rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, cifs_readv_callback, NULL, rdata, 0, NULL); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dd12f3eb61dc..2f94d93b95e9 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -310,6 +310,8 @@ cifs_abort_connection(struct TCP_Server_Info *server) server->ssocket->flags); sock_release(server->ssocket); server->ssocket = NULL; + } else if (cifs_rdma_enabled(server)) { + smbd_destroy(server); } server->sequence_number = 0; server->session_estab = false; @@ -338,12 +340,6 @@ cifs_abort_connection(struct TCP_Server_Info *server) mid_execute_callback(mid); release_mid(mid); } - - if (cifs_rdma_enabled(server)) { - cifs_server_lock(server); - smbd_destroy(server); - cifs_server_unlock(server); - } } static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets) @@ -2015,39 +2011,31 @@ static int match_session(struct cifs_ses *ses, /** * cifs_setup_ipc - helper to setup the IPC tcon for the session * @ses: smb session to issue the request on - * @ctx: 
the superblock configuration context to use for building the - * new tree connection for the IPC (interprocess communication RPC) + * @seal: if encryption is requested * * A new IPC connection is made and stored in the session * tcon_ipc. The IPC tcon has the same lifetime as the session. */ -static int -cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) +struct cifs_tcon *cifs_setup_ipc(struct cifs_ses *ses, bool seal) { int rc = 0, xid; struct cifs_tcon *tcon; char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; - bool seal = false; struct TCP_Server_Info *server = ses->server; /* * If the mount request that resulted in the creation of the * session requires encryption, force IPC to be encrypted too. */ - if (ctx->seal) { - if (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) - seal = true; - else { - cifs_server_dbg(VFS, - "IPC: server doesn't support encryption\n"); - return -EOPNOTSUPP; - } + if (seal && !(server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) { + cifs_server_dbg(VFS, "IPC: server doesn't support encryption\n"); + return ERR_PTR(-EOPNOTSUPP); } /* no need to setup directory caching on IPC share, so pass in false */ tcon = tcon_info_alloc(false, netfs_trace_tcon_ref_new_ipc); if (tcon == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock(&server->srv_lock); scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname); @@ -2057,13 +2045,13 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->ses = ses; tcon->ipc = true; tcon->seal = seal; - rc = server->ops->tree_connect(xid, ses, unc, tcon, ctx->local_nls); + rc = server->ops->tree_connect(xid, ses, unc, tcon, ses->local_nls); free_xid(xid); if (rc) { - cifs_server_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); + cifs_server_dbg(VFS | ONCE, "failed to connect to IPC (rc=%d)\n", rc); tconInfoFree(tcon, netfs_trace_tcon_ref_free_ipc_fail); - goto out; + return ERR_PTR(rc); } cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid); @@ -2071,9 +2059,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) spin_lock(&tcon->tc_lock); tcon->status = TID_GOOD; spin_unlock(&tcon->tc_lock); - ses->tcon_ipc = tcon; -out: - return rc; + return tcon; } static struct cifs_ses * @@ -2347,6 +2333,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct cifs_tcon *ipc; struct cifs_ses *ses; unsigned int xid; int retries = 0; @@ -2525,7 +2512,12 @@ retry_new_session: list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - cifs_setup_ipc(ses, ctx); + ipc = cifs_setup_ipc(ses, ctx->seal); + spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); + ses->tcon_ipc = !IS_ERR(ipc) ? 
ipc : NULL; + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); free_xid(xid); @@ -4459,6 +4451,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) out: kfree(ctx->username); + kfree(ctx->domainname); kfree_sensitive(ctx->password); kfree(origin_fullpath); kfree(ctx); diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 4dada26d56b5..f2ad0ccd08a7 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1120,24 +1120,63 @@ static bool target_share_equal(struct cifs_tcon *tcon, const char *s1) return match; } -static bool is_ses_good(struct cifs_ses *ses) +static bool is_ses_good(struct cifs_tcon *tcon, struct cifs_ses *ses) { struct TCP_Server_Info *server = ses->server; - struct cifs_tcon *tcon = ses->tcon_ipc; + struct cifs_tcon *ipc = NULL; bool ret; + spin_lock(&cifs_tcp_ses_lock); spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); + ret = !cifs_chan_needs_reconnect(ses, server) && - ses->ses_status == SES_GOOD && - !tcon->need_reconnect; + ses->ses_status == SES_GOOD; + spin_unlock(&ses->chan_lock); + + if (!ret) + goto out; + + if (likely(ses->tcon_ipc)) { + if (ses->tcon_ipc->need_reconnect) { + ret = false; + goto out; + } + } else { + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + + ipc = cifs_setup_ipc(ses, tcon->seal); + + spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->ses_lock); + if (!IS_ERR(ipc)) { + if (!ses->tcon_ipc) { + ses->tcon_ipc = ipc; + ipc = NULL; + } + } else { + ret = false; + ipc = NULL; + } + } + +out: spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + if (ipc && server->ops->tree_disconnect) { + unsigned int xid = get_xid(); + + (void)server->ops->tree_disconnect(xid, ipc); + _free_xid(xid); + } + tconInfoFree(ipc, netfs_trace_tcon_ref_free_ipc); return ret; } /* Refresh dfs referral of @ses */ -static void refresh_ses_referral(struct cifs_ses *ses) +static void refresh_ses_referral(struct cifs_tcon *tcon, struct cifs_ses *ses) { struct cache_entry *ce; unsigned int xid; @@ -1153,7 +1192,7 @@ static void refresh_ses_referral(struct cifs_ses *ses) } ses = CIFS_DFS_ROOT_SES(ses); - if (!is_ses_good(ses)) { + if (!is_ses_good(tcon, ses)) { cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__); goto out; @@ -1241,7 +1280,7 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) up_read(&htable_rw_lock); ses = CIFS_DFS_ROOT_SES(ses); - if (!is_ses_good(ses)) { + if (!is_ses_good(tcon, ses)) { cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__); goto out; @@ -1309,7 +1348,7 @@ void dfs_cache_refresh(struct work_struct *work) tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); list_for_each_entry(ses, &tcon->dfs_ses_list, dlist) - refresh_ses_referral(ses); + refresh_ses_referral(tcon, ses); refresh_tcon_referral(tcon, false); queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e60927b2a7c8..2a0d8b87bd8e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1435,12 +1435,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } + kfree(ctx->source); ctx->source = smb3_fs_context_fullpath(ctx, '/'); if (IS_ERR(ctx->source)) { ctx->source = NULL; cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } + kfree(fc->source); fc->source = kstrdup(ctx->source, GFP_KERNEL); 
if (fc->source == NULL) { cifs_errorf(fc, "OOM when copying UNC string\n"); @@ -1468,7 +1470,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; } - if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) > + if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) == CIFS_MAX_USERNAME_LEN) { pr_warn("username too long\n"); goto cifs_parse_mount_err; @@ -1832,6 +1834,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = NULL; kfree_sensitive(ctx->password2); ctx->password2 = NULL; + kfree(ctx->source); + ctx->source = NULL; + kfree(fc->source); + fc->source = NULL; return -EINVAL; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cac355364e43..6be72e91264a 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -101,7 +101,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cifs_dbg(FYI, "%s: inode %llu is new\n", __func__, cifs_i->uniqueid); return; @@ -146,7 +146,7 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) { /* only provide fake values on a new inode */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { if (fattr->cf_cifsattrs & ATTR_DIRECTORY) set_nlink(inode, 2); else @@ -167,12 +167,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - if (!(inode->i_state & I_NEW) && + if (!(inode_state_read_once(inode) & I_NEW) && unlikely(inode_wrong_type(inode, fattr->cf_mode))) { CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; cifs_revalidate_cache(inode, fattr); @@ -194,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, inode->i_gid = fattr->cf_gid; /* if dynperm is set, don't clobber existing mode */ - if (inode->i_state & I_NEW || + if (inode_state_read(inode) & I_NEW || !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) inode->i_mode = fattr->cf_mode; @@ -236,7 +236,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, if (fattr->cf_flags & CIFS_FATTR_JUNCTION) inode->i_flags |= S_AUTOMOUNT; - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cifs_set_netfs_context(inode); cifs_set_ops(inode); } @@ -1638,7 +1638,7 @@ retry_iget5_locked: cifs_fattr_to_inode(inode, fattr, false); if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_ino = hash; cifs_fscache_get_inode_cookie(inode); unlock_new_inode(inode); diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 09e3fc81d7cb..69cb81fa0d3a 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1294,6 +1294,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb); if (smb2_to_name == NULL) { rc = -ENOMEM; + if (cfile) + cifsFileInfo_put(cfile); goto smb2_rename_path; } in_iov.iov_base = smb2_to_name; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0f9130ef2e7d..1e39f2165e42 100644 --- a/fs/smb/client/smb2ops.c +++ 
b/fs/smb/client/smb2ops.c @@ -2799,11 +2799,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; int rc; __le16 *utf16_path; - struct cached_fid *cfid = NULL; + struct cached_fid *cfid; int retries = 0, cur_sleep = 1; replay_again: /* reinitialize for possible replay */ + cfid = NULL; flags = CIFS_CP_CREATE_CLOSE_OP; oplock = SMB2_OPLOCK_LEVEL_NONE; server = cifs_pick_channel(ses); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index b0739a2661bf..8b4a4573e9c3 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4054,9 +4054,12 @@ replay_again: smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; - smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), - le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov, + rc = smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), + &rsp_iov, sizeof(struct file_notify_information)); + if (rc) + goto cnotify_exit; *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset), le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL); diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 85a4c55b61b8..c6c428c2e08d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -290,6 +290,9 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) break; case SMBDIRECT_SOCKET_CREATED: + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_ERROR; break; diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 051cd9dbba13..915cedde5d66 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -830,7 +830,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; - if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + if (CIFS_CHAN_NEEDS_RECONNECT(ses, cur)) continue; /* diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 46f87fd1ce1c..2c08cccfa680 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -263,10 +263,16 @@ static void ipc_msg_handle_free(int handle) static int handle_response(int type, void *payload, size_t sz) { - unsigned int handle = *(unsigned int *)payload; + unsigned int handle; struct ipc_msg_table_entry *entry; int ret = 0; + /* Prevent 4-byte read beyond declared payload size */ + if (sz < sizeof(unsigned int)) + return -EINVAL; + + handle = *(unsigned int *)payload; + ipc_update_last_active(); down_read(&ipc_msg_table_lock); hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) { diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 89b02efdba0c..e2be9a496154 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -334,6 +334,9 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) break; case SMBDIRECT_SOCKET_CREATED: + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_ERROR; break; @@ -418,9 +421,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) sc->ib.dev = sc->rdma.cm_id->device; - INIT_WORK(&sc->recv_io.posted.refill_work, - smb_direct_post_recv_credits); - INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); conn = ksmbd_conn_alloc(); @@ -469,6 +469,9 @@ static 
void free_transport(struct smb_direct_transport *t) disable_delayed_work_sync(&sc->idle.timer_work); disable_work_sync(&sc->idle.immediate_work); + if (sc->rdma.cm_id) + rdma_lock_handler(sc->rdma.cm_id); + if (sc->ib.qp) { ib_drain_qp(sc->ib.qp); sc->ib.qp = NULL; @@ -497,8 +500,10 @@ static void free_transport(struct smb_direct_transport *t) ib_free_cq(sc->ib.recv_cq); if (sc->ib.pd) ib_dealloc_pd(sc->ib.pd); - if (sc->rdma.cm_id) + if (sc->rdma.cm_id) { + rdma_unlock_handler(sc->rdma.cm_id); rdma_destroy_id(sc->rdma.cm_id); + } smb_direct_destroy_pools(sc); ksmbd_conn_free(KSMBD_TRANS(t)->conn); @@ -1727,10 +1732,10 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { - ib_drain_qp(sc->ib.qp); - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; smb_direct_disconnect_rdma_work(&sc->disconnect_work); + if (sc->ib.qp) + ib_drain_qp(sc->ib.qp); break; } case RDMA_CM_EVENT_CONNECT_ERROR: { @@ -1881,6 +1886,7 @@ static int smb_direct_accept_client(struct smbdirect_socket *sc) static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) { struct smbdirect_recv_io *recvmsg; + bool recv_posted = false; int ret; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); @@ -1897,6 +1903,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) pr_err("Can't post recv: %d\n", ret); goto out_err; } + recv_posted = true; ret = smb_direct_accept_client(sc); if (ret) { @@ -1904,10 +1911,16 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) goto out_err; } - smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); return 0; out_err: - put_recvmsg(sc, recvmsg); + /* + * If the recv was never posted, return it to the free list. + * If it was posted, leave it alone so disconnect teardown can + * drain the QP and complete it (flush) and the completion path + * will unmap it exactly once. + */ + if (!recv_posted) + put_recvmsg(sc, recvmsg); return ret; } @@ -2249,8 +2262,8 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return -ECONNABORTED; ret = smb_direct_check_recvmsg(recvmsg); - if (ret == -ECONNABORTED) - goto out; + if (ret) + goto put; req = (struct smbdirect_negotiate_req *)recvmsg->packet; sp->max_recv_size = min_t(int, sp->max_recv_size, @@ -2265,14 +2278,38 @@ static int smb_direct_prepare(struct ksmbd_transport *t) sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1); - ret = smb_direct_send_negotiate_response(sc, ret); -out: +put: spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); sc->recv_io.reassembly.queue_length--; list_del(&recvmsg->list); spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); put_recvmsg(sc, recvmsg); + if (ret == -ECONNABORTED) + return ret; + + if (ret) + goto respond; + + /* + * We negotiated with success, so we need to refill the recv queue. + * We do that with sc->idle.immediate_work still being disabled + * via smbdirect_socket_init(), so that queue_work(sc->workqueue, + * &sc->idle.immediate_work) in smb_direct_post_recv_credits() + * is a no-op. + * + * The message that grants the credits to the client is + * the negotiate response. 
+ */ + INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits); + smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work); + if (unlikely(sc->first_error)) + return sc->first_error; + INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work); + +respond: + ret = smb_direct_send_negotiate_response(sc, ret); + return ret; } @@ -2581,7 +2618,7 @@ void ksmbd_rdma_destroy(void) } } -bool ksmbd_rdma_capable_netdev(struct net_device *netdev) +static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev) { struct smb_direct_device *smb_dev; int i; @@ -2623,6 +2660,28 @@ out: return rdma_capable; } +bool ksmbd_rdma_capable_netdev(struct net_device *netdev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + if (ksmbd_find_rdma_capable_netdev(netdev)) + return true; + + /* check if netdev is bridge or VLAN */ + if (netif_is_bridge_master(netdev) || + netdev->priv_flags & IFF_802_1Q_VLAN) + netdev_for_each_lower_dev(netdev, lower_dev, iter) + if (ksmbd_find_rdma_capable_netdev(lower_dev)) + return true; + + /* check if netdev is IPoIB safely without layer violation */ + if (netdev->type == ARPHRD_INFINIBAND) + return true; + + return false; +} + static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .prepare = smb_direct_prepare, .disconnect = smb_direct_disconnect, diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7a1e3dcc2cde..d2e391c29464 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -290,8 +290,11 @@ static int ksmbd_kthread_fn(void *p) } } up_read(&conn_list_lock); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + /* Per-IP limit hit: release the just-accepted socket. */ + sock_release(client_sk); continue; + } skip_max_ip_conns_limit: if (server_conf.max_connections && diff --git a/fs/splice.c b/fs/splice.c index f5094b6d00a0..d338fe56b50b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1498,7 +1498,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * For lack of a better implementation, implement vmsplice() to userspace - * as a simple copy of the pipes pages to the user iov. + * as a simple copy of the pipe's pages to the user iov. 
*/ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index cceae3b78698..82b687414e65 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -86,7 +86,7 @@ struct inode *squashfs_iget(struct super_block *sb, long long ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..7c66b96b59be 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: @@ -1183,11 +1184,14 @@ static inline bool get_active_super(struct super_block *sb) static const char *filesystems_freeze_ptr = "filesystems_freeze"; -static void filesystems_freeze_callback(struct super_block *sb, void *unused) +static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; + if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + return; + if (!get_active_super(sb)) return; @@ -1201,9 +1205,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused) deactivate_super(sb); } -void filesystems_freeze(void) +void filesystems_freeze(bool freeze_all) { - __iterate_supers(filesystems_freeze_callback, NULL, + void *freeze_all_ptr = NULL; + + if (freeze_all) + freeze_all_ptr = &freeze_all; + __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } diff --git a/fs/sync.c b/fs/sync.c index 2955cd4c77a3..431fc5f5be06 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -117,16 +117,17 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { int nowait = 0; + int wait = 1; /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); + iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); @@ -182,7 +183,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) if (!file->f_op->fsync) return -EINVAL; - if (!datasync && (inode->i_state & I_DIRTY_TIME)) + if (!datasync && (inode_state_read_once(inode) & I_DIRTY_TIME)) mark_inode_dirty_sync(inode); return file->f_op->fsync(file, start, end, datasync); } @@ -280,14 +281,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - int sync_mode = WB_SYNC_NONE; - if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) - sync_mode = WB_SYNC_ALL; - - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - sync_mode); + ret = filemap_fdatawrite_range(mapping, offset, + endbyte); + else + ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ca41ce8208c4..c3265b8804f5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1323,7 +1323,7 @@ int ubifs_fsync(struct 
file *file, loff_t start, loff_t end, int datasync) inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ - if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + if (!datasync || (inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 46952a33c4e6..f453c37cee37 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -114,7 +114,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) inode = iget_locked(sb, inum); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ui = ubifs_inode(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a79d73f28aa7..7fae8002344a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1962,7 +1962,7 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (UDF_I(inode)->i_hidden != hidden_inode) { iput(inode); return ERR_PTR(-EFSCORRUPTED); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 8361c00e8fa6..e2b0a35de2a7 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -655,7 +655,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ufsi = UFS_I(inode); diff --git a/fs/utimes.c b/fs/utimes.c index c7c7958e57b2..3e7156396230 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -76,6 +76,7 @@ retry_deleg: out: return error; } +EXPORT_SYMBOL_GPL(vfs_utimes); static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index de840abc0bcd..57e47077c75a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -73,7 +73,8 @@ #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 -#define XFS_ERRTAG_MAX 46 +#define XFS_ERRTAG_FORCE_ZERO_RANGE 46 +#define XFS_ERRTAG_MAX 47 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ -XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ +XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) #endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index d36a6ae0abe5..d4fcf591e63d 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -50,6 +50,12 @@ struct xfs_rtgroup { uint8_t *rtg_rsum_cache; struct xfs_open_zone *rtg_open_zone; }; + + /* + * Count of outstanding GC operations for zoned XFS. Any RTG with a + * non-zero rtg_gccount will not be picked as new GC victim. + */ + atomic_t rtg_gccount; }; /* diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 2ef7742be7d3..7bfa37c99480 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1249,7 +1249,7 @@ xchk_irele( * hits do not clear DONTCACHE, so we must do it here. 
*/ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index a90a011c7e5f..4f7040c9ddf0 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1933,7 +1933,7 @@ xrep_inode_pptr( * Unlinked inodes that cannot be added to the directory tree will not * have a parent pointer. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return 0; /* Children of the superblock do not have parent pointers. */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 3b692c4acc1e..11d5de10fd56 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -915,7 +915,7 @@ xchk_pptr_looks_zapped( * Temporary files that cannot be linked into the directory tree do not * have attr forks because they cannot ever have parents. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return false; /* diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 5902398185a8..df629892462f 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -184,7 +184,7 @@ xrep_symlink_salvage_inline( sc->ip->i_disk_size == 1 && old_target[0] == '?') return 0; - nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes); memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26f79815533..0c2ed00733f2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -742,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 06ca11731e43..2208a720ec3f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -514,7 +514,7 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. 
*/ - if (!(VFS_I(ip)->i_state & I_FREEING)) + if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING)) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index ee49f20875af..6917de832191 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -726,8 +726,10 @@ xfs_trim_rtgroup_extents( break; } - if (!tr.queued) + if (!tr.queued) { + kfree(tr.extents); break; + } /* * We hand the extent list to the discard function here so the diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2702fef2c90c..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -27,6 +27,8 @@ #include "xfs_file.h" #include "xfs_aops.h" #include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -674,8 +676,17 @@ xfs_file_dio_write_aligned( struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; ssize_t ret; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -693,7 +704,7 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: xfs_iunlock(ip, iolock); return ret; @@ -890,15 +901,7 @@ xfs_file_dio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* - * For always COW inodes we also must check the alignment of each - * individual iovec segment, as they could end up with different - * I/Os due to the way bio_iov_iter_get_pages works, and we'd - * then overwrite an already written block. - */ - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || - (xfs_is_always_cow_inode(ip) && - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from); @@ -1254,23 +1257,36 @@ xfs_falloc_zero_range( struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; - trace_xfs_zero_file_space(XFS_I(inode)); + trace_xfs_zero_file_space(ip); error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); - if (error) - return error; + /* + * Zero range implements a full zeroing mechanism but is only used in + * limited situations. It is more efficient to allocate unwritten + * extents than to perform zeroing here, so use an errortag to randomly + * force zeroing on DEBUG kernels for added test coverage. 
+ */ + if (XFS_TEST_ERROR(ip->i_mount, + XFS_ERRTAG_FORCE_ZERO_RANGE)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); + if (error) + return error; - len = round_up(offset + len, blksize) - round_down(offset, blksize); - offset = round_down(offset, blksize); - error = xfs_alloc_file_space(XFS_I(inode), offset, len); + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); + } if (error) return error; return xfs_falloc_setsize(file, new_size); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 7c541fb373d5..3c1557fb1cf0 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -285,7 +285,7 @@ xfs_inode_mark_sick( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } @@ -309,7 +309,7 @@ xfs_inode_mark_corrupt( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e44040206851..f3fc4d21bfe1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -334,7 +334,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; - unsigned long state = inode->i_state; + unsigned long state = inode_state_read_once(inode); error = inode_init_always(mp->m_super, inode); @@ -345,7 +345,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; - inode->i_state = state; + inode_state_assign_raw(inode, state); mapping_set_folio_min_order(inode->i_mapping, M_IGEO(mp)->min_folio_order); return error; @@ -411,7 +411,7 @@ xfs_iget_recycle( ip->i_flags |= XFS_INEW; xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; + inode_state_assign_raw(inode, I_NEW); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36b39539e561..f1f88e48fe22 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next( next_ip->i_prev_unlinked = prev_agino; trace_xfs_iunlink_reload_next(next_ip); rele: - ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE)); if (xfs_is_quotacheck_running(mp) && next_ip) xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); xfs_irele(next_ip); @@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout( */ xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); - VFS_I(tmpfile)->i_state |= I_LINKABLE; + inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE); *wip = tmpfile; return 0; @@ -2330,7 +2330,7 @@ retry: * flag from the inode so it doesn't accidentally get misused in * future. */ - VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; + inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE); } out_commit: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 1bd411a1114c..2eb0c6011a2e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -113,9 +113,9 @@ xfs_inode_item_precommit( * to log the timestamps, or will clear already cleared fields in the * worst case. 
*/ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d3f6e3e42a11..04f39ea15898 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1091,6 +1091,29 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { }; #endif /* CONFIG_XFS_RT */ +#ifdef DEBUG +static void +xfs_check_atomic_cow_conversion( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + const struct xfs_bmbt_irec *cmap) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec cmap2 = { }; + + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2)) + xfs_trim_extent(&cmap2, offset_fsb, count_fsb); + + ASSERT(cmap2.br_startoff == cmap->br_startoff); + ASSERT(cmap2.br_blockcount == cmap->br_blockcount); + ASSERT(cmap2.br_startblock == cmap->br_startblock); + ASSERT(cmap2.br_state == cmap->br_state); +} +#else +# define xfs_check_atomic_cow_conversion(...) ((void)0) +#endif + static int xfs_atomic_write_cow_iomap_begin( struct inode *inode, @@ -1102,9 +1125,10 @@ xfs_atomic_write_cow_iomap_begin( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); - xfs_filblks_t count_fsb = end_fsb - offset_fsb; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + const xfs_filblks_t count_fsb = end_fsb - offset_fsb; + xfs_filblks_t hole_count_fsb; int nmaps = 1; xfs_filblks_t resaligned; struct xfs_bmbt_irec cmap; @@ -1130,7 +1154,7 @@ xfs_atomic_write_cow_iomap_begin( return -EAGAIN; trace_xfs_iomap_atomic_write_cow(ip, offset, length); - +retry: xfs_ilock(ip, XFS_ILOCK_EXCL); if (!ip->i_cowfp) { @@ -1141,14 +1165,22 @@ xfs_atomic_write_cow_iomap_begin( if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cmap.br_startoff = end_fsb; if (cmap.br_startoff <= offset_fsb) { + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + + /* + * cmap could extend outside the write range due to previous + * speculative preallocations. We must trim cmap to the write + * range because the cow fork treats written mappings to mean + * "write in progress". + */ xfs_trim_extent(&cmap, offset_fsb, count_fsb); goto found; } - end_fsb = cmap.br_startoff; - count_fsb = end_fsb - offset_fsb; + hole_count_fsb = cmap.br_startoff - offset_fsb; - resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb, xfs_get_cowextsz_hint(ip)); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1169,8 +1201,10 @@ xfs_atomic_write_cow_iomap_begin( if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cmap.br_startoff = end_fsb; if (cmap.br_startoff <= offset_fsb) { - xfs_trim_extent(&cmap, offset_fsb, count_fsb); xfs_trans_cancel(tp); + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + xfs_trim_extent(&cmap, offset_fsb, count_fsb); goto found; } @@ -1182,7 +1216,7 @@ xfs_atomic_write_cow_iomap_begin( * atomic writes to that same range will be aligned (and don't require * this COW-based method). 
*/ - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); if (error) { @@ -1195,21 +1229,43 @@ xfs_atomic_write_cow_iomap_begin( if (error) goto out_unlock; + /* + * cmap could map more blocks than the range we passed into bmapi_write + * because of EXTSZALIGN or adjacent pre-existing unwritten mappings + * that were merged. Trim cmap to the original write range so that we + * don't convert more than we were asked to do for this write. + */ + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + found: if (cmap.br_state != XFS_EXT_NORM) { - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, - count_fsb); + error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff, + cmap.br_blockcount); if (error) goto out_unlock; cmap.br_state = XFS_EXT_NORM; + xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb, + &cmap); } - length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); - trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap); seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); +convert_delay: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap, + NULL); + if (error) + return error; + + /* + * Try the lookup again, because the delalloc conversion might have + * turned the COW mapping into unwritten, but we need it to be in + * written state. + */ + goto retry; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1702,6 +1758,8 @@ xfs_buffered_write_iomap_begin( struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, + iomap); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1767,21 +1825,41 @@ xfs_buffered_write_iomap_begin( } /* - * For zeroing, trim a delalloc extent that extends beyond the EOF - * block. If it starts beyond the EOF block, convert it to an + * For zeroing, trim extents that extend beyond the EOF block. If a + * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && - isnullstartblock(imap.br_startblock)) { + if (flags & IOMAP_ZERO) { xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + u64 end; - if (offset_fsb >= eof_fsb) + if (isnullstartblock(imap.br_startblock) && + offset_fsb >= eof_fsb) goto convert_delay; - if (end_fsb > eof_fsb) { + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) end_fsb = eof_fsb; - xfs_trim_extent(&imap, offset_fsb, - end_fsb - offset_fsb); + + /* + * Look up dirty folios for unwritten mappings within EOF. + * Providing this bypasses the flush iomap uses to trigger + * extent conversion when unwritten mappings have dirty + * pagecache in need of zeroing. + * + * Trim the mapping to the end pos of the lookup, which in turn + * was trimmed to the end of the batch if it became full before + * the end of the mapping. 
+ */ + if (imap.br_state == XFS_EXT_UNWRITTEN && + offset_fsb < eof_fsb) { + loff_t len = min(count, + XFS_FSB_TO_B(mp, imap.br_blockcount)); + + end = iomap_fill_dirty_folios(iter, offset, len); + end_fsb = min_t(xfs_fileoff_t, end_fsb, + XFS_B_TO_FSB(mp, end)); } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); } /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index caff0125faea..ad94fbf55014 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1420,7 +1420,7 @@ xfs_setup_inode( bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; - inode->i_state |= I_NEW; + inode_state_set_raw(inode, I_NEW); inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 36cda724da89..9d1ed9bb0bee 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - if ((inode->i_state & I_DIRTY_PAGES) || + if ((inode_state_read_once(inode) & I_DIRTY_PAGES) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&inode->i_dio_count)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1067ebb3b001..bc71aa9dcee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1693,7 +1693,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 23cdab4515bb..8dde444596f1 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -246,6 +246,14 @@ xfs_zoned_map_extent( * If a data write raced with this GC write, keep the existing data in * the data fork, mark our newly written GC extent as reclaimable, then * move on to the next extent. + * + * Note that this can also happen when racing with operations that do + * not actually invalidate the data, but just move it to a different + * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the + * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the + * data was just moved around, GC fails to free the zone, but the zone + * becomes a GC candidate again as soon as all previous GC I/O has + * finished and these blocks will be moved out eventually. */ if (old_startblock != NULLFSBLOCK && old_startblock != data.br_startblock) @@ -607,7 +615,7 @@ xfs_select_open_zone_mru( lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, false)) + if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -1196,6 +1204,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1226,10 +1235,33 @@ xfs_mount_zones( return -ENOMEM; xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. 
The default lower bound is 4MiB, but for zoned file + * systems we want to increase that both to reduce seeks, but also more + * importantly so that workloads that writes files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase is to the zone size capped by the max + * extent len. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also get applied to non-zoned files on the data device if + * there are any. On typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. + */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { error = blkdev_report_zones(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), @@ -1241,8 +1273,10 @@ xfs_mount_zones( while ((rtg = xfs_rtgroup_next(mp, rtg))) { error = xfs_init_zone(&iz, rtg, NULL); - if (error) + if (error) { + xfs_rtgroup_rele(rtg); goto out_free_zone_info; + } } } diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 109877d9a6bf..4ade54445532 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -114,6 +114,8 @@ struct xfs_gc_bio { /* Open Zone being written to */ struct xfs_open_zone *oz; + struct xfs_rtgroup *victim_rtg; + /* Bio used for reads and writes, including the bvec used by it */ struct bio_vec bv; struct bio bio; /* must be last */ @@ -264,6 +266,7 @@ xfs_zone_gc_iter_init( iter->rec_count = 0; iter->rec_idx = 0; iter->victim_rtg = victim_rtg; + atomic_inc(&victim_rtg->rtg_gccount); } /* @@ -362,6 +365,7 @@ xfs_zone_gc_query( return 0; done: + atomic_dec(&iter->victim_rtg->rtg_gccount); xfs_rtgroup_rele(iter->victim_rtg); iter->victim_rtg = NULL; return 0; @@ -451,6 +455,20 @@ xfs_zone_gc_pick_victim_from( if (!rtg) continue; + /* + * If the zone is already undergoing GC, don't pick it again. + * + * This prevents us from picking one of the zones for which we + * already submitted GC I/O, but for which the remapping hasn't + * concluded yet. This won't cause data corruption, but + * increases write amplification and slows down GC, so this is + * a bad thing. 
+ */ + if (atomic_read(&rtg->rtg_gccount)) { + xfs_rtgroup_rele(rtg); + continue; + } + /* skip zones that are just waiting for a reset */ if (rtg_rmap(rtg)->i_used_blocks == 0 || rtg_rmap(rtg)->i_used_blocks >= victim_used) { @@ -688,6 +706,9 @@ xfs_zone_gc_start_chunk( chunk->scratch = &data->scratch[data->scratch_idx]; chunk->data = data; chunk->oz = oz; + chunk->victim_rtg = iter->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); bio->bi_end_io = xfs_zone_gc_end_io; @@ -710,6 +731,8 @@ static void xfs_zone_gc_free_chunk( struct xfs_gc_bio *chunk) { + atomic_dec(&chunk->victim_rtg->rtg_gccount); + xfs_rtgroup_rele(chunk->victim_rtg); list_del(&chunk->entry); xfs_open_zone_put(chunk->oz); xfs_irele(chunk->ip); @@ -770,6 +793,10 @@ xfs_zone_gc_split_write( split_chunk->oz = chunk->oz; atomic_inc(&chunk->oz->oz_ref); + split_chunk->victim_rtg = chunk->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); + chunk->offset += split_len; chunk->len -= split_len; chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 90e2ad8ee5f4..c1e5e30e90a0 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = { static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_read_iomap_ops); + iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); + return 0; } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_read_iomap_ops); + iomap_bio_readahead(rac, &zonefs_read_iomap_ops); } /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70be0b3dda49..086a31269198 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -644,7 +644,7 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { WARN_ON_ONCE(inode->i_private != z); return inode; } @@ -683,7 +683,7 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; inode->i_ino = ino; |
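Several of the hunks above (cifs inode.c, squashfs, ubifs, udf, ufs, zonefs, and the XFS scrub/icache/inode/iops paths) make the same mechanical change: open-coded tests and updates of inode->i_state are routed through the inode_state_read_once(), inode_state_clear() and *_raw() helpers. A minimal sketch of the converted lookup pattern follows; the foofs_iget() name and the elided initialisation are hypothetical, and only the helper calls mirror the hunks above.

	struct inode *foofs_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = iget_locked(sb, ino);

		if (!inode)
			return ERR_PTR(-ENOMEM);
		/* Only a freshly allocated inode carries I_NEW; anything else is already set up. */
		if (!(inode_state_read_once(inode) & I_NEW))
			return inode;

		/* ... filesystem-specific initialisation of the new inode ... */

		unlock_new_inode(inode);
		return inode;
	}

Where a flag is modified rather than read, the hunks keep the update under inode->i_lock, as in the xfs_health.c change:

	spin_lock(&VFS_I(ip)->i_lock);
	inode_state_clear(VFS_I(ip), I_DONTCACHE);
	spin_unlock(&VFS_I(ip)->i_lock);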

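For readability, the bounds check added to handle_response() in the ksmbd transport_ipc.c hunk is restated below as a standalone sketch; only the size validation itself comes from the patch, and the table lookup it protects is elided.

	static int handle_response(int type, void *payload, size_t sz)
	{
		unsigned int handle;

		/* Prevent a 4-byte read beyond the declared payload size. */
		if (sz < sizeof(unsigned int))
			return -EINVAL;

		handle = *(unsigned int *)payload;
		/* ... look up the matching ipc_msg_table_entry by handle ... */
		return 0;
	}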