diff options
Diffstat (limited to 'fs')
248 files changed, 3605 insertions, 1781 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 795c6388744c..1581ebac5bb4 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return generic_drop_inode(inode); + return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 69e1dd55b160..894d2bad6b6c 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume) list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); } } @@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas if (reason != afs_cb_break_for_deleted && vnode->status.type == AFS_FTYPE_FILE && atomic_read(&vnode->cb_nr_mmap)) - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true); } else { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index bfb69e066672..89d36e3e5c79 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1823,7 +1823,8 @@ error: static void afs_rename_success(struct afs_operation *op) { - struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct afs_vnode *vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; _enter("op=%08x", op->debug_id); @@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + if (op->more_files[0].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[0]); + if (op->more_files[1].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[1]); /* If we're moving a subdir between dirs, we need to update * its DV counter too as the ".." will be altered. */ - if (S_ISDIR(vnode->netfs.inode.i_mode) && - op->file[0].vnode != op->file[1].vnode) { - u64 new_dv; + if (op->file[0].vnode != op->file[1].vnode) { + if (S_ISDIR(vnode->netfs.inode.i_mode)) { + u64 new_dv; - write_seqlock(&vnode->cb_lock); + write_seqlock(&vnode->cb_lock); - new_dv = vnode->status.data_version + 1; - trace_afs_set_dv(vnode, new_dv); - vnode->status.data_version = new_dv; - inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); - write_sequnlock(&vnode->cb_lock); + write_sequnlock(&vnode->cb_lock); + } + + if ((op->rename.rename_flags & RENAME_EXCHANGE) && + S_ISDIR(new_vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&new_vnode->cb_lock); + + new_dv = new_vnode->status.data_version + 1; + new_vnode->status.data_version = new_dv; + inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv); + + write_sequnlock(&new_vnode->cb_lock); + } } } @@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op) if (S_ISDIR(vnode->netfs.inode.i_mode) && new_dvnode != orig_dvnode && test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_edit_dir_update_dotdot(vnode, new_dvnode, - afs_edit_dir_for_rename_sub); + afs_edit_dir_update(vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); new_inode = d_inode(new_dentry); if (new_inode) { @@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op) /* Now we can update d_fsdata on the dentries to reflect their * new parent's data_version. - * - * Note that if we ever implement RENAME_EXCHANGE, we'll have - * to update both dentries with opposing dir versions. */ afs_update_dentry_version(op, new_dvp, op->dentry); afs_update_dentry_version(op, new_dvp, op->dentry_2); @@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op) fscache_end_operation(&new_cres); } +static void afs_rename_exchange_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *old_vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + + _enter("op=%08x", op->debug_id); + + if (new_dvnode == orig_dvnode) { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) { + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + afs_edit_dir_update(orig_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + } + + d_exchange(old_dentry, new_dentry); + up_write(&orig_dvnode->validate_lock); + } else { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) + afs_edit_dir_update(new_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + + if (S_ISDIR(old_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags)) + afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + if (S_ISDIR(new_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags)) + afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode, + afs_edit_dir_for_rename_sub); + + /* Now we can update d_fsdata on the dentries to reflect their + * new parents' data_version. + */ + afs_update_dentry_version(op, new_dvp, old_dentry); + afs_update_dentry_version(op, orig_dvp, new_dentry); + + d_exchange(old_dentry, new_dentry); + up_write(&new_dvnode->validate_lock); + } +} + static void afs_rename_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); @@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = { .put = afs_rename_put, }; +#if 0 /* Autoswitched in yfs_fs_rename_replace(). */ +static const struct afs_operation_ops afs_rename_replace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_replace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; +#endif + +static const struct afs_operation_ops afs_rename_noreplace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_noreplace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +static const struct afs_operation_ops afs_rename_exchange_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_exchange, + .success = afs_rename_success, + .edit_dir = afs_rename_exchange_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ @@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; - struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL; int ret; - if (flags) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; /* Don't allow silly-rename files be moved around. */ @@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + if (d_is_positive(new_dentry)) + new_vnode = AFS_FS_I(d_inode(new_dentry)); _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (ret < 0) goto error; + ret = -ENOMEM; + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) + goto error; + afs_op_set_vnode(op, 0, orig_dvnode); afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; @@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = vnode; + op->more_files[0].speculative = true; + op->more_files[1].vnode = new_vnode; + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old_dentry; op->dentry_2 = new_dentry; + op->rename.rename_flags = flags; op->rename.new_negative = d_is_negative(new_dentry); - op->ops = &afs_rename_operation; - /* For non-directories, check whether the target is busy and if so, - * make a copy of the dentry and then do a silly-rename. If the - * silly-rename succeeds, the copied dentry is hashed and becomes the - * new target. - */ - if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { - /* To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + if (flags & RENAME_NOREPLACE) { + op->ops = &afs_rename_noreplace_operation; + } else if (flags & RENAME_EXCHANGE) { + op->ops = &afs_rename_exchange_operation; + d_drop(new_dentry); + } else { + /* If we might displace the target, we might need to do silly + * rename. */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - op->rename.rehash = new_dentry; - } + op->ops = &afs_rename_operation; - if (d_count(new_dentry) > 2) { - /* copy the target dentry's name */ - op->rename.tmp = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!op->rename.tmp) { - afs_op_nomem(op); - goto error; + /* For non-directories, check whether the target is busy and if + * so, make a copy of the dentry and then do a silly-rename. + * If the silly-rename succeeds, the copied dentry is hashed + * and becomes the new target. + */ + if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { + /* To prevent any new references to the target during + * the rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; } - ret = afs_sillyrename(new_dvnode, - AFS_FS_I(d_inode(new_dentry)), - new_dentry, op->key); - if (ret) { - afs_op_set_error(op, ret); - goto error; + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } - - op->dentry_2 = op->rename.tmp; - op->rename.rehash = NULL; - op->rename.new_negative = true; } } @@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, d_drop(old_dentry); ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; out: afs_dir_unuse_cookie(orig_dvnode, ret); if (new_dvnode != orig_dvnode) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 60a549f1d9c5..4b1342c72089 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -522,11 +522,11 @@ error: } /* - * Edit a subdirectory that has been moved between directories to update the - * ".." entry. + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. */ -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why) +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *block; union afs_xdr_dirent *de; @@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto already_invalidated; - slot = afs_dir_scan_block(block, &dotdot_name, b); + slot = afs_dir_scan_block(block, name, b); if (slot >= 0) goto found_dirent; @@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d /* Didn't find the dirent to clobber. Download the directory again. */ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); goto out; @@ -576,7 +576,7 @@ found_dirent: de->u.unique = htonl(new_dvnode->fid.unique); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, - ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); kunmap_local(block); netfs_single_mark_inode_dirty(&vnode->netfs.inode); @@ -589,12 +589,12 @@ out: already_invalidated: kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; error: trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, - 0, 0, 0, 0, ".."); + 0, 0, 0, 0, name->name); goto out; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 0b80eb93fa40..014495d4b868 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode if (IS_ERR(op)) return PTR_ERR(op); + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; @@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old; op->dentry_2 = new; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f848..e1cb17b85791 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1124ea4000cb..444a3ea4fdf6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -562,6 +562,7 @@ struct afs_server { #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ @@ -891,9 +892,10 @@ struct afs_operation { bool need_rehash; } unlink; struct { - struct dentry *rehash; - struct dentry *tmp; - bool new_negative; + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; } rename; struct { struct netfs_io_subrequest *subreq; @@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping, extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); -void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, - enum afs_edit_dir_reason why); +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); /* @@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 02475d415d88..e6bb8237db98 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -169,13 +169,13 @@ static int __init afs_init(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); - afs_wq = alloc_workqueue("afs", 0, 0); + afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0); if (!afs_wq) goto error_afs_wq; afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; - afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!afs_lock_manager) goto error_lockmgr; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 8f2b3a177690..c8a7f266080d 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code) case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; default: return -EREMOTEIO; } diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index e4cd89c44c46..b2f06c1917c2 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -50,6 +50,9 @@ enum YFS_FS_Operations { YFSREMOVEACL = 64171, YFSREMOVEFILE2 = 64173, YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ YFSSTOREDATA64 = 64538, /* YFS Store file data */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a1c24f589d9e..6a4e7da10fc4 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op) afs_op_set_error(op, -EDQUOT); goto failed_but_online; + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. */ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + default: afs_op_accumulate_error(op, error, abort_code); failed_but_online: @@ -620,12 +630,13 @@ iterate_address: op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); - op->volsync.creation = TIME64_MIN; - op->volsync.update = TIME64_MIN; - op->call_responded = false; _debug("address [%u] %u/%u %pISp", op->server_index, addr_index, alist->nr_addrs, rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _leave(" = t"); return true; diff --git a/fs/afs/server.c b/fs/afs/server.c index a97562f831eb..c4428ebddb1d 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate, void afs_put_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - unsigned int a, debug_id = server->debug_id; + unsigned int a, debug_id; bool zero; int r; if (!server) return; + debug_id = server->debug_id; a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); diff --git a/fs/afs/write.c b/fs/afs/write.c index 2e7526ea883a..93ad86ff3345 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work) void afs_issue_write(struct netfs_io_subrequest *subreq) { subreq->work.func = afs_issue_write_worker; - if (!queue_work(system_unbound_wq, &subreq->work)) + if (!queue_work(system_dfl_wq, &subreq->work)) WARN_ON_ONCE(1); } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 257af259c04a..febf13a49f0b 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op) _enter(""); + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -1071,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op) } /* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. + */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. + */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. + */ +void yfs_fs_rename_replace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_REPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Rename a file or directory, failing if the target dirent exists. + */ +void yfs_fs_rename_noreplace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Exchange a pair of files directories. + */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* * YFS.StoreData64 operation type. */ static const struct afs_call_type yfs_RXYFSStoreData64 = { @@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref) /* Synchronize against RCU protected table->table[] dereferences */ INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); - queue_rcu_work(system_wq, &ctx->free_rwork); + queue_rcu_work(system_percpu_wq, &ctx->free_rwork); } /* diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 1d847a939f29..180a458fc4f7 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -129,7 +129,7 @@ struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *n } return inode; } -EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); +EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 4b095235a0d2..0afb44ce1a85 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -827,7 +827,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ if (bch2_btree_write_buffer_should_flush(c) && __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + !queue_work(system_dfl_wq, &c->btree_write_buffer.flush_work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e0874ad9a6cf..460e2e6341f1 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -684,7 +684,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); + RBIO_CONTEXT_UNBOUND, system_dfl_wq); } else { rbio = bch2_rbio_free(rbio); @@ -921,10 +921,10 @@ csum_err: bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); goto out; decompression_err: - bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq); goto out; decrypt_err: - bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq); goto out; } @@ -963,7 +963,7 @@ static void bch2_read_endio(struct bio *bio) rbio->promote || crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + context = RBIO_CONTEXT_UNBOUND, wq = system_dfl_wq; else if (rbio->pick.crc.csum_type) context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9e028dbcc3d0..29bea8e0e495 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1362,7 +1362,7 @@ int bch2_journal_read(struct bch_fs *c, BCH_DEV_READ_REF_journal_read)) closure_call(&ca->journal.read, bch2_journal_read_device, - system_unbound_wq, + system_dfl_wq, &jlist.cl); else degraded = true; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c46b1053a02c..f2417d298b84 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -801,13 +801,13 @@ int bch2_fs_init_rw(struct bch_fs *c) if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_PERCPU, 1)) || !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0))) + WQ_FREEZABLE|WQ_PERCPU, 0))) return bch_err_throw(c, ENOMEM_fs_other_alloc); int ret = bch2_fs_btree_interior_update_init(c) ?: @@ -975,7 +975,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, sizeof(struct sort_iter_set); if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 512)) || enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, bch2_writes_disabled) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9bf282d2453c..fcd274d83fd7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, bg1 = list_entry(a, struct btrfs_block_group, bg_list); bg2 = list_entry(b, struct btrfs_block_group, bg_list); - return bg1->used > bg2->used; + /* + * Some other task may be updating the ->used field concurrently, but it + * is not serious if we get a stale value or load/store tearing issues, + * as sorting the list of block groups to reclaim is not critical and an + * occasional imperfect order is ok. So silence KCSAN and avoid the + * overhead of locking or any other synchronization. + */ + return data_race(bg1->used > bg2->used); } static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) @@ -2031,7 +2038,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) btrfs_reclaim_sweep(fs_info); spin_lock(&fs_info->unused_bgs_lock); if (!list_empty(&fs_info->reclaim_bgs)) - queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work); spin_unlock(&fs_info->unused_bgs_lock); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b99fb0273292..3bb504c1e32a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -248,7 +248,7 @@ struct btrfs_inode { u64 new_delalloc_bytes; /* * The offset of the last dir index key that was logged. - * This is used only for directories. + * This is used only for directories. Protected by 'log_mutex'. */ u64 last_dir_index_offset; }; @@ -338,6 +338,11 @@ struct btrfs_inode { struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif + struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d09d622016ef..35e3071cec06 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1616,25 +1616,29 @@ out: } /* - * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level. Negative level - * numbers are allowed. + * Convert the compression suffix (eg. after "zlib" starting with ":") to level. + * + * If the resulting level exceeds the algo's supported levels, it will be clamped. + * + * Return <0 if no valid string can be found. + * Return 0 if everything is fine. */ -int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) { int level = 0; int ret; - if (!type) + if (!type) { + *level_ret = btrfs_compress_set_level(type, level); return 0; + } if (str[0] == ':') { ret = kstrtoint(str + 1, 10, &level); if (ret) - level = 0; + return ret; } - level = btrfs_compress_set_level(type, level); - - return level; + *level_ret = btrfs_compress_set_level(type, level); + return 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1b38e707bbd9..7b41b2b5ff44 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -102,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0f8d8e275143..c0c1ddd46b67 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; @@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70fc4e7cc5a0..aa4393eba997 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1958,7 +1958,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 835b0deef9bb..b21cb72835cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -111,6 +111,24 @@ struct btrfs_bio_ctrl { */ unsigned long submit_bitmap; struct readahead_control *ractl; + + /* + * The start offset of the last used extent map by a read operation. + * + * This is for proper compressed read merge. + * U64_MAX means we are starting the read and have made no progress yet. + * + * The current btrfs_bio_is_contig() only uses disk_bytenr as + * the condition to check if the read can be merged with previous + * bio, which is not correct. E.g. two file extents pointing to the + * same extent but with different offset. + * + * So here we need to do extra checks to only merge reads that are + * covered by the same extent map. + * Just extent_map::start will be enough, as they are unique + * inside the same inode. + */ + u64 last_em_start; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, * non-optimal behavior (submitting 2 bios for the same extent). */ if (compress_type != BTRFS_COMPRESS_NONE && - prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->start) + bio_ctrl->last_em_start != U64_MAX && + bio_ctrl->last_em_start != em->start) force_bio_submit = true; - if (prev_em_start) - *prev_em_start = em->start; + bio_ctrl->last_em_start = em->start; btrfs_free_extent_map(em); em = NULL; @@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio) const u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, + .last_em_start = U64_MAX, + }; struct extent_map *em_cached = NULL; int ret; lock_extents_for_read(inode, start, end, &cached_state); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); btrfs_free_extent_map(em_cached); @@ -1512,7 +1532,7 @@ out: /* * Return 0 if we have submitted or queued the sector for submission. - * Return <0 for critical errors. + * Return <0 for critical errors, and the sector will have its dirty flag cleared. * * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ @@ -1535,8 +1555,17 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(filepos < i_size); em = btrfs_get_extent(inode, NULL, filepos, sectorsize); - if (IS_ERR(em)) + if (IS_ERR(em)) { + /* + * When submission failed, we should still clear the folio dirty. + * Or the folio will be written back again but without any + * ordered extent. + */ + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); + btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); return PTR_ERR(em); + } extent_offset = filepos - em->start; em_end = btrfs_extent_map_end(em); @@ -1609,8 +1638,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, folio_unlock(folio); return 1; } - if (ret < 0) + if (ret < 0) { + btrfs_folio_clear_dirty(fs_info, folio, start, len); + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); return ret; + } for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); @@ -1666,8 +1699,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. * - * If we hit any error, the corresponding sector will still be dirty - * thus no need to clear PAGECACHE_TAG_DIRTY. + * If we hit any error, the corresponding sector will have its dirty + * flag cleared and writeback finished, thus no need to handle the error case. */ if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); @@ -1813,6 +1846,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e xas_load(&xas); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); @@ -2569,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD, - .ractl = rac + .ractl = rac, + .last_em_start = U64_MAX, }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); @@ -2577,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac) const u64 end = start + readahead_length(rac) - 1; struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; - u64 prev_em_start = (u64)-1; lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); @@ -4331,15 +4365,18 @@ static int try_release_subpage_extent_buffer(struct folio *folio) unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; int ret; - xa_lock_irq(&fs_info->buffer_tree); + rcu_read_lock(); xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. */ spin_lock(&eb->refs_lock); + rcu_read_unlock(); + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); + rcu_read_lock(); continue; } @@ -4358,11 +4395,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ - xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); - xa_lock_irq(&fs_info->buffer_tree); + rcu_read_lock(); } - xa_unlock_irq(&fs_info->buffer_tree); + rcu_read_unlock(); /* * Finally to check if we have cleared folio private, as if we have @@ -4375,7 +4411,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio) ret = 0; spin_unlock(&folio->mapping->i_private_lock); return ret; - } int try_release_extent_buffer(struct folio *folio) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 57f52585a6dd..9a5a497edc97 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) return; - queue_work(system_unbound_wq, &fs_info->em_shrinker_work); + queue_work(system_dfl_wq, &fs_info->em_shrinker_work); } void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b77dd22b8cdb..27942d0ad9de 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -401,10 +401,12 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, while (index <= end_index) { folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); - index++; - if (IS_ERR(folio)) + if (IS_ERR(folio)) { + index++; continue; + } + index = folio_end(folio) >> PAGE_SHIFT; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle @@ -2013,7 +2015,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio * cleaered by the caller. */ if (ret < 0) - btrfs_cleanup_ordered_extents(inode, file_pos, end); + btrfs_cleanup_ordered_extents(inode, file_pos, len); return ret; } @@ -3883,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(inode); - if (ret) - goto out; - ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3918,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); @@ -3951,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_set_inode_mapping_order(inode); cache_index: + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any @@ -4187,6 +4188,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, return ret; } +static void update_time_after_link_or_unlink(struct btrfs_inode *dir) +{ + struct timespec64 now; + + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags)) + return; + + now = inode_set_ctime_current(&dir->vfs_inode); + inode_set_mtime_to_ts(&dir->vfs_inode, now); +} + /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and @@ -4287,7 +4305,7 @@ skip_backref: inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + update_time_after_link_or_unlink(dir); return btrfs_update_inode(trans, dir); } @@ -4538,7 +4556,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; @@ -5677,7 +5695,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); @@ -6681,15 +6709,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); - /* - * If we are replaying a log tree, we do not want to update the mtime - * and ctime of the parent directory with the current time, since the - * log replay procedure is responsible for setting them to their correct - * values (the ones it had when the fsync was done). - */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) - inode_set_mtime_to_ts(&parent_inode->vfs_inode, - inode_set_ctime_current(&parent_inode->vfs_inode)); + update_time_after_link_or_unlink(parent_inode); ret = btrfs_update_inode(trans, parent_inode); if (ret) @@ -6794,7 +6814,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct fscrypt_name fname; u64 index; int ret; - int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) @@ -6826,44 +6845,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, /* There are several dir indexes for this inode, clear the cache. */ BTRFS_I(inode)->dir_index = 0ULL; - inc_nlink(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); - ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); + if (ret) + goto fail; + /* Link added now we update the inode item with the new link count. */ + inc_nlink(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { - drop_inode = 1; - } else { - struct dentry *parent = dentry->d_parent; + btrfs_abort_transaction(trans, ret); + goto fail; + } - ret = btrfs_update_inode(trans, BTRFS_I(inode)); - if (ret) + if (inode->i_nlink == 1) { + /* + * If the new hard link count is 1, it's a file created with the + * open(2) O_TMPFILE flag. + */ + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); goto fail; - if (inode->i_nlink == 1) { - /* - * If new hard link count is 1, it's a file created - * with open(2) O_TMPFILE flag. - */ - ret = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (ret) - goto fail; } - d_instantiate(dentry, inode); - btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } + /* Grab reference for the new dentry passed to d_instantiate(). */ + ihold(inode); + d_instantiate(dentry, inode); + btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent); + fail: fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } btrfs_btree_balance_dirty(fs_info); return ret; } @@ -7819,6 +7838,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + /* new_delalloc_bytes and last_dir_index_offset are in a union. */ ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; @@ -7953,7 +7973,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) @@ -7961,6 +7981,9 @@ static void init_once(void *foo) struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } void __cold btrfs_destroy_cachep(void) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1a5972178b3a..da102da169fd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1453,9 +1453,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; - struct btrfs_qgroup *cur; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; + u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1463,15 +1463,16 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, goto out; qgroup_iterator_add(&qgroup_list, qgroup); - list_for_each_entry(cur, &qgroup_list, iterator) { + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes_cmpr; WARN_ON(sign < 0 && qgroup->excl < num_bytes); + WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr); qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes_cmpr; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 3871c3a6c743..9f1858b42c0e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; + extent_root = btrfs_extent_root(fs_info, 0); + /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */ + if (IS_ERR(extent_root)) { + btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling"); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - extent_root = btrfs_extent_root(fs_info, 0); eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e58151933844..7256f6748c8f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -602,6 +602,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, if (btrfs_root_id(root) == objectid) { u64 commit_root_gen; + /* + * Relocation will wait for cleaner thread, and any half-dropped + * subvolume will be fully cleaned up at mount time. + * So here we shouldn't hit a subvolume with non-zero drop_progress. + * + * If this isn't the case, error out since it can make us attempt to + * drop references for extents that were already dropped before. + */ + if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) { + struct btrfs_key cpu_key; + + btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress); + btrfs_err(fs_info, + "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)", + objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset); + ret = -EUCLEAN; + goto fail; + } + /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0481c693ac2e..c573d80550ad 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, async_work); + queue_work(system_dfl_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, + queue_work(system_dfl_wq, &fs_info->preempt_reclaim_work); } } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index c9b3821957f7..cb4f97833dc3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + + /* + * Don't clear the TOWRITE tag when starting writeback on a still-dirty + * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it, + * assume writeback is complete, and exit too early — violating sync + * ordering guarantees. + */ if (!folio_test_writeback(folio)) - folio_start_writeback(folio); + __folio_start_writeback(folio, true); + if (!folio_test_dirty(folio)) { + struct address_space *mapping = folio_mapping(folio); + XA_STATE(xas, &mapping->i_pages, folio->index); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); + } spin_unlock_irqrestore(&bfs->lock, flags); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 68e35a3700ff..b06b8f325537 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -88,6 +88,9 @@ struct btrfs_fs_context { refcount_t refs; }; +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old); + enum { Opt_acl, Opt_clear_cache, @@ -273,6 +276,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, const struct fs_parameter *param, int opt) { const char *string = param->string; + int ret; /* * Provide the same semantics as older kernels that don't use fs @@ -291,21 +295,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zlib", true)) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (btrfs_match_compress_type(string, "lzo", false)) { + } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; + ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, + &ctx->compress_level); + if (ret < 0) + goto error; + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zstd", true)) { ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -316,10 +329,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); } else { - btrfs_err(NULL, "unrecognized compression value %s", string); - return -EINVAL; + ret = -EINVAL; + goto error; } return 0; +error: + btrfs_err(NULL, "failed to parse compression option '%s'", string); + return ret; + } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) @@ -698,12 +715,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info, if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { - btrfs_info(info, "disk space caching is enabled"); btrfs_warn(info, "space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); } - if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) - btrfs_info(info, "using free-space-tree"); } return ret; @@ -980,6 +994,8 @@ static int btrfs_fill_super(struct super_block *sb, return ret; } + btrfs_emit_options(fs_info, NULL); + inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -1077,7 +1093,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); - if (info->compress_level) + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) @@ -1437,7 +1453,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info, { btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); - btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow"); btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); @@ -1459,10 +1475,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info, btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums"); btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags"); + btrfs_info_if_unset(info, old, NODATASUM, "setting datasum"); btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); - btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers"); btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 0f556f4de3f9..a997c7cc35a2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1756,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (unlikely(ptr + sizeof(iref) > end)) { + if (unlikely(ptr + sizeof(*iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", - ptr, end, sizeof(iref)); + ptr, end, sizeof(*iref)); return -EUCLEAN; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2186e87fb61b..7a63afedd01e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = key->objectid; + search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len); ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; @@ -2605,14 +2605,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, /* * Correctly adjust the reserved bytes occupied by a log tree extent buffer */ -static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { btrfs_err(fs_info, "unable to find block group for %llu", start); - return; + return -ENOENT; } spin_lock(&cache->space_info->lock); @@ -2623,27 +2623,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) spin_unlock(&cache->space_info->lock); btrfs_put_block_group(cache); + + return 0; } static int clean_log_buffer(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { - int ret; - btrfs_tree_lock(eb); btrfs_clear_buffer_dirty(trans, eb); wait_on_extent_buffer_writeback(eb); btrfs_tree_unlock(eb); - if (trans) { - ret = btrfs_pin_reserved_extent(trans, eb); - if (ret) - return ret; - } else { - unaccount_log_buffer(eb->fs_info, eb->start); - } + if (trans) + return btrfs_pin_reserved_extent(trans, eb); - return 0; + return unaccount_log_buffer(eb->fs_info, eb->start); } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, @@ -3345,6 +3340,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, return 0; } +static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + bool ret = false; + + /* + * Do this only if ->logged_trans is still 0 to prevent races with + * concurrent logging as we may see the inode not logged when + * inode_logged() is called but it gets logged after inode_logged() did + * not find it in the log tree and we end up setting ->logged_trans to a + * value less than trans->transid after the concurrent logging task has + * set it to trans->transid. As a consequence, subsequent rename, unlink + * and link operations may end up not logging new names and removing old + * names from the log. + */ + spin_lock(&inode->lock); + if (inode->logged_trans == 0) + inode->logged_trans = trans->transid - 1; + else if (inode->logged_trans == trans->transid) + ret = true; + spin_unlock(&inode->lock); + + return ret; +} + /* * Check if an inode was logged in the current transaction. This correctly deals * with the case where the inode was logged but has a logged_trans of 0, which @@ -3362,15 +3382,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; - if (inode->logged_trans == trans->transid) + /* + * Quick lockless call, since once ->logged_trans is set to the current + * transaction, we never set it to a lower value anywhere else. + */ + if (data_race(inode->logged_trans) == trans->transid) return 1; /* - * If logged_trans is not 0, then we know the inode logged was not logged - * in this transaction, so we can return false right away. + * If logged_trans is not 0 and not trans->transid, then we know the + * inode was not logged in this transaction, so we can return false + * right away. We take the lock to avoid a race caused by load/store + * tearing with a concurrent btrfs_log_inode() call or a concurrent task + * in this function further below - an update to trans->transid can be + * teared into two 32 bits updates for example, in which case we could + * see a positive value that is not trans->transid and assume the inode + * was not logged when it was. */ - if (inode->logged_trans > 0) + spin_lock(&inode->lock); + if (inode->logged_trans == trans->transid) { + spin_unlock(&inode->lock); + return 1; + } else if (inode->logged_trans > 0) { + spin_unlock(&inode->lock); return 0; + } + spin_unlock(&inode->lock); /* * If no log tree was created for this root in this transaction, then @@ -3379,10 +3416,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * transaction's ID, to avoid the search below in a future call in case * a log tree gets created after this. */ - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { - inode->logged_trans = trans->transid - 1; - return 0; - } + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return mark_inode_as_not_logged(trans, inode); /* * We have a log tree and the inode's logged_trans is 0. We can't tell @@ -3436,8 +3471,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * Set logged_trans to a value greater than 0 and less then the * current transaction to avoid doing the search in future calls. */ - inode->logged_trans = trans->transid - 1; - return 0; + return mark_inode_as_not_logged(trans, inode); } /* @@ -3445,20 +3479,9 @@ static int inode_logged(const struct btrfs_trans_handle *trans, * the current transacion's ID, to avoid future tree searches as long as * the inode is not evicted again. */ + spin_lock(&inode->lock); inode->logged_trans = trans->transid; - - /* - * If it's a directory, then we must set last_dir_index_offset to the - * maximum possible value, so that the next attempt to log the inode does - * not skip checking if dir index keys found in modified subvolume tree - * leaves have been logged before, otherwise it would result in attempts - * to insert duplicate dir index keys in the log tree. This must be done - * because last_dir_index_offset is an in-memory only field, not persisted - * in the inode item or any other on-disk structure, so its value is lost - * once the inode is evicted. - */ - if (S_ISDIR(inode->vfs_inode.i_mode)) - inode->last_dir_index_offset = (u64)-1; + spin_unlock(&inode->lock); return 1; } @@ -4050,7 +4073,7 @@ done: /* * If the inode was logged before and it was evicted, then its - * last_dir_index_offset is (u64)-1, so we don't the value of the last index + * last_dir_index_offset is 0, so we don't know the value of the last index * key offset. If that's the case, search for it and update the inode. This * is to avoid lookups in the log tree every time we try to insert a dir index * key from a leaf changed in the current transaction, and to allow us to always @@ -4066,7 +4089,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode, lockdep_assert_held(&inode->log_mutex); - if (inode->last_dir_index_offset != (u64)-1) + if (inode->last_dir_index_offset != 0) return 0; if (!ctx->logged_before) { diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b7a96a005487..4633cbcfcdb9 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations btrfs_verityops = { + .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) - + (int)offsetof(struct btrfs_inode, vfs_inode), .begin_enable_verity = btrfs_begin_enable_verity, .end_enable_verity = btrfs_end_enable_verity, .get_verity_descriptor = btrfs_get_verity_descriptor, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa7a929a0461..c6e3efd6f602 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 245e813ecd78..71cf9be75c42 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "accessors.h" #include "bio.h" #include "transaction.h" +#include "sysfs.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -42,6 +43,9 @@ /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* Default number of max active zones when the device has no limits. */ +#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128 + /* * Minimum of active zones we need: * @@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = bdev_max_active_zones(bdev); + max_active_zones = min_not_zero(bdev_max_active_zones(bdev), + bdev_max_open_zones(bdev)); + if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES) + max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES; if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", @@ -507,6 +514,11 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (max_active_zones) { if (nactive > max_active_zones) { + if (bdev_max_active_zones(bdev) == 0) { + max_active_zones = 0; + zone_info->max_active_zones = 0; + goto validate; + } btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", nactive, rcu_dereference(device->name), @@ -519,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } +validate: /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -2168,10 +2181,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - /* No space left */ - if (btrfs_zoned_bg_is_full(block_group)) { - ret = false; - goto out_unlock; + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { + /* The caller should check if the block group is full. */ + if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) { + ret = false; + goto out_unlock; + } + } else { + /* Since it is already written, it should have been active. */ + WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start); } for (i = 0; i < map->num_stripes; i++) { @@ -2230,7 +2248,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; struct extent_buffer *eb; - unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); + unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); rcu_read_lock(); xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { @@ -2245,6 +2263,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) rcu_read_unlock(); } +static int call_zone_finish(struct btrfs_block_group *block_group, + struct btrfs_io_stripe *stripe) +{ + struct btrfs_device *device = stripe->dev; + const u64 physical = stripe->physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; + int ret; + + if (!device->bdev) + return 0; + + if (zinfo->max_active_zones == 0) + return 0; + + if (btrfs_dev_is_sequential(device, physical)) { + unsigned int nofs_flags; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); + + if (ret) + return ret; + } + + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; + btrfs_dev_clear_active_zone(device, physical); + + return 0; +} + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -2329,31 +2381,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_device *device = map->stripes[i].dev; - const u64 physical = map->stripes[i].physical; - struct btrfs_zoned_device_info *zinfo = device->zone_info; - unsigned int nofs_flags; - - if (!device->bdev) - continue; - - if (zinfo->max_active_zones == 0) - continue; - - nofs_flags = memalloc_nofs_save(); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT); - memalloc_nofs_restore(nofs_flags); + ret = call_zone_finish(block_group, &map->stripes[i]); if (ret) { up_read(&dev_replace->rwsem); return ret; } - - if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) - zinfo->reserved_active_zones++; - btrfs_dev_clear_active_zone(device, physical); } up_read(&dev_replace->rwsem); @@ -2488,7 +2521,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); - queue_work(system_unbound_wq, &bg->zone_finish_work); + queue_work(system_dfl_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2504,12 +2537,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - struct btrfs_space_info *space_info = data_sinfo->sub_group[0]; + struct btrfs_space_info *space_info = data_sinfo; struct btrfs_trans_handle *trans; struct btrfs_block_group *bg; struct list_head *bg_list; u64 alloc_flags; - bool initial = false; + bool first = true; bool did_chunk_alloc = false; int index; int ret; @@ -2523,21 +2556,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) if (sb_rdonly(fs_info->sb)) return; - ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); index = btrfs_bg_flags_to_raid_index(alloc_flags); - bg_list = &data_sinfo->block_groups[index]; + /* Scan the data space_info to find empty block groups. Take the second one. */ again: + bg_list = &space_info->block_groups[index]; list_for_each_entry(bg, bg_list, list) { - if (bg->used > 0) + if (bg->alloc_offset != 0) continue; - if (!initial) { - initial = true; + if (first) { + first = false; continue; } + if (space_info == data_sinfo) { + /* Migrate the block group to the data relocation space_info. */ + struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0]; + int factor; + + ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + factor = btrfs_bg_type_to_factor(bg->flags); + + down_write(&space_info->groups_sem); + list_del_init(&bg->list); + /* We can assume this as we choose the second empty one. */ + ASSERT(!list_empty(&space_info->block_groups[index])); + up_write(&space_info->groups_sem); + + spin_lock(&space_info->lock); + space_info->total_bytes -= bg->length; + space_info->disk_total -= bg->length * factor; + space_info->disk_total -= bg->zone_unusable; + /* There is no allocation ever happened. */ + ASSERT(bg->used == 0); + /* No super block in a block group on the zoned setup. */ + ASSERT(bg->bytes_super == 0); + spin_unlock(&space_info->lock); + + bg->space_info = reloc_sinfo; + if (reloc_sinfo->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(bg); + + btrfs_add_bg_to_space_info(fs_info, bg); + } + fs_info->data_reloc_bg = bg->start; set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags); btrfs_zone_activate(bg); @@ -2552,11 +2616,18 @@ again: if (IS_ERR(trans)) return; + /* Allocate new BG in the data relocation space_info. */ + space_info = data_sinfo->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret == 1) { + /* + * We allocated a new block group in the data relocation space_info. We + * can take that one. + */ + first = false; did_chunk_alloc = true; - bg_list = &space_info->block_groups[index]; goto again; } } @@ -2650,7 +2721,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || - (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) || test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; diff --git a/fs/buffer.c b/fs/buffer.c index ead4dc85debd..6a8752f7bbed 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -157,8 +157,8 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { - __end_buffer_read_notouch(bh, uptodate); put_bh(bh); + __end_buffer_read_notouch(bh, uptodate); } EXPORT_SYMBOL(end_buffer_read_sync); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b202d789e93..322ed268f14a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, 0, gfp_flags); if (IS_ERR(pages[index])) { - if (PTR_ERR(pages[index]) == -EINVAL) { + int err = PTR_ERR(pages[index]); + + if (err == -EINVAL) { pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); } @@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, BUG_ON(ceph_wbc->locked_pages == 0); pages[index] = NULL; - return PTR_ERR(pages[index]); + return err; } } else { pages[index] = &folio->page; @@ -1687,6 +1689,7 @@ get_more_pages: process_folio_batch: rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); if (rc) goto release_folios; @@ -1695,8 +1698,6 @@ process_folio_batch: goto release_folios; if (ceph_wbc.processed_in_fbatch) { - ceph_shift_unused_folios_left(&ceph_wbc.fbatch); - if (folio_batch_count(&ceph_wbc.fbatch) == 0 && ceph_wbc.locked_pages < ceph_wbc.max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index cab722619207..7026e794813c 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdd404fc8112..f3fe786b4143 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen = 0; - u64 pathbase; char *path; mutex_lock(&mdsc->mutex); @@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); @@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p) } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); @@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8478e7e75df6..32973c62c1a2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* If op failed, mark everyone involved for errors */ if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); /* mark error on parent + clear complete */ mapping_set_error(req->r_parent->i_mapping, result); @@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_old_inode->i_mapping, result); pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path_info(&path_info); } out: iput(req->r_old_inode); @@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) int err = -EROFS; int op; char *path; - int pathlen; - u64 pathbase; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ @@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c02f100f8552..978acd3d4b32 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) int flags, fmode, wanted; struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; int mask = MAY_READ; @@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file) if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ @@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_parent->i_mapping, result); if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); pr_warn_client(cl, "async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path_info(&path_info); ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) @@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, int mask; int err; char *path; - int pathlen; - u64 pathbase; doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", dir, ceph_vinop(dir), dentry, dentry, @@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; @@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= MAY_WRITE; err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827..949f0badc944 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + /** * ceph_new_inode - allocate a new inode in advance of an expected create * @dir: parent directory for new inode @@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); #ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; ci->fscrypt_auth = NULL; ci->fscrypt_auth_len = 0; #endif @@ -1523,6 +1570,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; doutc(cl, "%p is_dentry %d is_target %d\n", req, @@ -1536,10 +1584,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_parent; - - if (dir) { - err = ceph_fill_inode(dir, NULL, &rinfo->diri, + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); if (err < 0) @@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; struct fscrypt_str oname = FSTR_INIT(NULL, 0); - struct ceph_fname fname = { .dir = dir, + struct ceph_fname fname = { .dir = parent_dir, .name = rinfo->dname, .ctext = rinfo->altname, .name_len = rinfo->dname_len, @@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); - parent = d_find_any_alias(dir); + parent = d_find_any_alias(parent_dir); BUG_ON(!parent); - err = ceph_fname_alloc_buffer(dir, &oname); + err = ceph_fname_alloc_buffer(parent_dir, &oname); if (err < 0) { dput(parent); goto done; @@ -1576,7 +1631,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); if (err < 0) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); goto done; } dname.name = oname.name; @@ -1595,7 +1650,7 @@ retry_lookup: dname.len, dname.name, dn); if (!dn) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); err = -ENOMEM; goto done; } @@ -1610,12 +1665,12 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { doutc(cl, " dn %p points to wrong inode %p\n", dn, d_inode(dn)); - ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(parent_dir); d_delete(dn); dput(dn); goto retry_lookup; } - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); req->r_dentry = dn; dput(parent); @@ -1794,6 +1849,9 @@ retry_lookup: &dvino, ptvino); } done: + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) + iput(parent_dir); doutc(cl, "done err=%d\n", err); return err; } @@ -2487,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, int truncate_retry = 20; /* The RMW will take around 50ms */ struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; dentry = d_find_alias(inode); if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff82..73da2648fa0f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", @@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built - * @plen: returned length of string - * @pbase: returned base inode number + * @path_info: output path, length, base ino+snap, and freepath ownership flag * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. This is mostly called @@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - int *plen, u64 *pbase, int for_wire) + struct ceph_path_info *path_info, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; @@ -2810,16 +2809,28 @@ retry: return ERR_PTR(-ENAMETOOLONG); } - *pbase = base; - *plen = PATH_MAX - 1 - pos; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + + path_info->vino.ino = base; + path_info->pathlen = PATH_MAX - 1 - pos; + path_info->path = path + pos; + path_info->freepath = true; + + /* Set snap from dentry if available */ + if (d_inode(dentry)) + path_info->vino.snap = ceph_snap(d_inode(dentry)); + else + path_info->vino.snap = CEPH_NOSNAP; + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), - base, *plen, path + pos); + base, PATH_MAX - 1 - pos, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - struct inode *dir, const char **ppath, int *ppathlen, - u64 *pino, bool *pfreepath, bool parent_locked) + struct inode *dir, struct ceph_path_info *path_info, + bool parent_locked) { char *path; @@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { - *pino = ceph_ino(dir); + path_info->vino.ino = ceph_ino(dir); + path_info->vino.snap = ceph_snap(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + path_info->path = dentry->d_name.name; + path_info->pathlen = dentry->d_name.len; + path_info->freepath = false; return 0; } rcu_read_unlock(); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap handling. + */ return 0; } -static int build_inode_path(struct inode *inode, - const char **ppath, int *ppathlen, u64 *pino, - bool *pfreepath) +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { - *pino = ceph_ino(inode); - *ppathlen = 0; + path_info->vino.ino = ceph_ino(inode); + path_info->vino.snap = ceph_snap(inode); + path_info->pathlen = 0; + path_info->freepath = false; return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap from dentry. + * Override with inode's snap since that's what this function is for. + */ + path_info->vino.snap = ceph_snap(inode); return 0; } @@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode, */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, - const char *rpath, u64 rino, const char **ppath, - int *pathlen, u64 *ino, bool *freepath, + const char *rpath, u64 rino, + struct ceph_path_info *path_info, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + if (rinode) { - r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + r = build_inode_path(rinode, path_info); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, - freepath, parent_locked); - doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino, + path_info->pathlen, path_info->path); } else if (rpath || rino) { - *ino = rino; - *ppath = rpath; - *pathlen = rpath ? strlen(rpath) : 0; - doutc(cl, " path %.*s\n", *pathlen, rpath); + path_info->vino.ino = rino; + path_info->vino.snap = CEPH_NOSNAP; + path_info->path = rpath; + path_info->pathlen = rpath ? strlen(rpath) : 0; + path_info->freepath = false; + + doutc(cl, " path %.*s\n", path_info->pathlen, rpath); } return r; @@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; - const char *path1 = NULL; - const char *path2 = NULL; - u64 ino1 = 0, ino2 = 0; - int pathlen1 = 0, pathlen2 = 0; - bool freepath1 = false, freepath2 = false; + struct ceph_path_info path_info1 = {0}; + struct ceph_path_info path_info2 = {0}; struct dentry *old_dentry = NULL; int len; u16 releases; @@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, - req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1, - test_bit(CEPH_MDS_R_PARENT_LOCKED, - &req->r_req_flags)); + req->r_parent, req->r_path1, req->r_ino1.ino, + &path_info1, parent_locked); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* + * When the parent directory's i_rwsem is *not* locked, req->r_parent may + * have become stale (e.g. after a concurrent rename) between the time the + * dentry was looked up and now. If we detect that the stored r_parent + * does not match the inode number we just encoded for the request, switch + * to the correct inode so that the MDS receives a valid parent reference. + */ + if (!parent_locked && req->r_parent && path_info1.vino.ino && + ceph_ino(req->r_parent) != path_info1.vino.ino) { + struct inode *old_parent = req->r_parent; + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL); + if (!IS_ERR(correct_dir)) { + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n", + ceph_ino(old_parent), path_info1.vino.ino); + /* + * Transfer CEPH_CAP_PIN from the old parent to the new one. + * The pin was taken earlier in ceph_mdsc_submit_request(). + */ + ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN); + iput(old_parent); + req->r_parent = correct_dir; + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + } + } + /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, - req->r_old_dentry_dir, - req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2, true); + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path_info2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; @@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); - len += pathlen1 + pathlen2; + len += path_info1.pathlen + path_info2.pathlen; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += pathlen1; + len += path_info1.pathlen; if (req->r_old_dentry_drop) - len += pathlen2; + len += path_info2.pathlen; /* MClientRequest tail */ @@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; - ceph_encode_filepath(&p, end, ino1, path1); - ceph_encode_filepath(&p, end, ino2, path2); + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path); + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; @@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.data_off = cpu_to_le16(0); out_free2: - if (freepath2) - ceph_mdsc_free_path((char *)path2, pathlen2); + ceph_mdsc_free_path_info(&path_info2); out_free1: - if (freepath1) - ceph_mdsc_free_path((char *)path1, pathlen1); + ceph_mdsc_free_path_info(&path_info1); out: return msg; out_err: @@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; - char *path; - int pathlen = 0, err; - u64 pathbase; + struct ceph_path_info path_info = {0}; + int err; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } - } else { - path = NULL; - pathbase = 0; } spin_lock(&ci->i_ceph_lock); @@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = cpu_to_le64(pathbase); + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino); } if (list_empty(&ci->i_cap_snaps)) { @@ -4706,7 +4744,7 @@ encode_again: sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -4730,7 +4768,7 @@ encode_again: ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -4741,17 +4779,17 @@ out_freeflocks: } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + - pathlen + sizeof(rec.v1)); + path_info.pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); if (!err) recon_state->nr_caps++; return err; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3e2a6fa7c19a..0428a5eaf28c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); -static inline void ceph_mdsc_free_path(char *path, int len) +/* + * Structure to group path-related output parameters for build_*_path functions + */ +struct ceph_path_info { + const char *path; + int pathlen; + struct ceph_vino vino; + bool freepath; +}; + +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) { - if (!IS_ERR_OR_NULL(path)) - __putname(path - (PATH_MAX - 1 - len)); + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, - struct dentry *dentry, int *plen, u64 *base, + struct dentry *dentry, struct ceph_path_info *path_info, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c5..db6c2db68f96 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); if (!fsc->inode_wq) goto fail_client; - fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1); if (!fsc->cap_wq) goto fail_inode_wq; @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf176aab0f82..25d8bacbcf44 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -463,6 +463,7 @@ struct ceph_inode_info { unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d..456c4a2efb53 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/coredump.c b/fs/coredump.c index fedbead956ed..0d9a5d07a75d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -345,7 +345,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, was_space = false; err = cn_printf(cn, "%c", '\0'); if (err) - return err; + return false; (*argv)[(*argc)++] = cn->used; } } @@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) /* * Usermode helpers are childen of either - * system_unbound_wq or of kthreadd. So we know that + * system_dfl_wq or of kthreadd. So we know that * we're starting off with a clean file descriptor * table. So we should always be able to use * COREDUMP_PIDFD_NUMBER as our file descriptor value. @@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; + if (write) + return proc_dostring(table, write, buffer, lenp, ppos); + retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); error = proc_dostring(table, write, buffer, lenp, ppos); if (error) return error; + if (!check_coredump_socket()) { strscpy(core_pattern, old_core_pattern, retval + 1); return -EINVAL; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f9..12daa85ed941 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; - default: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); + break; + default: + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13..0d746de4cd10 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fde..07f9cbfe3ea4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f..fb77ad1ca74a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -94,7 +94,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +138,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +274,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +544,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c..245e6b84aa17 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f7..7a5d4c168c49 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,7 +199,7 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index caaff809765b..5dee7c498bc8 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e..c1f85715c276 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -642,15 +642,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. + * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +682,13 @@ out: * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. + * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +742,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +753,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +801,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. + */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c06..9d51f3500de3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); @@ -1743,6 +1743,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t done = 0; int ret; + if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC)) + return -EIO; + if (!iomi.len) return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..65cc11939654 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { - unsigned n = dir->i_dir_seq; - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + unsigned n = READ_ONCE(dir->i_dir_seq); + if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } @@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) write_sequnlock(&rename_lock); } +EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a0357b0cf362..c12d649df6a5 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc) struct debugfs_fs_info *sb_opts = sb->s_fs_info; struct debugfs_fs_info *new_opts = fc->s_fs_info; + if (!new_opts) + return 0; + sync_filesystem(sb); /* structure copy of new mount options to sb */ @@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) static int debugfs_get_tree(struct fs_context *fc) { + int err; + if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return -EPERM; - return get_tree_single(fc, debugfs_fill_super); + err = get_tree_single(fc, debugfs_fill_super); + if (err) + return err; + + return debugfs_reconfigure(fc); } static void debugfs_free_fc(struct fs_context *fc) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index e4373bce1bc2..9a0b6c2b6b01 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1703,7 +1703,7 @@ static int work_start(void) return -ENOMEM; } - process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0); + process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0); if (!process_workqueue) { log_print("can't start dlm_process"); destroy_workqueue(io_workqueue); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 4887c8a05318..a44d16da7187 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -52,7 +52,7 @@ static int __init init_dlm(void) if (error) goto out_user; - dlm_wq = alloc_workqueue("dlm_wq", 0, 0); + dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0); if (!dlm_wq) { error = -ENOMEM; goto out_plock; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index c4a139911356..1f4d8ce56667 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, @@ -152,6 +152,10 @@ static int efivarfs_d_compare(const struct dentry *dentry, { int guid = len - EFI_VARIABLE_GUID_LEN; + /* Parallel lookups may produce a temporary invalid filename */ + if (guid <= 0) + return 1; + if (name->len != len) return 1; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 7b26efc271ee..d81f3318417d 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,8 +3,18 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select CACHEFILES if EROFS_FS_ONDEMAND select CRC32 + select CRYPTO if EROFS_FS_ZIP_ACCEL + select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL select FS_IOMAP + select LZ4_DECOMPRESS if EROFS_FS_ZIP + select NETFS_SUPPORT if EROFS_FS_ONDEMAND + select XXHASH if EROFS_FS_XATTR + select XZ_DEC if EROFS_FS_ZIP_LZMA + select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA + select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE + select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD help EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (e.g. no buffer heads, inline @@ -38,7 +48,6 @@ config EROFS_FS_DEBUG config EROFS_FS_XATTR bool "EROFS extended attributes" depends on EROFS_FS - select XXHASH default y help Extended attributes are name:value pairs associated with inodes by @@ -94,7 +103,6 @@ config EROFS_FS_BACKED_BY_FILE config EROFS_FS_ZIP bool "EROFS Data Compression Support" depends on EROFS_FS - select LZ4_DECOMPRESS default y help Enable transparent compression support for EROFS file systems. @@ -104,8 +112,6 @@ config EROFS_FS_ZIP config EROFS_FS_ZIP_LZMA bool "EROFS LZMA compressed data support" depends on EROFS_FS_ZIP - select XZ_DEC - select XZ_DEC_MICROLZMA help Saying Y here includes support for reading EROFS file systems containing LZMA compressed data, specifically called microLZMA. It @@ -117,7 +123,6 @@ config EROFS_FS_ZIP_LZMA config EROFS_FS_ZIP_DEFLATE bool "EROFS DEFLATE compressed data support" depends on EROFS_FS_ZIP - select ZLIB_INFLATE help Saying Y here includes support for reading EROFS file systems containing DEFLATE compressed data. It gives better compression @@ -132,7 +137,6 @@ config EROFS_FS_ZIP_DEFLATE config EROFS_FS_ZIP_ZSTD bool "EROFS Zstandard compressed data support" depends on EROFS_FS_ZIP - select ZSTD_DECOMPRESS help Saying Y here includes support for reading EROFS file systems containing Zstandard compressed data. It gives better compression @@ -147,8 +151,6 @@ config EROFS_FS_ZIP_ZSTD config EROFS_FS_ZIP_ACCEL bool "EROFS hardware decompression support" depends on EROFS_FS_ZIP - select CRYPTO - select CRYPTO_DEFLATE help Saying Y here includes hardware accelerator support for reading EROFS file systems containing compressed data. It gives better @@ -163,9 +165,7 @@ config EROFS_FS_ZIP_ACCEL config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support (deprecated)" depends on EROFS_FS - select NETFS_SUPPORT select FSCACHE - select CACHEFILES select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 377ee12b8b96..3d5738f80072 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -12,10 +12,12 @@ /* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 -#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 -#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 -#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 #define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 +#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4ccc5f0ee8df..9319c66e86c3 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -234,6 +234,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) +EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) { diff --git a/fs/erofs/super.c b/fs/erofs/super.c index e1020aa60771..db13b40a78e0 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -174,6 +174,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); + if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) { + erofs_info(sb, "DAX unsupported by %s. Turning off DAX.", + dif->path); + clear_opt(&sbi->opt, DAX_ALWAYS); + } } else if (!S_ISREG(file_inode(file)->i_mode)) { fput(file); return -EINVAL; @@ -210,8 +215,13 @@ static int erofs_scan_devices(struct super_block *sb, ondisk_extradevs, sbi->devs->extra_devices); return -EINVAL; } - if (!ondisk_extradevs) + if (!ondisk_extradevs) { + if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) { + erofs_info(sb, "DAX unsupported by block device. Turning off DAX."); + clear_opt(&sbi->opt, DAX_ALWAYS); + } return 0; + } if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) sbi->devs->flatdev = true; @@ -313,8 +323,8 @@ static int erofs_read_superblock(struct super_block *sb) sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); - sbi->dif0.blocks = (sbi->dif0.blocks << 32) | - le16_to_cpu(dsb->rb.blocks_hi); + sbi->dif0.blocks = sbi->dif0.blocks | + ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32); } else { sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); } @@ -338,7 +348,6 @@ static int erofs_read_superblock(struct super_block *sb) if (ret < 0) goto out; - /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); if (erofs_sb_has_48bit(sbi)) @@ -671,14 +680,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return invalfc(fc, "cannot use fsoffset in fscache mode"); } - if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dif0.dax_dev) { - errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(&sbi->opt, DAX_ALWAYS); - } else if (sbi->blkszbits != PAGE_SHIFT) { - errorfc(fc, "unsupported blocksize for DAX"); - clear_opt(&sbi->opt, DAX_ALWAYS); - } + if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) { + erofs_info(sb, "unsupported blocksize for DAX"); + clear_opt(&sbi->opt, DAX_ALWAYS); } sb->s_time_gran = 1; @@ -1014,10 +1018,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void erofs_evict_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + dax_break_layout_final(inode); +#endif + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, + .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index eaa9efd766ee..396536d9a862 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; struct erofs_xattr_prefix_item *pfs; int ret = 0, i, len; + bool plain = erofs_sb_has_plain_xattr_pfx(sbi); if (!sbi->xattr_prefix_count) return 0; @@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb) if (!pfs) return -ENOMEM; - if (sbi->packed_inode) - buf.mapping = sbi->packed_inode->i_mapping; - else + if (!plain) { + if (erofs_sb_has_metabox(sbi)) + (void)erofs_init_metabuf(&buf, sb, true); + else if (sbi->packed_inode) + buf.mapping = sbi->packed_inode->i_mapping; + else + plain = true; + } + if (plain) (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 792f20888a8f..2d73297003d2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1432,6 +1432,16 @@ static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) } #endif +/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */ +static inline bool z_erofs_in_atomic(void) +{ + if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth()) + return true; + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return true; + return !preemptible(); +} + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, int bios) { @@ -1446,8 +1456,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use (kthread_)work and sync decompression for atomic contexts only */ - if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { + if (z_erofs_in_atomic()) { #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a93efd95c555..798223e6da9c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode, .map = map, .in_mbox = erofs_inode_in_metabox(inode), }; - int err = 0; - unsigned int endoff, afmt; + unsigned int endoff; unsigned long initial_lcn; unsigned long long ofs, end; + int err; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && @@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? - Z_EROFS_COMPRESSION_INTERLACED : - Z_EROFS_COMPRESSION_SHIFTED; + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + map->m_algorithmformat = vi->z_algorithmtype[1]; } else { - afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? - vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; - if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", - afmt, vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; - } + map->m_algorithmformat = vi->z_algorithmtype[0]; } - map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; struct z_erofs_map_header *h; + erofs_off_t pos; + int err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* @@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; - err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; @@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) vi->z_idata_size = le16_to_cpu(h->h_idata_size); - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_unlock; - } - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -726,6 +711,30 @@ out_unlock: return err; } +static int z_erofs_map_sanity_check(struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + if (!(map->m_flags & EROFS_MAP_ENCODED)) + return 0; + if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) { + erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel", + map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid); + return -EOPNOTSUPP; + } + if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX && + !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + map->m_algorithmformat, EROFS_I(inode)->nid); + return -EFSCORRUPTED; + } + if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + return -EOPNOTSUPP; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { @@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, else err = z_erofs_map_blocks_fo(inode, map, flags); } - if (!err && (map->m_flags & EROFS_MAP_ENCODED) && - unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || - map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) - err = -EOPNOTSUPP; + if (!err) + err = z_erofs_map_sanity_check(inode, map); if (err) map->m_llen = 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b22d6f819f78..ee7c4b683ec3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -46,10 +46,10 @@ * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (rwlock) + * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a rwlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -195,7 +195,7 @@ struct eventpoll { struct list_head rdllist; /* Lock which protects rdllist and ovflist */ - rwlock_t lock; + spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; @@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) * in a lockless way. */ lockdep_assert_irqs_enabled(); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, @@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep, { struct epitem *epi, *nepi; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep, wake_up(&ep->wq); } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) @@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) rb_erase_cached(&epi->rbn, &ep->rbr); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep) return -ENOMEM; mutex_init(&ep->mtx); - rwlock_init(&ep->lock); + spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, #endif /* CONFIG_KCMP */ /* - * Adds a new entry to the tail of the list in a lockless way, i.e. - * multiple CPUs are allowed to call this function concurrently. - * - * Beware: it is necessary to prevent any other modifications of the - * existing list until all changes are completed, in other words - * concurrent list_add_tail_lockless() calls should be protected - * with a read lock, where write lock acts as a barrier which - * makes sure all list_add_tail_lockless() calls are fully - * completed. - * - * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail or to the head, otherwise - * concurrent access will corrupt the list. - * - * Return: %false if element has been already added to the list, %true - * otherwise. - */ -static inline bool list_add_tail_lockless(struct list_head *new, - struct list_head *head) -{ - struct list_head *prev; - - /* - * This is simple 'new->next = head' operation, but cmpxchg() - * is used in order to detect that same element has been just - * added to the list from another CPU: the winner observes - * new->next == new. - */ - if (!try_cmpxchg(&new->next, &new, head)) - return false; - - /* - * Initially ->next of a new element must be updated with the head - * (we are inserting to the tail) and only then pointers are atomically - * exchanged. XCHG guarantees memory ordering, thus ->next should be - * updated before pointers are actually swapped and pointers are - * swapped before prev->next is updated. - */ - - prev = xchg(&head->prev, new); - - /* - * It is safe to modify prev->next and new->prev, because a new element - * is added only to the tail and new->next is updated before XCHG. - */ - - prev->next = new; - new->prev = prev; - - return true; -} - -/* - * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, - * i.e. multiple CPUs are allowed to call this function concurrently. - * - * Return: %false if epi element has been already chained, %true otherwise. - */ -static inline bool chain_epi_lockless(struct epitem *epi) -{ - struct eventpoll *ep = epi->ep; - - /* Fast preliminary check */ - if (epi->next != EP_UNACTIVE_PTR) - return false; - - /* Check that the same epi has not been just chained from another CPU */ - if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) - return false; - - /* Atomically exchange tail */ - epi->next = xchg(&ep->ovflist, epi); - - return true; -} - -/* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. - * - * This callback takes a read lock in order not to contend with concurrent - * events from another file descriptor, thus all modifications to ->rdllist - * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_start/done_scan(), which stops all list modifications and guarantees - * that lists state is seen correctly. - * - * Another thing worth to mention is that ep_poll_callback() can be called - * concurrently for the same @epi from different CPUs if poll table was inited - * with several wait queues entries. Plural wakeup from different CPUs of a - * single wait queue is serialized by wq.lock, but the case when multiple wait - * queues are used should be detected accordingly. This is detected using - * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v unsigned long flags; int ewake = 0; - read_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (chain_epi_lockless(epi)) + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); + } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ - if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) - ep_pm_stay_awake_rcu(epi); + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake_rcu(epi); } /* @@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - read_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, } /* We have to drop the new item inside our item list to keep track of it */ - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) @@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, init_wait(&wait); wait.func = ep_autoremove_wake_function; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it @@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || @@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = 1; if (!list_empty_careful(&wait.entry)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its @@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } } } diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..e861a4b7ffda 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) + if (!error && !write) validate_coredump_safety(); return error; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 0a056d97e640..cf0a0970c095 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb) } const struct fscrypt_operations ext4_cryptops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01a6e2de7fc3..6cb784a56b3b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1182,6 +1182,14 @@ struct ext4_inode_info { __u32 i_csum_seed; kprojid_t i_projid; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif }; /* diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 383c6edea6dd..91185c40f755 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + + /* + * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases, + * check for that. + */ + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) @@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; + struct ext4_fsmap irec; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb, goto err; } - /* Report any gaps at the end of the bg */ + /* + * The dummy record below will cause ext4_getfsmap_helper() to report + * any allocated blocks at the end of the range. + */ + irec.fmr_device = 0; + irec.fmr_physical = end_fsb + 1; + irec.fmr_length = 0; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, - 0, info); + error = ext4_getfsmap_helper(sb, info, &irec); if (error) goto err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b29..ba4fd9aba1c1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 7de327fa7b1c..d45124318200 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int indirect_blks; int blocks_to_boundary = 0; int depth; - int count = 0; + u64 count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); @@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, count++; /* Fill in size of a hole we found */ map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, count); + map->m_len = umin(map->m_len, count); goto cleanup; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ed54c4d0f2f9..5b7a15db4953 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -146,7 +146,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, */ int ext4_inode_is_fast_symlink(struct inode *inode) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + if (!ext4_has_feature_ea_inode(inode->i_sb)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; @@ -3155,7 +3155,7 @@ retry: folio_unlock(folio); folio_put(folio); /* - * block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5898d92ba19f..8b18802e83eb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) - queue_work(system_unbound_wq, &sbi->s_discard_work); + queue_work(system_dfl_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d83f91b62317..2cd36f59c9e3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2965,7 +2965,6 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; - struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; int err; @@ -2982,10 +2981,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); - de = (struct ext4_dir_entry_2 *)dir_block->b_data; err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); - if (err) - goto out; out: brelse(dir_block); return err; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 7c7f792ad6ab..524d4658fa40 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -589,8 +589,9 @@ int ext4_init_orphan_info(struct super_block *sb) } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; - oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), - GFP_KERNEL); + oi->of_binfo = kmalloc_array(oi->of_blocks, + sizeof(struct ext4_orphan_block), + GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3d8b0f6d2dea..39abfeec5f36 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -547,7 +547,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, * first page of the bio. Otherwise it can deadlock. */ if (io->io_bio) - gfp_flags = GFP_NOWAIT | __GFP_NOWARN; + gfp_flags = GFP_NOWAIT; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733..7f2d4014d128 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -268,7 +268,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, - sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); + sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -1470,6 +1470,12 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } static int __init init_inodecache(void) @@ -1998,6 +2004,9 @@ int ext4_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &ext4_context_ops; + /* i_version is always enabled now */ + fc->sb_flags |= SB_I_VERSION; + return 0; } @@ -2975,6 +2984,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (nodefs && sb->s_flags & SB_I_VERSION) + SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & @@ -5314,9 +5325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); - /* i_version is always enabled now */ - sb->s_flags |= SB_I_VERSION; - /* HSM events are allowed by default. */ sb->s_iflags |= SB_I_ALLOW_HSM; @@ -5414,6 +5422,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; + if (bdev_read_only(sb->s_bdev)) + needs_recovery = 0; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce97..b0acb0c50313 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations ext4_verityops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .begin_enable_verity = ext4_begin_enable_verity, .end_enable_verity = ext4_end_enable_verity, .get_verity_descriptor = ext4_get_verity_descriptor, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46be7560548c..6e465bbc85ee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -907,6 +907,12 @@ struct f2fs_inode_info { unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ +#endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; /* filesystem verity info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..2619cbbd7d2d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -480,6 +480,12 @@ static void init_once(void *foo) struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + fi->i_verity_info = NULL; +#endif } #ifdef CONFIG_QUOTA @@ -1744,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode) if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); /* should remain fi->extent_tree for writepage */ @@ -1763,12 +1769,12 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - atomic_dec(&inode->i_count); + iput(inode); } trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); @@ -3570,6 +3576,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, @@ -3581,7 +3589,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .has_stable_inodes = f2fs_has_stable_inodes, .get_devices = f2fs_get_devices, }; -#endif +#endif /* CONFIG_FS_ENCRYPTION */ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 2287f238ae09..f0ab9a3c7a82 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations f2fs_verityops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .begin_enable_verity = f2fs_begin_enable_verity, .end_enable_verity = f2fs_end_enable_verity, .get_verity_descriptor = f2fs_get_verity_descriptor, diff --git a/fs/fcntl.c b/fs/fcntl.c index 5598e4d57422..72f8433d9109 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint) } } -static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, return 0; } -static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_set_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: - err = fcntl_get_rw_hint(filp, cmd, arg); + err = fcntl_get_rw_hint(filp, arg); break; case F_SET_RW_HINT: - err = fcntl_set_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, arg); break; default: break; diff --git a/fs/fhandle.c b/fs/fhandle.c index 7c236f64cdea..052f9c9368fb 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/compat.h> +#include <linux/nsfs.h> #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } @@ -208,6 +214,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry) return 1; /* + * Verify that the decoded dentry itself has a valid id mapping. + * In case the decoded dentry is the mountfd root itself, this + * verifies that the mountfd inode itself has a valid id mapping. + */ + if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry))) + return 0; + + /* * It's racy as we're not taking rename_lock but we're able to ignore * permissions and we just need an approximation whether we were able * to follow a path to the file. @@ -402,7 +416,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, if (retval) return retval; - CLASS(get_unused_fd, fd)(O_CLOEXEC); + CLASS(get_unused_fd, fd)(open_flag); if (fd < 0) return fd; diff --git a/fs/file.c b/fs/file.c index 6d2275c3be9c..28743b742e3c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 52129267e3bd..2b35e80037fe 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1162,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, dirty = dirty * 10 / 8; /* issue the writeback work */ - work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + work = kzalloc(sizeof(*work), GFP_NOWAIT); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; @@ -1219,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb) static int __init cgroup_writeback_init(void) { - isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0); if (!isw_wq) return -ENOMEM; return 0; @@ -1806,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); @@ -2481,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) - mod_delayed_work(system_wq, &dirtytime_work, 0); + mod_delayed_work(system_percpu_wq, &dirtytime_work, 0); return ret; } @@ -2647,10 +2647,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); - spin_unlock(&wb->list_lock); - spin_unlock(&inode->i_lock); - trace_writeback_dirty_inode_enqueue(inode); - /* * If this is the first dirty inode for this bdi, * we have to wake-up the corresponding bdi thread @@ -2660,6 +2656,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); + + spin_unlock(&wb->list_lock); + spin_unlock(&inode->i_lock); + trace_writeback_dirty_inode_enqueue(inode); + return; } } diff --git a/fs/fsopen.c b/fs/fsopen.c index 1aaf4cb2afb2..f645c99204eb 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -18,50 +18,56 @@ #include "internal.h" #include "mount.h" +static inline const char *fetch_message_locked(struct fc_log *log, size_t len, + bool *need_free) +{ + const char *p; + int index; + + if (unlikely(log->head == log->tail)) + return ERR_PTR(-ENODATA); + + index = log->tail & (ARRAY_SIZE(log->buffer) - 1); + p = log->buffer[index]; + if (unlikely(strlen(p) > len)) + return ERR_PTR(-EMSGSIZE); + + log->buffer[index] = NULL; + *need_free = log->need_free & (1 << index); + log->need_free &= ~(1 << index); + log->tail++; + + return p; +} + /* * Allow the user to read back any error, warning or informational messages. + * Only one message is returned for each read(2) call. */ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; - struct fc_log *log = fc->log.log; - unsigned int logsize = ARRAY_SIZE(log->buffer); - ssize_t ret; - char *p; + ssize_t err; + const char *p __free(kfree) = NULL, *message; bool need_free; - int index, n; + int n; - ret = mutex_lock_interruptible(&fc->uapi_mutex); - if (ret < 0) - return ret; - - if (log->head == log->tail) { - mutex_unlock(&fc->uapi_mutex); - return -ENODATA; - } - - index = log->tail & (logsize - 1); - p = log->buffer[index]; - need_free = log->need_free & (1 << index); - log->buffer[index] = NULL; - log->need_free &= ~(1 << index); - log->tail++; + err = mutex_lock_interruptible(&fc->uapi_mutex); + if (err < 0) + return err; + message = fetch_message_locked(fc->log.log, len, &need_free); mutex_unlock(&fc->uapi_mutex); + if (IS_ERR(message)) + return PTR_ERR(message); - ret = -EMSGSIZE; - n = strlen(p); - if (n > len) - goto err_free; - ret = -EFAULT; - if (copy_to_user(_buf, p, n) != 0) - goto err_free; - ret = n; - -err_free: if (need_free) - kfree(p); - return ret; + p = message; + + n = strlen(message); + if (copy_to_user(_buf, message, n)) + return -EFAULT; + return n; } static int fscontext_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049..66a1ba8c56b5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work) goto abort_conn; out: - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); return; @@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num) { + while (num && ap->num_folios < num_pages) { struct folio *folio; unsigned int folio_offset; unsigned int nr_bytes; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2d817d7cab26..5c569c3cb53f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = inode->i_sb->s_blocksize_bits; + blkbits = fc->blkbits; stat->blksize = 1 << blkbits; } @@ -1377,6 +1377,7 @@ retry: generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + stat->blksize = 1 << fi->cached_i_blkbits; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5525a4520b0f..4adcf09d4b01 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = len, + .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), .flags = flags }; struct fuse_write_out outarg; @@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, fc->no_copy_file_range = 1; err = -EOPNOTSUPP; } + if (!err && outarg.size > len) + err = -EIO; + if (err) goto out; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ec248d13c8bf..cc428d04be3e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -210,6 +210,12 @@ struct fuse_inode { /** Reference to backing file in passthrough mode */ struct fuse_backing *fb; #endif + + /* + * The underlying inode->i_blkbits value will not be modified, + * so preserve the blocksize specified by the server. + */ + u8 cached_i_blkbits; }; /** FUSE inode state bits */ @@ -969,6 +975,14 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; + + /* + * This is a workaround until fuse uses iomap for reads. + * For fuseblk servers, this represents the blocksize passed in at + * mount time and for regular fuse servers, this is equivalent to + * inode->i_blkbits. + */ + u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ecb869e895ab..7485a41af892 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -289,10 +289,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } - if (attr->blksize != 0) - inode->i_blkbits = ilog2(attr->blksize); + if (attr->blksize) + fi->cached_i_blkbits = ilog2(attr->blksize); else - inode->i_blkbits = inode->i_sb->s_blocksize_bits; + fi->cached_i_blkbits = fc->blkbits; /* * Don't set the sticky bit in i_mode, unless we want the VFS @@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, @@ -1273,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) { fc->timeout.req_timeout = secs_to_jiffies(timeout); INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); - queue_delayed_work(system_wq, &fc->timeout.work, + queue_delayed_work(system_percpu_wq, &fc->timeout.work, fuse_timeout_timer_freq); } @@ -1810,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; + /* + * This is a workaround until fuse hooks into iomap for reads. + * Use PAGE_SIZE for the blocksize else if the writeback cache + * is enabled, buffered writes go through iomap and a read may + * overwrite partially written data if blocksize < PAGE_SIZE + */ + fc->blkbits = sb->s_blocksize_bits; + if (ctx->blksize != PAGE_SIZE && + !sb_set_blocksize(sb, PAGE_SIZE)) + goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; + fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 607ef735ad4a..eb97ac009e75 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) if (!file) goto out; + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + backing_sb = file_inode(file)->i_sb; res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c826e7ca49f5..76c8fd0bfc75 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = fs->window_phys_addr + offset; + *pfn = PHYS_PFN(fs->window_phys_addr + offset); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0727f60ad028..9d65719353fa 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs2_recovery_wq = alloc_workqueue("gfs2_recovery", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, + 0); if (!gfs2_recovery_wq) goto fail_wq1; @@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_wq2; - gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0); if (!gfs2_freeze_wq) goto fail_wq3; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efe99b732551..aa15183f9a16 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) error = -ENOMEM; sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0, + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU, + 0, sdp->sd_fsname); if (!sdp->sd_glock_wq) goto fail_iput; sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0, + sdp->sd_fsname); if (!sdp->sd_delete_wq) goto fail_glock_wq; @@ -1754,7 +1756,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); continue; } - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b..644b2d1e7276 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd..1e1acf5775ab 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db..34008442ee26 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf..be4be99304bc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, /* * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) while holding - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. + * unmapped in caller or hugetlb_vmdelete_list() skips + * unmapping it due to fail to grab lock. Unmap (again) + * while holding the fault mutex. The mutex will prevent + * faults until we finish removing the folio. Hold folio + * lock to guarantee no concurrent migration. */ + folio_lock(folio); if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); - folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e..ec9339024ac3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; @@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { - void *bit_address; + void *bit_address; - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); @@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + might_sleep(); + again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); @@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + might_sleep(); + again: inode = find_inode(sb, head, test, data, false); if (inode) { @@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); if (inode) { @@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + + might_sleep(); + again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { @@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); @@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + might_sleep(); + while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); @@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct inode *old; + might_sleep(); + inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); @@ -1838,11 +1855,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference @@ -1866,7 +1883,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = op->drop_inode(inode); else - drop = generic_drop_inode(inode); + drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && @@ -1908,20 +1925,45 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (!inode) + might_sleep(); + if (unlikely(!inode)) return; - BUG_ON(inode->i_state & I_CLEAR); + retry: - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { - if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { - atomic_inc(&inode->i_count); - spin_unlock(&inode->i_lock); - trace_writeback_lazytime_iput(inode); - mark_inode_dirty_sync(inode); - goto retry; - } - iput_final(inode); + lockdep_assert_not_held(&inode->i_lock); + VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + /* + * Note this assert is technically racy as if the count is bogusly + * equal to one, then two CPUs racing to further drop it can both + * conclude it's fine. + */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + + spin_lock(&inode->i_lock); + if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + spin_unlock(&inode->i_lock); + goto retry; } + + if (!atomic_dec_and_test(&inode->i_count)) { + spin_unlock(&inode->i_lock); + return; + } + + /* + * iput_final() drops ->i_lock, we can't assert on it as the inode may + * be deallocated by the time the call returns. + */ + iput_final(inode); } EXPORT_SYMBOL(iput); @@ -2189,7 +2231,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -int file_remove_privs_flags(struct file *file, unsigned int flags) +static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2214,7 +2256,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) @@ -2519,21 +2560,28 @@ void __init inode_init(void) void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; - if (S_ISCHR(mode)) { + switch (inode->i_mode & S_IFMT) { + case S_IFCHR: inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; - } else if (S_ISBLK(mode)) { + break; + case S_IFBLK: if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; - } else if (S_ISFIFO(mode)) + break; + case S_IFIFO: inode->i_fop = &pipefifo_fops; - else if (S_ISSOCK(mode)) - ; /* leave it no_open_fops */ - else + break; + case S_IFSOCK: + /* leave it no_open_fops */ + break; + default: printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); + break; + } } EXPORT_SYMBOL(init_special_inode); @@ -2911,10 +2959,18 @@ EXPORT_SYMBOL(mode_strip_sgid); * * TODO: add a proper inode dumping routine, this is a stub to get debug off the * ground. + * + * TODO: handle getting to fs type with get_kernel_nofault()? + * See dump_mapping() above. */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + struct super_block *sb = inode->i_sb; + + pr_warn("%s encountered for inode %px\n" + "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", + reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, + inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..a33d18ee5b74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/ioctl.c b/fs/ioctl.c index 0248cb8db2d3..1c152c2b1b67 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { @@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range, info[count]); + size = struct_size(same, info, count); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fd827398afd2..8b847a1e27f1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, size_t size = i_size_read(iter->inode) - iomap->offset; size_t offset = offset_in_folio(folio, iomap->offset); + if (WARN_ON_ONCE(!iomap->inline_data)) + return -EIO; + if (folio_test_uptodate(folio)) return 0; @@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return true; } -static void iomap_write_end_inline(const struct iomap_iter *iter, +static bool iomap_write_end_inline(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t copied) { const struct iomap *iomap = &iter->iomap; @@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter, WARN_ON_ONCE(!folio_test_uptodate(folio)); BUG_ON(!iomap_inline_data_valid(iomap)); + if (WARN_ON_ONCE(!iomap->inline_data)) + return false; + flush_dcache_folio(folio); addr = kmap_local_folio(folio, pos); memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); mark_inode_dirty(iter->inode); + return true; } /* @@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; - if (srcmap->type == IOMAP_INLINE) { - iomap_write_end_inline(iter, folio, pos, copied); - return true; - } + if (srcmap->type == IOMAP_INLINE) + return iomap_write_end_inline(iter, folio, pos, copied); if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; @@ -1396,6 +1401,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); + trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset, + bytes); + folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 6f25d4cfea9f..46aa85af13dc 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -363,14 +363,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW) need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { - if (iomap_dio_can_use_fua(iomap, dio)) - bio_opf |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - } + else if (iomap->type == IOMAP_MAPPED && + iomap_dio_can_use_fua(iomap, dio)) + bio_opf |= REQ_FUA; + + if (!(bio_opf & REQ_FUA)) + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; /* * We can only do deferred completion for pure overwrites that @@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) loff_t pos = iomi->pos; u64 copied; + if (WARN_ON_ONCE(!inline_data)) + return -EIO; + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 6ad66e6ba653..a61c1dae4742 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); DEFINE_RANGE_EVENT(iomap_dio_rw_queued); +DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index b3971e91e8eb..38861ca04899 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -285,6 +285,7 @@ restart: retry: if (batch_count) __flush_batch(journal, &batch_count); + cond_resched(); spin_lock(&journal->j_list_lock); goto restart; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index a6c692cac616..9adf36e6364b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) !list_empty(&of->list)); } +/* Get active reference to kernfs node for an open file */ +static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) +{ + /* Skip if file was already released */ + if (unlikely(of->released)) + return NULL; + + if (!kernfs_get_active(of->kn)) + return NULL; + + return of; +} + +static void kernfs_put_active_of(struct kernfs_open_file *of) +{ + return kernfs_put_active(of->kn); +} + /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * @@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) if (ops->seq_stop) ops->seq_stop(sf, v); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); @@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; @@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) @@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; @@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) @@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) @@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; @@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) else file_update_time(file); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&of->mutex); rc = -ENODEV; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); @@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &kernfs_vm_ops; out_put: - kernfs_put_active(of->kn); + kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); @@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; - if (!kernfs_get_active(kn)) + if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) @@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) else ret = kernfs_generic_poll(of, wait); - kernfs_put_active(kn); + kernfs_put_active_of(of); return ret; } @@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } @@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) else ret = generic_file_llseek(file, offset, whence); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 3c293a5a21b1..457f91c412d4 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; - attrs = kernfs_iattrs_noalloc(kn); + attrs = kernfs_iattrs(kn); if (!attrs) - return -ENODATA; + return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece..76eaf64b9d9e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/locks.c b/fs/locks.c index 559f02aa4172..04a3f0e20724 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2328,8 +2328,8 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations - * flags need to be set. + * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags + * need to be set. * * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df9d11479caf..32db676127a9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &minix_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; - } else + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); + } else { + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + make_bad_inode(inode); + } } /* diff --git a/fs/mount.h b/fs/mount.h index 97737051a8b9..79c85639a7ba 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -17,11 +17,7 @@ struct mnt_namespace { }; struct user_namespace *user_ns; struct ucounts *ucounts; - u64 seq; /* Sequence number to prevent loops */ - union { - wait_queue_head_t poll; - struct rcu_head mnt_ns_rcu; - }; + wait_queue_head_t poll; u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; #ifdef CONFIG_FSNOTIFY @@ -30,8 +26,6 @@ struct mnt_namespace { #endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; - struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ - struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry) static inline void get_mnt_ns(struct mnt_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern seqlock_t mount_lock; @@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->seq == 0; + return ns->ns.ns_id == 0; } static inline bool anon_ns_root(const struct mount *m) diff --git a/fs/namei.c b/fs/namei.c index cd43ff89fbaa..44856b70ea3b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags dentry->d_inode) return -EISDIR; + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; @@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) @@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } } @@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); - if (jumped) { - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->state |= ND_JUMPED; - } + if (jumped) + nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) @@ -4828,7 +4836,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to - * be writen back improperly if their true value is unknown to + * be written back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..dc01b14c58cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -33,6 +33,7 @@ #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/pidfs.h> +#include <linux/nstree.h> #include "pnode.h" #include "internal.h" @@ -65,6 +66,15 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); +static char * __initdata initramfs_options; +static int __init initramfs_options_setup(char *str) +{ + initramfs_options = str; + return 1; +} + +__setup("initramfs_options=", initramfs_options_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -80,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ -static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif -static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ -static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), @@ -119,59 +126,18 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { + struct ns_common *ns; + if (!node) return NULL; - return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); -} - -static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) -{ - struct mnt_namespace *ns_a = node_to_mnt_ns(a); - struct mnt_namespace *ns_b = node_to_mnt_ns(b); - u64 seq_a = ns_a->seq; - u64 seq_b = ns_b->seq; - - if (seq_a < seq_b) - return -1; - if (seq_a > seq_b) - return 1; - return 0; -} - -static inline void mnt_ns_tree_write_lock(void) -{ - write_seqlock(&mnt_ns_tree_lock); -} - -static inline void mnt_ns_tree_write_unlock(void) -{ - write_sequnlock(&mnt_ns_tree_lock); -} - -static void mnt_ns_tree_add(struct mnt_namespace *ns) -{ - struct rb_node *node, *prev; - - mnt_ns_tree_write_lock(); - node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->mnt_ns_tree_node); - if (!prev) - list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); - else - list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); - mnt_ns_tree_write_unlock(); - - WARN_ON_ONCE(node); + ns = rb_entry(node, struct ns_common, ns_tree_node); + return container_of(ns, struct mnt_namespace, ns); } static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ - if (refcount_dec_and_test(&ns->passive)) { + if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); @@ -181,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { - mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); + mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!is_anon_ns(ns)) { - mnt_ns_tree_write_lock(); - rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); - list_bidir_del_rcu(&ns->mnt_ns_list); - mnt_ns_tree_write_unlock(); - } - - call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); -} + if (ns_tree_active(ns)) + ns_tree_remove(ns); -static int mnt_ns_find(const void *key, const struct rb_node *node) -{ - const u64 mnt_ns_id = *(u64 *)key; - const struct mnt_namespace *ns = node_to_mnt_ns(node); - - if (mnt_ns_id < ns->seq) - return -1; - if (mnt_ns_id > ns->seq) - return 1; - return 0; + call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* @@ -225,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node) */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; - struct rb_node *node; - unsigned int seq; + struct mnt_namespace *mnt_ns; + struct ns_common *ns; guard(rcu)(); - do { - seq = read_seqbegin(&mnt_ns_tree_lock); - node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); - if (node) - break; - } while (read_seqretry(&mnt_ns_tree_lock, seq)); - - if (!node) + ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); + if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. */ - ns = node_to_mnt_ns(node); - refcount_inc(&ns->passive); - return ns; + mnt_ns = container_of(ns, struct mnt_namespace, ns); + refcount_inc(&mnt_ns->passive); + return mnt_ns; } static inline void lock_mount_hash(void) @@ -1017,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt) return false; seq = mnt->mnt_ns->seq_origin; - return !seq || (seq == current->nsproxy->mnt_ns->seq); + return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* @@ -1197,10 +1140,7 @@ static void commit_tree(struct mount *mnt) if (!mnt_ns_attached(mnt)) { for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) - if (unlikely(mnt_ns_attached(m))) - m = skip_mnt_tree(m); - else - mnt_add_to_ns(n, m); + mnt_add_to_ns(n, m); n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; } @@ -2155,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { + struct ns_common *ns; + guard(rcu)(); for (;;) { - struct list_head *list; - - if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); - else - list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); - if (list_is_head(list, &mnt_ns_list)) - return ERR_PTR(-ENOENT); + ns = ns_tree_adjoined_rcu(mntns, previous); + if (IS_ERR(ns)) + return ERR_CAST(ns); - mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU @@ -2182,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr * the mount namespace and it might already be on its * deathbed. */ - if (!refcount_inc_not_zero(&mntns->ns.count)) + if (!ns_ref_get(mntns)) continue; return mntns; @@ -2207,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!mnt_ns) return false; - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; + return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, @@ -2458,7 +2395,7 @@ struct vfsmount *clone_private_mount(const struct path *path) return ERR_PTR(-EINVAL); } - if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) @@ -2704,6 +2641,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, lock_mnt_tree(child); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); + commit_tree(child); if (q) { struct mountpoint *mp = root.mp; struct mount *r = child; @@ -2713,7 +2651,6 @@ static int attach_recursive_mnt(struct mount *source_mnt, mp = shorter; mnt_change_mountpoint(r, mp, q); } - commit_tree(child); } unpin_mountpoint(&root); unlock_mount_hash(); @@ -2862,6 +2799,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) return attach_recursive_mnt(mnt, p, mp); } +static int may_change_propagation(const struct mount *m) +{ + struct mnt_namespace *ns = m->mnt_ns; + + // it must be mounted in some namespace + if (IS_ERR_OR_NULL(ns)) // is_mounted() + return -EINVAL; + // and the caller must be admin in userns of that namespace + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + /* * Sanity check the flags to change_mnt_propagation. */ @@ -2898,10 +2848,10 @@ static int do_change_type(struct path *path, int ms_flags) return -EINVAL; namespace_lock(); - if (!check_mnt(mnt)) { - err = -EINVAL; + err = may_change_propagation(mnt); + if (err) goto out_unlock; - } + if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -3070,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else - ns->seq_origin = src_mnt_ns->seq; + ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, recursive); @@ -3279,7 +3229,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ -static int do_remount(struct path *path, int ms_flags, int sb_flags, +static int do_remount(struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -3347,18 +3297,11 @@ static int do_set_group(struct path *from_path, struct path *to_path) namespace_lock(); - err = -EINVAL; - /* To and From must be mounted */ - if (!is_mounted(&from->mnt)) - goto out; - if (!is_mounted(&to->mnt)) - goto out; - - err = -EPERM; - /* We should be allowed to modify mount namespaces of both mounts */ - if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + err = may_change_propagation(from); + if (err) goto out; - if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + err = may_change_propagation(to); + if (err) goto out; err = -EINVAL; @@ -3724,8 +3667,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, int error; error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) + if (!error && mount_too_revealing(sb, &mnt_flags)) { + errorfcp(fc, "VFS", "Mount too revealing"); error = -EPERM; + } if (unlikely(error)) { fc_drop_locked(fc); @@ -4109,7 +4054,7 @@ int path_mount(const char *dev_name, struct path *path, if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) - return do_remount(path, flags, sb_flags, mnt_flags, data_page); + return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) @@ -4148,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } -/* - * Assign a sequence number so we can detect when we attempt to bind - * mount a reference to an older mount namespace into the current - * mount namespace, preventing reference counting loops. A 64bit - * number incrementing at 10Ghz will take 12,427 years to wrap which - * is effectively never, so we can ignore the possibility. - */ -static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); - static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; @@ -4177,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - if (!anon) { - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); - } + + if (anon) + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); } - new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_inc_return(&mnt_ns_seq); - refcount_set(&new_ns->ns.count, 1); + ns_tree_gen_id(&new_ns->ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; - INIT_LIST_HEAD(&new_ns->mnt_ns_list); - RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -4200,7 +4134,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -4231,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); @@ -4278,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); - mnt_ns_tree_add(new_ns); + ns_tree_add_raw(new_ns); return new_ns; } @@ -4441,7 +4375,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { - pr_warn("VFS: Mount too revealing\n"); + errorfcp(fc, "VFS", "Mount too revealing"); goto err_unlock; } @@ -4551,20 +4485,10 @@ SYSCALL_DEFINE5(move_mount, if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; - lflags = 0; - if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; - if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; uflags = 0; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; - from_name = getname_maybe_null(from_pathname, uflags); - if (IS_ERR(from_name)) - return PTR_ERR(from_name); + if (flags & MOVE_MOUNT_T_EMPTY_PATH) + uflags = AT_EMPTY_PATH; - lflags = 0; - if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; - if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - uflags = 0; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; to_name = getname_maybe_null(to_pathname, uflags); if (IS_ERR(to_name)) return PTR_ERR(to_name); @@ -4577,11 +4501,24 @@ SYSCALL_DEFINE5(move_mount, to_path = fd_file(f_to)->f_path; path_get(&to_path); } else { + lflags = 0; + if (flags & MOVE_MOUNT_T_SYMLINKS) + lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_T_AUTOMOUNTS) + lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); if (ret) return ret; } + uflags = 0; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) + uflags = AT_EMPTY_PATH; + + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); + if (!from_name && from_dfd >= 0) { CLASS(fd_raw, f_from)(from_dfd); if (fd_empty(f_from)) @@ -4590,6 +4527,11 @@ SYSCALL_DEFINE5(move_mount, return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); } + lflags = 0; + if (flags & MOVE_MOUNT_F_SYMLINKS) + lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_F_AUTOMOUNTS) + lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); if (ret) return ret; @@ -4996,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5176,7 +5118,8 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, int ret; struct mount_kattr kattr = {}; - kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; + if (flags & OPEN_TREE_CLONE) + kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; @@ -5388,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; - s->sm.mnt_ns_id = ns->seq; + s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) @@ -5699,7 +5642,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { - struct path root __free(path_put) = {}; struct mount *m; int err; @@ -5711,7 +5653,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!s->mnt) return -ENOENT; - err = grab_requested_root(ns, &root); + err = grab_requested_root(ns, &s->root); if (err) return err; @@ -5720,7 +5662,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, * mounts to show users. */ m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &root) && + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -5728,8 +5670,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (err) return err; - s->root = root; - /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the @@ -5898,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -5951,28 +5891,40 @@ retry: if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); + path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } -static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, - u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, - bool reverse) +struct klistmount { + u64 last_mnt_id; + u64 mnt_parent_id; + u64 *kmnt_ids; + u32 nr_mnt_ids; + struct mnt_namespace *ns; + struct path root; +}; + +static ssize_t do_listmount(struct klistmount *kls, bool reverse) { - struct path root __free(path_put) = {}; + struct mnt_namespace *ns = kls->ns; + u64 mnt_parent_id = kls->mnt_parent_id; + u64 last_mnt_id = kls->last_mnt_id; + u64 *mnt_ids = kls->kmnt_ids; + size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - ret = grab_requested_root(ns, &root); + ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { - orig = root; + orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) @@ -5984,7 +5936,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, * Don't trigger audit denials. We just want to determine what * mounts to show users. */ - if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -6017,14 +5969,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, return ret; } +static void __free_klistmount_free(const struct klistmount *kls) +{ + path_put(&kls->root); + kvfree(kls->kmnt_ids); + mnt_ns_release(kls->ns); +} + +static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, + size_t nr_mnt_ids) +{ + + u64 last_mnt_id = kreq->param; + + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + + kls->last_mnt_id = last_mnt_id; + + kls->nr_mnt_ids = nr_mnt_ids; + kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), + GFP_KERNEL_ACCOUNT); + if (!kls->kmnt_ids) + return -ENOMEM; + + kls->ns = grab_requested_mnt_ns(kreq); + if (!kls->ns) + return -ENOENT; + + kls->mnt_parent_id = kreq->mnt_id; + return 0; +} + SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { - u64 *kmnt_ids __free(kvfree) = NULL; + struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; - struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; - u64 last_mnt_id; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) @@ -6045,22 +6028,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (ret) return ret; - last_mnt_id = kreq.param; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; - - kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), - GFP_KERNEL_ACCOUNT); - if (!kmnt_ids) - return -ENOMEM; - - ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); + if (ret) + return ret; - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* @@ -6068,39 +6041,43 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * listmount() doesn't care about any mount properties. */ scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, - nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); + ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; - if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) + if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } +struct mnt_namespace init_mnt_ns = { + .ns.inum = ns_init_inum(&init_mnt_ns), + .ns.ops = &mntns_operations, + .user_ns = &init_user_ns, + .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), + .passive = REFCOUNT_INIT(1), + .mounts = RB_ROOT, + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), +}; + static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; - struct mnt_namespace *ns; struct path root; - mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, true); - if (IS_ERR(ns)) - panic("Can't allocate initial namespace"); - ns->seq = atomic64_inc_return(&mnt_ns_seq); - ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); - ns->root = m; - ns->nr_mounts = 1; - mnt_add_to_ns(ns, m); - init_task.nsproxy->mnt_ns = ns; - get_mnt_ns(ns); + init_mnt_ns.root = m; + init_mnt_ns.nr_mounts = 1; + mnt_add_to_ns(&init_mnt_ns, m); + init_task.nsproxy->mnt_ns = &init_mnt_ns; + get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; @@ -6108,7 +6085,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - mnt_ns_tree_add(ns); + ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) @@ -6148,7 +6125,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!refcount_dec_and_test(&ns->ns.count)) + if (!ns_ref_put(ns)) return; namespace_lock(); emptied_ns = ns; @@ -6397,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 18b3dc74c70e..37ab6f28b5ad 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl) return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - return netfs_put_request(rreq, netfs_rreq_trace_put_failed); + return netfs_put_failed_request(rreq); } EXPORT_SYMBOL(netfs_readahead); @@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); alloc_error: folio_unlock(folio); return ret; @@ -699,7 +699,7 @@ have_folio_no_wait: return 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); error: if (folio) { folio_unlock(folio); @@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, return ret < 0 ? ret : 0; error_put: - netfs_put_request(rreq, netfs_rreq_trace_put_discard); + netfs_put_failed_request(rreq); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f27ea5099a68..09394ac2c180 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_put(folio); ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); if (ret < 0) - goto error_folio_unlock; + goto out; continue; copied: diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index a05e13472baf..a498ee8d6674 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); return -EIO; } @@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i if (user_backed_iter(iter)) { ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) - goto out; + goto error_put; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); @@ -238,6 +239,10 @@ out: if (ret > 0) orig_count -= ret; return ret; + +error_put: + netfs_put_failed_request(rreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index a16660ab7f83..a9d1c3b2c084 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; - goto out; + goto error_put; } wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; @@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * out: netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; + +error_put: + netfs_put_failed_request(wreq); + return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index d4f16fefd965..4319611f5354 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); void netfs_clear_subrequests(struct netfs_io_request *rreq); void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); +void netfs_put_failed_request(struct netfs_io_request *rreq); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 20748bcfbf59..486166460e17 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq) { if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - queue_work(system_unbound_wq, &rreq->work); + queue_work(system_dfl_wq, &rreq->work); } else { trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); wake_up(&rreq->waitq); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e8c99738b5bb..b8c4918d3dcd 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) netfs_stat_d(&netfs_n_rh_rreq); } -static void netfs_free_request(struct work_struct *work) +static void netfs_deinit_request(struct netfs_io_request *rreq) { - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; @@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work) if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); +} + +static void netfs_free_request(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, cleanup_work); + + netfs_deinit_request(rreq); call_rcu(&rreq->rcu, netfs_free_request_rcu); } @@ -163,11 +169,29 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); if (dead) - WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); + WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work)); } } /* + * Free a request (synchronously) that was just allocated but has + * failed before it could be submitted. + */ +void netfs_put_failed_request(struct netfs_io_request *rreq) +{ + int r = refcount_read(&rreq->ref); + + /* new requests have two references (see + * netfs_alloc_request(), and this function is only allowed on + * new request objects + */ + WARN_ON_ONCE(r != 2); + + trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed); + netfs_free_request(&rreq->cleanup_work); +} + +/* * Allocate and partially initialise an I/O request structure. */ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3e804da1e1eb..a95e7aadafd0 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -281,8 +281,10 @@ reassess: } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) { notes |= MADE_PROGRESS; } else { - if (!stream->failed) + if (!stream->failed) { stream->transferred += transferred; + stream->transferred_valid = true; + } if (front->transferred < front->len) set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags); notes |= MADE_PROGRESS; diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index 8097bc069c1d..a1489aa29f78 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( return creq; cancel_put: - netfs_put_request(creq, netfs_rreq_trace_put_return); + netfs_put_failed_request(creq); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fa622a6cd56d..5c0dc4efc792 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite return ret; cleanup_free: - netfs_put_request(rreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(rreq); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 0f3a36852a4d..cbf3d9194c7b 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -254,6 +254,7 @@ reassess_streams: if (front->start + front->transferred > stream->collected_to) { stream->collected_to = front->start + front->transferred; stream->transferred = stream->collected_to - wreq->start; + stream->transferred_valid = true; notes |= MADE_PROGRESS; } if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { @@ -356,6 +357,7 @@ bool netfs_write_collection(struct netfs_io_request *wreq) { struct netfs_inode *ictx = netfs_inode(wreq->inode); size_t transferred; + bool transferred_valid = false; int s; _enter("R=%x", wreq->debug_id); @@ -376,12 +378,16 @@ bool netfs_write_collection(struct netfs_io_request *wreq) continue; if (!list_empty(&stream->subrequests)) return false; - if (stream->transferred < transferred) + if (stream->transferred_valid && + stream->transferred < transferred) { transferred = stream->transferred; + transferred_valid = true; + } } /* Okay, declare that all I/O is complete. */ - wreq->transferred = transferred; + if (transferred_valid) + wreq->transferred = transferred; trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); if (wreq->io_streams[1].active && diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 50bee2c4130d..dd8743bc8d7f 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -118,12 +118,12 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, wreq->io_streams[0].prepare_write = ictx->ops->prepare_write; wreq->io_streams[0].issue_write = ictx->ops->issue_write; wreq->io_streams[0].collected_to = start; - wreq->io_streams[0].transferred = LONG_MAX; + wreq->io_streams[0].transferred = 0; wreq->io_streams[1].stream_nr = 1; wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE; wreq->io_streams[1].collected_to = start; - wreq->io_streams[1].transferred = LONG_MAX; + wreq->io_streams[1].transferred = 0; if (fscache_resources_valid(&wreq->cache_resources)) { wreq->io_streams[1].avail = true; wreq->io_streams[1].active = true; @@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: - wreq->error = -ENOMEM; - netfs_put_request(wreq, netfs_rreq_trace_put_failed); + netfs_put_failed_request(wreq); return ERR_PTR(-ENOMEM); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8fb4a950dd55..4e3dcc157a83 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 86e36c630f09..8059ece82468 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/gfp.h> +#include <linux/rmap.h> #include <linux/swap.h> #include <linux/compaction.h> @@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the @@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb, dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); fgp |= fgf_set_order(len); start: @@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); folio_wait_private_2(folio); /* [DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8dc921d83538..9edb5f9b0c4e 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) @@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + // reinitialize the error state in case if this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; break; @@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + if (!IS_ERR(ds)) return ds; return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } @@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, best_idx); - if (ds || !pgio->pg_mirror_idx) + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); } @@ -868,7 +871,7 @@ retry: req->wb_nio = 0; ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..9bdaf7f38bed 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } @@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { @@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); @@ -2229,7 +2236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74d712b58423..c0a44f389f8f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; @@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 3388faf2acb9..d275b0a250bf 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (err) return err; - nfs_block_o_direct(nfsi, inode); + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); return 0; @@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (!err) - nfs_block_o_direct(NFS_I(inode), inode); + nfs_file_block_o_direct(NFS_I(inode)); return err; } diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bd5fca285899..97abf62f109d 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp) return; } - if (nfs_client_is_local(clp)) { - /* If already enabled, disable and re-enable */ - nfs_localio_disable_client(clp); - } + if (nfs_client_is_local(clp)) + return; if (!nfs_uuid_begin(&clp->cl_uuid)) return; @@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, case -ENOMEM: case -ENXIO: case -ENOENT: - /* Revalidate localio, will disable if unsupported */ + /* Revalidate localio */ + nfs_localio_disable_client(clp); nfs_local_probe(clp); } } @@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_iter_init(&iter, iocb, READ); status = filp->f_op->read_iter(&iocb->kiocb, &iter); + + revert_creds(save_cred); + if (status != -EIOCBQUEUED) { nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); } static int @@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); status = filp->f_op->write_iter(&iocb->kiocb, &iter); file_end_write(filp); + + revert_creds(save_cred); + current->flags = old_flags; + if (status != -EIOCBQUEUED) { nfs_local_write_done(iocb, status); nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); - current->flags = old_flags; } static int diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7f1ec9c67ff2..f9a3a1fbf44c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) num *= HZ; *((int *)kp->arg) = num; if (!list_empty(&nfs_automount_list)) - mod_delayed_work(system_wq, &nfs_automount_task, num); + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); } else { *((int *)kp->arg) = -1*HZ; cancel_delayed_work(&nfs_automount_task); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 01c01f45358b..6a0b5871ba3b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; @@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | NFS_CAP_ZERO_RANGE); @@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) @@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == 0) + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); truncate_pagecache_range(inode, offset, (offset + len) -1); - if (err == -EOPNOTSUPP) + } else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); @@ -354,22 +363,27 @@ out: /** * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. * Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. */ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; @@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1d6b5f4230c9..c9a0d1e420c6 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d2b67e06cc3..ce61253efd45 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) @@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs_server_set_init_caps(server); do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index db3811af0796..18ae614e5a6c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 96b1323318c2..627115179795 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 11968dcb7243..6e69ce43a13f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req) nfs_page_clear_headlock(req); } -/* - * nfs_page_group_sync_on_bit_locked +/** + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set + * @req: request in page group + * @bit: PG_* bit that is used to sync page group * * must be called with page group lock held */ -static bool -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) { struct nfs_page *head = req->wb_head; struct nfs_page *tmp; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index fa5c41d0989a..647c53d1418a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) } } -static int -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) { - int ret; - - if (!test_bit(PG_REMOVE, &req->wb_flags)) - return 0; - ret = nfs_page_group_lock(req); - if (ret) - return ret; if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) nfs_page_set_inode_ref(req, inode); - nfs_page_group_unlock(req); - return 0; } /** @@ -247,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page - * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group + * nfs_page_covers_folio + * @req: struct nfs_page * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. + * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - - nfs_page_group_lock(req); - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request @@ -585,19 +533,18 @@ retry: } } + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; + /* Ensure that nobody removed the request before we locked it */ if (head != folio->private) { + nfs_page_group_unlock(head); nfs_unlock_and_release_request(head); goto retry; } - ret = nfs_cancel_remove_inode(head, inode); - if (ret < 0) - goto out_unlock; - - ret = nfs_page_group_lock(head); - if (ret < 0) - goto out_unlock; + nfs_cancel_remove_inode(head, inode); /* lock each request in the page group */ for (subreq = head->wb_this_page; @@ -786,7 +733,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + nfs_page_group_lock(req); + if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio->mapping; @@ -798,6 +746,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) } spin_unlock(&mapping->i_private_lock); } + nfs_page_group_unlock(req); if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { atomic_long_dec(&nfsi->nrequests); @@ -2054,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..85ca663c052c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -113,7 +113,7 @@ static void nfsd_file_schedule_laundrette(void) { if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) - queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette, + queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); } diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 4f6468eb2adf..cb237f1b902a 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -103,10 +103,11 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (nfsd_file_get(new) == NULL) goto again; /* - * Drop the ref we were going to install and the - * one we were going to return. + * Drop the ref we were going to install (both file and + * net) and the one we were going to return (only file). */ nfsd_file_put(localio); + nfsd_net_put(net); nfsd_file_put(localio); localio = new; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 98ab55ba3ced..edf050766e57 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -470,7 +470,15 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) if (!iap->ia_valid) return 0; - iap->ia_valid |= ATTR_CTIME; + /* + * If ATTR_DELEG is set, then this is an update from a client that + * holds a delegation. If this is an update for only the atime, the + * ctime should not be changed. If the update contains the mtime + * too, then ATTR_CTIME should already be set. + */ + if (!(iap->ia_valid & ATTR_DELEG)) + iap->ia_valid |= ATTR_CTIME; + return notify_change(&nop_mnt_idmap, dentry, iap, NULL); } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 14868a3dd592..bc52afbfc5c7 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) ************************************************************************/ static ssize_t nilfs_feature_revision_show(struct kobject *kobj, - struct attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); @@ -1087,7 +1087,7 @@ static const char features_readme_str[] = "(1) revision\n\tshow current revision of NILFS file system driver.\n"; static ssize_t nilfs_feature_README_show(struct kobject *kobj, - struct attribute *attr, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, features_readme_str); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 78a87a016928..d370cd5cce3f 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups { struct completion sg_segments_kobj_unregister; }; -#define NILFS_COMMON_ATTR_STRUCT(name) \ +#define NILFS_KOBJ_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ struct attribute attr; \ - ssize_t (*show)(struct kobject *, struct attribute *, \ + ssize_t (*show)(struct kobject *, struct kobj_attribute *, \ char *); \ - ssize_t (*store)(struct kobject *, struct attribute *, \ + ssize_t (*store)(struct kobject *, struct kobj_attribute *, \ const char *, size_t); \ } -NILFS_COMMON_ATTR_STRUCT(feature); +NILFS_KOBJ_ATTR_STRUCT(feature); #define NILFS_DEV_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c2..46bfc543f946 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. Test to * be sure. */ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 798340db69d7..55a03bb05aa1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn->destroy_next = connector_destroy_list; connector_destroy_list = conn; spin_unlock(&destroy_lock); - queue_work(system_unbound_wq, &connector_reaper_work); + queue_work(system_dfl_wq, &connector_reaper_work); } /* * Note that we didn't update flags telling whether inode cares about @@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, + queue_delayed_work(system_dfl_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } EXPORT_SYMBOL_GPL(fsnotify_put_mark); diff --git a/fs/nsfs.c b/fs/nsfs.c index 59aa801347a7..e7fd8a790aaa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,12 +13,26 @@ #include <linux/nsfs.h> #include <linux/uaccess.h> #include <linux/mnt_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/time_namespace.h> +#include <linux/utsname.h> +#include <linux/exportfs.h> +#include <linux/nstree.h> +#include <net/net_namespace.h> #include "mount.h" #include "internal.h" static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, * the size value will be set to the size the kernel knows about. */ kinfo->size = min(usize, sizeof(*kinfo)); - kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->mnt_ns_id = mnt_ns->ns.ns_id; kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); /* Subtract the root mount of the mount namespace. */ if (kinfo->nr_mounts) @@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + case NS_GET_ID: + return true; } /* Extensible ioctls require some extra handling. */ switch (_IOC_NR(cmd)) { case _IOC_NR(NS_MNT_GET_INFO): + return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_NEXT): + return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_PREV): - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0); } return false; @@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->seq; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ns_type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ @@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ns_type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!__ns_ref_get(ns)) + return NULL; + } + + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; @@ -438,4 +607,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2018501b2249..2347a50f079b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e1..cccaa1d6fbba 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { @@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); + user_dlm_worker = alloc_workqueue("user_dlm", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 930150ed5db1..ef147e8b3271 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -706,6 +706,8 @@ out: * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 14bf440ea4df..6c4f78f473fb 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ + if (!osb->journal) + return; + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e7..b46100a4f529 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 70b8687dc45e..dbd63a74df4b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -225,7 +225,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr) { struct dentry *ret; - inode_lock(workdir->d_inode); + inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT); ret = ovl_create_real(ofs, workdir, ovl_lookup_temp(ofs, workdir), attr); inode_unlock(workdir->d_inode); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e9..bd3d7ba8fb95 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index a33115e7384c..41033bac96cb 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1552,7 +1552,8 @@ void ovl_copyattr(struct inode *inode) int ovl_parent_lock(struct dentry *parent, struct dentry *child) { inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - if (!child || child->d_parent == parent) + if (!child || + (!d_unhashed(child) && child->d_parent == parent)) return 0; inode_unlock(parent->d_inode); diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..c40c29c702e5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -296,12 +296,12 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + struct task_struct *task __free(put_task) = NULL; struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; - struct task_struct *task; struct pidfs_attr *attr; const struct cred *c; __u64 mask; @@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd) * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases. */ - return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); } return false; @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/fs/pnode.c b/fs/pnode.c index 81f7599bdac4..6f7d02f3fa98 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -111,7 +111,8 @@ void change_mnt_propagation(struct mount *mnt, int type) return; } if (IS_MNT_SHARED(mnt)) { - m = propagation_source(mnt); + if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list)) + m = propagation_source(mnt); if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { @@ -637,10 +638,11 @@ void propagate_umount(struct list_head *set) } // now to_umount consists of all acceptable candidates - // deal with reparenting of remaining overmounts on those + // deal with reparenting of surviving overmounts on those list_for_each_entry(m, &to_umount, mnt_list) { - if (m->overmount) - reparent(m->overmount); + struct mount *over = m->overmount; + if (over && !will_be_unmounted(over)) + reparent(over); } // and fold them into the set diff --git a/fs/proc/array.c b/fs/proc/array.c index d6a0369caa93..69269745d73b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, unsigned int max_fds = 0; rcu_read_lock(); - ppid = pid_alive(p) ? - task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); + ppid = task_ppid_nr_ns(p, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); cred = get_task_cred(p); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 76e800e38c8f..176281112273 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; +static void pde_set_flags(struct proc_dir_entry *pde) +{ + const struct proc_ops *proc_ops = pde->proc_ops; + + if (!proc_ops) + return; + + if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; + if (proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif + if (proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; +} + /* returns the registered entry, or frees dp and returns NULL on failure */ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp) @@ -374,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { @@ -561,20 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static void pde_set_flags(struct proc_dir_entry *pde) -{ - if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) - pde->flags |= PROC_ENTRY_PERMANENT; - if (pde->proc_ops->proc_read_iter) - pde->flags |= PROC_ENTRY_proc_read_iter; -#ifdef CONFIG_COMPAT - if (pde->proc_ops->proc_compat_ioctl) - pde->flags |= PROC_ENTRY_proc_compat_ioctl; -#endif - if (pde->proc_ops->proc_lseek) - pde->flags |= PROC_ENTRY_proc_lseek; -} - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -585,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -636,7 +643,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); @@ -667,7 +673,6 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be1..d9b7ef122343 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 4403a2e20c16..ea2b597fd92c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -12,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac710384..1e24e085c7d5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, + Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), {} }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ns_type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break; case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif default: return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3d6d8a9f13fc..b26ae556b446 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -340,8 +340,8 @@ static int proc_maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR_OR_NULL(priv->mm)) { - int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); seq_release_private(inode, file); return err; @@ -1148,10 +1148,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - pte_t ptent = huge_ptep_get(walk->mm, addr, pte); struct folio *folio = NULL; bool present = false; + spinlock_t *ptl; + pte_t ptent; + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); + ptent = huge_ptep_get(walk->mm, addr, pte); if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; @@ -1170,6 +1173,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); } + spin_unlock(ptl); return 0; } #else @@ -2017,12 +2021,14 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; + spinlock_t *ptl; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep); pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); @@ -2050,11 +2056,12 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, err = add_to_pagemap(&pme, pm); if (err) - return err; + break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } + spin_unlock(ptl); cond_resched(); return err; @@ -2410,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p, { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + if (!p->vec_buf) + return; + if (cur_buf->start != addr) cur_buf->end = addr; else @@ -3128,17 +3138,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte); + pte_t huge_pte; struct numa_maps *md; struct page *page; + spinlock_t *ptl; + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + huge_pte = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(huge_pte)) - return 0; + goto out; page = pte_page(huge_pte); md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); +out: + spin_unlock(ptl); return 0; } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426c..b4e55c90f8dc 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index df4a9b348769..afa15a214538 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -881,7 +881,7 @@ void dqput(struct dquot *dquot) put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); - queue_delayed_work(system_unbound_wq, "a_release_work, 1); + queue_delayed_work(system_dfl_wq, "a_release_work, 1); } EXPORT_SYMBOL(dqput); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e..41f9995da7ca 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/read_write.c b/fs/read_write.c index c5b6265d984b..833bae068770 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; + /* + * Make sure return value doesn't overflow in 32bit compat mode. Also + * limit the size for all cases except when calling ->copy_file_range(). + */ + if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) + len = min_t(size_t, MAX_RW_COUNT, len); + file_start_write(file_out); /* @@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), - REMAP_FILE_CAN_SHORTEN); + file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; @@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - min_t(size_t, len, MAX_RW_COUNT), 0); + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index d98e0d2de09f..3c39cfacb251 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) */ list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->ci_id == domid) { - rr.ci_id = d->ci_id; cpu = cpumask_any(&d->hdr.cpu_mask); ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) continue; + rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, &ci->shared_cpu_map, evtid, false); goto checkresult; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0a1eedba2b03..9a8cf6f11151 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - unsigned int ci_id; + struct cacheinfo *ci; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f5637855c3ac..7326c28a7908 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; - struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!ci || ci->id != rr->ci_id) + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) return -EINVAL; /* @@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci_id != rr->ci_id) + if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index beb4f18f05ef..2337cf795db3 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -304,6 +304,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cfids = tcon->cfids; + if (!cfids) + continue; spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ seq_printf(m, "Num entries: %d\n", cfids->num_entries); list_for_each_entry(cfid, &cfids->entries, entry) { @@ -319,8 +321,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n"); } spin_unlock(&cfids->cfid_list_lock); - - } } } @@ -347,6 +347,22 @@ static __always_inline const char *compression_alg_str(__le16 alg) } } +static __always_inline const char *cipher_alg_str(__le16 cipher) +{ + switch (cipher) { + case SMB2_ENCRYPTION_AES128_CCM: + return "AES128-CCM"; + case SMB2_ENCRYPTION_AES128_GCM: + return "AES128-GCM"; + case SMB2_ENCRYPTION_AES256_CCM: + return "AES256-CCM"; + case SMB2_ENCRYPTION_AES256_GCM: + return "AES256-GCM"; + default: + return "UNKNOWN"; + } +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct mid_q_entry *mid_entry; @@ -539,6 +555,11 @@ skip_rdma: else seq_puts(m, "disabled (not supported by this server)"); + /* Show negotiated encryption cipher, even if not required */ + seq_puts(m, "\nEncryption: "); + if (server->cipher_type) + seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type)); + seq_printf(m, "\n\n\tSessions: "); i = 0; list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { @@ -576,12 +597,8 @@ skip_rdma: /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); - if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) { + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) seq_puts(m, " encrypted"); - /* can help in debugging to show encryption type */ - if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) - seq_puts(m, "(gcm256)"); - } if (ses->sign) seq_puts(m, " signed"); diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index bc1c1e9b288a..43b86fa4d695 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -124,55 +124,44 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp = description; /* start with version and hostname portion of UNC string */ spnego_key = ERR_PTR(-EINVAL); - sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, - hostname); - dp = description + strlen(description); + dp += sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, + hostname); /* add the server address */ if (server->dstaddr.ss_family == AF_INET) - sprintf(dp, "ip4=%pI4", &sa->sin_addr); + dp += sprintf(dp, "ip4=%pI4", &sa->sin_addr); else if (server->dstaddr.ss_family == AF_INET6) - sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); + dp += sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); else goto out; - dp = description + strlen(description); - /* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */ if (server->sec_kerberos) - sprintf(dp, ";sec=krb5"); + dp += sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) - sprintf(dp, ";sec=mskrb5"); + dp += sprintf(dp, ";sec=mskrb5"); else if (server->sec_iakerb) - sprintf(dp, ";sec=iakerb"); + dp += sprintf(dp, ";sec=iakerb"); else { cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); - sprintf(dp, ";sec=krb5"); + dp += sprintf(dp, ";sec=krb5"); } - dp = description + strlen(description); - sprintf(dp, ";uid=0x%x", - from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); + dp += sprintf(dp, ";uid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); - dp = description + strlen(description); - sprintf(dp, ";creduid=0x%x", + dp += sprintf(dp, ";creduid=0x%x", from_kuid_munged(&init_user_ns, sesInfo->cred_uid)); - if (sesInfo->user_name) { - dp = description + strlen(description); - sprintf(dp, ";user=%s", sesInfo->user_name); - } + if (sesInfo->user_name) + dp += sprintf(dp, ";user=%s", sesInfo->user_name); - dp = description + strlen(description); - sprintf(dp, ";pid=0x%x", current->pid); + dp += sprintf(dp, ";pid=0x%x", current->pid); - if (sesInfo->upcall_target == UPTARGET_MOUNT) { - dp = description + strlen(description); - sprintf(dp, ";upcall_target=mount"); - } else { - dp = description + strlen(description); - sprintf(dp, ";upcall_target=app"); - } + if (sesInfo->upcall_target == UPTARGET_MOUNT) + dp += sprintf(dp, ";upcall_target=mount"); + else + dp += sprintf(dp, ";upcall_target=app"); cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(spnego_cred); diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c index 4cc6e0896fad..f8659d36793f 100644 --- a/fs/smb/client/cifs_unicode.c +++ b/fs/smb/client/cifs_unicode.c @@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len, int len; __le16 *dst; + if (!src) + return NULL; + len = cifs_local_to_utf16_bytes(src, maxlen, cp); len += 2; /* NULL */ dst = kmalloc(len, GFP_KERNEL); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 3bd85ab2deb1..dcb39d1b5958 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { @@ -1358,6 +1358,20 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, truncate_setsize(target_inode, new_size); fscache_resize_cookie(cifs_inode_cookie(target_inode), new_size); + } else if (rc == -EOPNOTSUPP) { + /* + * copy_file_range syscall man page indicates EINVAL + * is returned e.g when "fd_in and fd_out refer to the + * same file and the source and target ranges overlap." + * Test generic/157 was what showed these cases where + * we need to remap EOPNOTSUPP to EINVAL + */ + if (off >= src_inode->i_size) { + rc = -EINVAL; + } else if (src_inode == target_inode) { + if (off + len > destoff) + rc = -EINVAL; + } } if (rc == 0 && new_size > target_cifsi->netfs.zero_point) target_cifsi->netfs.zero_point = new_size; @@ -1881,7 +1895,9 @@ init_cifs(void) cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n"); } - cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + cifsiod_wq = alloc_workqueue("cifsiod", + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsiod_wq) { rc = -ENOMEM; goto out_clean_proc; @@ -1909,28 +1925,32 @@ init_cifs(void) } cifsoplockd_wq = alloc_workqueue("cifsoplockd", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cifsoplockd_wq) { rc = -ENOMEM; goto out_destroy_fileinfo_put_wq; } deferredclose_wq = alloc_workqueue("deferredclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!deferredclose_wq) { rc = -ENOMEM; goto out_destroy_cifsoplockd_wq; } serverclose_wq = alloc_workqueue("serverclose", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!serverclose_wq) { rc = -ENOMEM; goto out_destroy_deferredclose_wq; } cfid_put_wq = alloc_workqueue("cfid_put_wq", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!cfid_put_wq) { rc = -ENOMEM; goto out_destroy_serverclose_wq; diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 487f39cff77e..3ce7c614ccc0 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 55 -#define CIFS_VERSION "2.55" +#define SMB3_PRODUCT_BUILD 56 +#define CIFS_VERSION "2.56" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index e6830ab3a546..0fae95cf81c4 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -87,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 7 +#define MAX_COMPOUND 10 /* * Default number of credits to keep available for SMB3. @@ -1732,6 +1732,7 @@ struct mid_q_entry { int mid_rc; /* rc for MID_RC */ __le16 command; /* smb command code */ unsigned int optype; /* operation type */ + spinlock_t mid_lock; bool wait_cancelled:1; /* Cancelled while waiting for response */ bool deleted_from_q:1; /* Whether Mid has been dequeued frem pending_mid_q */ bool large_buf:1; /* if valid response, is pointer to large buf */ @@ -1881,9 +1882,12 @@ static inline bool is_replayable_error(int error) /* cifs_get_writable_file() flags */ -#define FIND_WR_ANY 0 -#define FIND_WR_FSUID_ONLY 1 -#define FIND_WR_WITH_DELETE 2 +enum cifs_writable_file_flags { + FIND_WR_ANY = 0U, + FIND_WR_FSUID_ONLY = (1U << 0), + FIND_WR_WITH_DELETE = (1U << 1), + FIND_WR_NO_PENDING_DELETE = (1U << 2), +}; #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 @@ -2036,6 +2040,9 @@ require use of the stronger protocol */ * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search * ->oplock_break_cancelled + * mid_q_entry->mid_lock mid_q_entry->callback alloc_mid + * smb2_mid_entry_alloc + * (Any fields of mid_q_entry that will need protection) ****************************************************************************/ #ifdef DECLARE_GLOBALS_HERE @@ -2339,6 +2346,8 @@ struct smb2_compound_vars { struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; struct smb2_file_rename_info_hdr rename_info; struct smb2_file_link_info_hdr link_info; @@ -2375,6 +2384,23 @@ static inline bool cifs_netbios_name(const char *name, size_t namelen) return ret; } +/* + * Execute mid callback atomically - ensures callback runs exactly once + * and prevents sleeping in atomic context. + */ +static inline void mid_execute_callback(struct mid_q_entry *mid) +{ + void (*callback)(struct mid_q_entry *mid); + + spin_lock(&mid->mid_lock); + callback = mid->callback; + mid->callback = NULL; /* Mark as executed, */ + spin_unlock(&mid->mid_lock); + + if (callback) + callback(mid); +} + #define CIFS_REPARSE_SUPPORT(tcon) \ ((tcon)->posix_extensions || \ (le32_to_cpu((tcon)->fsAttrInfo.Attributes) & \ diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c34c533b2efa..e8fba98690ce 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); -extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, - const char *path); +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, + struct dentry *dentry); extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode, const char *path); diff --git a/fs/smb/client/cifstransport.c b/fs/smb/client/cifstransport.c index 352dafb888dd..e98b95eff8c9 100644 --- a/fs/smb/client/cifstransport.c +++ b/fs/smb/client/cifstransport.c @@ -46,6 +46,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); + spin_lock_init(&temp->mid_lock); temp->mid = get_mid(smb_buffer); temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); @@ -345,16 +346,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = wait_for_response(server, midQ); if (rc != 0) { send_cancel(server, &rqst, midQ); - spin_lock(&server->mid_queue_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED || - midQ->mid_state == MID_RESPONSE_RECEIVED) { + spin_lock(&midQ->mid_lock); + if (midQ->callback) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); add_credits(server, &credits, 0); return rc; } - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); } rc = cifs_sync_mid_result(midQ, server); @@ -527,15 +527,14 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = wait_for_response(server, midQ); if (rc) { send_cancel(server, &rqst, midQ); - spin_lock(&server->mid_queue_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED || - midQ->mid_state == MID_RESPONSE_RECEIVED) { + spin_lock(&midQ->mid_lock); + if (midQ->callback) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); return rc; } - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); } /* We got the response - restart system call. */ diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index 766b4de13da7..db709f5cd2e1 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -155,58 +155,29 @@ static int cmp_bkt(const void *_a, const void *_b) } /* - * TODO: - * Support other iter types, if required. - * Only ITER_XARRAY is supported for now. + * Collect some 2K samples with 2K gaps between. */ -static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) +static int collect_sample(const struct iov_iter *source, ssize_t max, u8 *sample) { - struct folio *folios[16], *folio; - unsigned int nr, i, j, npages; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t last, index = start / PAGE_SIZE; - size_t len, off, foff; - void *p; - int s = 0; - - last = (start + max - 1) / PAGE_SIZE; - do { - nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios), - XA_PRESENT); - if (nr == 0) - return -EIO; - - for (i = 0; i < nr; i++) { - folio = folios[i]; - npages = folio_nr_pages(folio); - foff = start - folio_pos(folio); - off = foff % PAGE_SIZE; - - for (j = foff / PAGE_SIZE; j < npages; j++) { - size_t len2; - - len = min_t(size_t, max, PAGE_SIZE - off); - len2 = min_t(size_t, len, SZ_2K); - - p = kmap_local_page(folio_page(folio, j)); - memcpy(&sample[s], p, len2); - kunmap_local(p); - - s += len2; - - if (len2 < SZ_2K || s >= max - SZ_2K) - return s; - - max -= len; - if (max <= 0) - return s; - - start += len; - off = 0; - index++; - } - } - } while (nr == ARRAY_SIZE(folios)); + struct iov_iter iter = *source; + size_t s = 0; + + while (iov_iter_count(&iter) >= SZ_2K) { + size_t part = umin(umin(iov_iter_count(&iter), SZ_2K), max); + size_t n; + + n = copy_from_iter(sample + s, part, &iter); + if (n != part) + return -EFAULT; + + s += n; + max -= n; + + if (iov_iter_count(&iter) < PAGE_SIZE - SZ_2K) + break; + + iov_iter_advance(&iter, SZ_2K); + } return s; } diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 587845a2452d..dd12f3eb61dc 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -335,7 +335,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_entry_safe(mid, nmid, &retry_list, qhead) { list_del_init(&mid->qhead); - mid->callback(mid); + mid_execute_callback(mid); release_mid(mid); } @@ -919,7 +919,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) list_del_init(&mid->qhead); mid->mid_rc = mid_rc; mid->mid_state = MID_RC; - mid->callback(mid); + mid_execute_callback(mid); release_mid(mid); } @@ -1117,7 +1117,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) mid_entry = list_entry(tmp, struct mid_q_entry, qhead); cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid); list_del_init(&mid_entry->qhead); - mid_entry->callback(mid_entry); + mid_execute_callback(mid_entry); release_mid(mid_entry); } /* 1/8th of sec is more than enough time for them to exit */ @@ -1394,7 +1394,7 @@ next_pdu: } if (!mids[i]->multiRsp || mids[i]->multiEnd) - mids[i]->callback(mids[i]); + mid_execute_callback(mids[i]); release_mid(mids[i]); } else if (server->ops->is_oplock_break && @@ -4205,7 +4205,6 @@ retry: return 0; } - server->lstrp = jiffies; server->tcpStatus = CifsInNegotiate; server->neg_start = jiffies; spin_unlock(&server->srv_lock); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 186e061068be..cb907e18cc35 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -998,7 +998,10 @@ int cifs_open(struct inode *inode, struct file *file) /* Get the cached handle as SMB2 close is deferred */ if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { - rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + rc = cifs_get_writable_path(tcon, full_path, + FIND_WR_FSUID_ONLY | + FIND_WR_NO_PENDING_DELETE, + &cfile); } else { rc = cifs_get_readable_path(tcon, full_path, &cfile); } @@ -2530,6 +2533,9 @@ refind_writable: continue; if (with_delete && !(open_file->fid.access & DELETE)) continue; + if ((flags & FIND_WR_NO_PENDING_DELETE) && + open_file->status_file_deleted) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ @@ -2647,6 +2653,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, spin_unlock(&tcon->open_file_lock); free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); + if (*ret_file) { + spin_lock(&cinode->open_file_lock); + if ((*ret_file)->status_file_deleted) { + spin_unlock(&cinode->open_file_lock); + cifsFileInfo_put(*ret_file); + *ret_file = NULL; + } else { + spin_unlock(&cinode->open_file_lock); + } + } return *ret_file ? 0 : -ENOENT; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 75be4b46bc6f..7e9784080501 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode) * but will return the EACCES to the caller. Note that the VFS does not call * unlink on negative dentries currently. */ -int cifs_unlink(struct inode *dir, struct dentry *dentry) +static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename) { int rc = 0; unsigned int xid; @@ -1943,15 +1943,24 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *tcon; + __u32 dosattr = 0, origattr = 0; struct TCP_Server_Info *server; struct iattr *attrs = NULL; - __u32 dosattr = 0, origattr = 0; + bool rehash = false; cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; + /* Unhash dentry in advance to prevent any concurrent opens */ + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + __d_drop(dentry); + rehash = true; + } + spin_unlock(&dentry->d_lock); + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1975,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) } netfs_wait_for_outstanding_io(inode); - cifs_close_deferred_file_under_dentry(tcon, full_path); + cifs_close_deferred_file_under_dentry(tcon, dentry); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -1994,7 +2003,24 @@ retry_std_delete: goto psx_del_no_retry; } - rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); + /* For SMB2+, if the file is open, we always perform a silly rename. + * + * We check for d_count() right after calling + * cifs_close_deferred_file_under_dentry() to make sure that the + * dentry's refcount gets dropped in case the file had any deferred + * close. + */ + if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) { + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) + sillyrename = true; + spin_unlock(&dentry->d_lock); + } + + if (sillyrename) + rc = -EBUSY; + else + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); psx_del_no_retry: if (!rc) { @@ -2003,7 +2029,8 @@ psx_del_no_retry: cifs_drop_nlink(inode); } } else if (rc == -ENOENT) { - d_drop(dentry); + if (simple_positive(dentry)) + d_delete(dentry); } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, @@ -2056,9 +2083,16 @@ unlink_out: kfree(attrs); free_xid(xid); cifs_put_tlink(tlink); + if (rehash) + d_rehash(dentry); return rc; } +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + return __cifs_unlink(dir, dentry, false); +} + static int cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -2346,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); + cifsInode = CIFS_I(d_inode(direntry)); + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); spin_lock(&d_inode(direntry)->i_lock); i_size_write(d_inode(direntry), 0); clear_nlink(d_inode(direntry)); spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -2446,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: - if (rc == 0) + if (rc == 0) { d_move(from_dentry, to_dentry); + /* Force a new lookup */ + d_drop(from_dentry); + } cifs_put_tlink(tlink); return rc; } @@ -2458,10 +2497,12 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *target_dentry, unsigned int flags) { const char *from_name, *to_name; + struct TCP_Server_Info *server; void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; + bool rehash = false; unsigned int xid; int rc, tmprc; int retry_count = 0; @@ -2477,10 +2518,22 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; + /* + * Prevent any concurrent opens on the target by unhashing the dentry. + * VFS already unhashes the target when renaming directories. + */ + if (d_is_positive(target_dentry) && !d_is_dir(target_dentry)) { + if (!d_unhashed(target_dentry)) { + d_drop(target_dentry); + rehash = true; + } + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; page1 = alloc_dentry_path(); page2 = alloc_dentry_path(); @@ -2498,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_deferred_file_under_dentry(tcon, from_name); + cifs_close_deferred_file_under_dentry(tcon, source_dentry); if (d_inode(target_dentry) != NULL) { netfs_wait_for_outstanding_io(d_inode(target_dentry)); - cifs_close_deferred_file_under_dentry(tcon, to_name); + cifs_close_deferred_file_under_dentry(tcon, target_dentry); } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, @@ -2518,6 +2571,8 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, } } + if (!rc) + rehash = false; /* * No-replace is the natural behavior for CIFS, so skip unlink hacks. */ @@ -2565,23 +2620,61 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, unlink_target: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - - /* Try unlinking the target dentry if it's not negative */ - if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { - if (d_is_dir(target_dentry)) - tmprc = cifs_rmdir(target_dir, target_dentry); - else - tmprc = cifs_unlink(target_dir, target_dentry); - if (tmprc) - goto cifs_rename_exit; - rc = cifs_do_rename(xid, source_dentry, from_name, - target_dentry, to_name); + if (d_really_is_positive(target_dentry)) { + if (!rc) { + struct inode *inode = d_inode(target_dentry); + /* + * Samba and ksmbd servers allow renaming a target + * directory that is open, so make sure to update + * ->i_nlink and then mark it as delete pending. + */ + if (S_ISDIR(inode->i_mode)) { + drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb); + spin_lock(&inode->i_lock); + i_size_write(inode, 0); + clear_nlink(inode); + spin_unlock(&inode->i_lock); + set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags); + CIFS_I(inode)->time = 0; /* force reval */ + inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + } + } else if (rc == -EACCES || rc == -EEXIST) { + /* + * Rename failed, possibly due to a busy target. + * Retry it by unliking the target first. + */ + if (d_is_dir(target_dentry)) { + tmprc = cifs_rmdir(target_dir, target_dentry); + } else { + tmprc = __cifs_unlink(target_dir, target_dentry, + server->vals->protocol_id > SMB10_PROT_ID); + } + if (tmprc) { + /* + * Some servers will return STATUS_ACCESS_DENIED + * or STATUS_DIRECTORY_NOT_EMPTY when failing to + * rename a non-empty directory. Make sure to + * propagate the appropriate error back to + * userspace. + */ + if (tmprc == -EEXIST || tmprc == -ENOTEMPTY) + rc = tmprc; + goto cifs_rename_exit; + } + rc = cifs_do_rename(xid, source_dentry, from_name, + target_dentry, to_name); + if (!rc) + rehash = false; + } } /* force revalidate to go get info when needed */ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; cifs_rename_exit: + if (rehash) + d_rehash(target_dentry); kfree(info_buf_source); free_dentry_path(page2); free_dentry_path(page1); @@ -2599,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct cached_fid *cfid = NULL; + if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags)) + return false; if (cifs_i->time == 0) return true; @@ -2749,7 +2844,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, inode->i_count.counter, + full_path, inode, icount_read(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index da23cc12a52c..dda6dece802a 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) kfree(tmp_list); } } -void -cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) + +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, + struct dentry *dentry) { - struct cifsFileInfo *cfile; struct file_list *tmp_list, *tmp_next_list; - void *page; - const char *full_path; + struct cifsFileInfo *cfile; LIST_HEAD(file_head); - page = alloc_dentry_path(); spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { - full_path = build_path_from_dentry(cfile->dentry, page); - if (strstr(full_path, path)) { - if (delayed_work_pending(&cfile->deferred)) { - if (cancel_delayed_work(&cfile->deferred)) { - spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - cifs_del_deferred_close(cfile); - spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - - tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; - tmp_list->cfile = cfile; - list_add_tail(&tmp_list->list, &file_head); - } - } + if ((cfile->dentry == dentry) && + delayed_work_pending(&cfile->deferred) && + cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + break; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); } } spin_unlock(&tcon->open_file_lock); @@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) list_del(&tmp_list->list); kfree(tmp_list); } - free_dentry_path(page); } /* diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 7869cec58f52..10c84c095fe7 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, } /* - * For absolute symlinks it is not possible to determinate + * For absolute symlinks it is not possible to determine * if it should point to directory or file. */ if (symname[0] == '/') { diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 893a1ea8c000..a02d41d1ce4a 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, rc = -EOPNOTSUPP; } - /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */ + /* Fallback to SMB_COM_SETATTR command when absolutely needed. */ if (rc == -EOPNOTSUPP) { cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n"); rc = SMBSetInformation(xid, tcon, full_path, @@ -1039,7 +1039,7 @@ set_via_filehandle: cifsFileInfo_put(open_file); /* - * Setting the read-only bit is not honered on non-NT servers when done + * Setting the read-only bit is not honored on non-NT servers when done * via open-semantics. So for setting it, use SMB_COM_SETATTR command. * This command works only after the file is closed, so use it only when * operation was called without the filehandle. diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 224495322a05..e56e4d402f13 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -30,10 +30,9 @@ enum smb2_compound_ops { SMB2_OP_QUERY_DIR, SMB2_OP_MKDIR, SMB2_OP_RENAME, - SMB2_OP_DELETE, SMB2_OP_HARDLINK, SMB2_OP_SET_EOF, - SMB2_OP_RMDIR, + SMB2_OP_UNLINK, SMB2_OP_POSIX_QUERY_INFO, SMB2_OP_SET_REPARSE, SMB2_OP_GET_REPARSE, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 2a0316c514e4..0985db9f86e5 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -207,8 +207,10 @@ replay_again: server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); - if (vars == NULL) - return -ENOMEM; + if (vars == NULL) { + rc = -ENOMEM; + goto out; + } rqst = &vars->rqst[0]; rsp_iov = &vars->rsp_iov[0]; @@ -344,9 +346,6 @@ replay_again: trace_smb3_posix_query_info_compound_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path); - break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the @@ -354,23 +353,40 @@ replay_again: */ trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + case SMB2_OP_UNLINK: + rqst[num_rqst].rq_iov = vars->unlink_iov; rqst[num_rqst].rq_nvec = 1; size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path); + } + num_rqst++; + trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_EOF: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -440,7 +456,7 @@ replay_again: ses->Suid, full_path); break; case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_iov = vars->rename_iov; rqst[num_rqst].rq_nvec = 2; len = in_iov[i].iov_len; @@ -671,7 +687,7 @@ finished: } for (i = 0; i < num_cmds; i++) { - char *buf = rsp_iov[i + i].iov_base; + char *buf = rsp_iov[i + 1].iov_base; if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER) rc = server->ops->map_error(buf, false); @@ -730,19 +746,6 @@ finished: trace_smb3_posix_query_info_compound_done(xid, tcon->tid, ses->Suid); break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc); - else { - /* - * If dentry (hence, inode) is NULL, lease break is going to - * take care of degrading leases on handles for deleted files. - */ - if (inode) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - trace_smb3_delete_done(xid, tcon->tid, ses->Suid); - } - break; case SMB2_OP_MKDIR: if (rc) trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc); @@ -763,11 +766,11 @@ finished: trace_smb3_rename_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc); + case SMB2_OP_UNLINK: + if (!rc) + trace_smb3_unlink_done(xid, tcon->tid, ses->Suid); else - trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid); + trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -864,6 +867,7 @@ finished: smb2_should_replay(tcon, &retries, &cur_sleep)) goto replay_again; +out: if (cfile) cifsFileInfo_put(cfile); @@ -1163,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE); return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, NULL, - &(int){SMB2_OP_RMDIR}, 1, + &(int){SMB2_OP_UNLINK}, 1, NULL, NULL, NULL, NULL); } @@ -1171,21 +1175,107 @@ int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb, struct dentry *dentry) { + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + __le16 *utf16_path __free(kfree) = NULL; + int retries = 0, cur_sleep = 1; + struct TCP_Server_Info *server; struct cifs_open_parms oparms; + struct smb2_create_req *creq; + struct inode *inode = NULL; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec close_iov; + int resp_buftype[2]; + struct cifs_fid fid; + int flags = 0; + __u8 oplock; + int rc; - oparms = CIFS_OPARMS(cifs_sb, tcon, name, - DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE); - int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, dentry); - if (rc == -EINVAL) { - cifs_dbg(FYI, "invalid lease key, resending request without lease"); - rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL); + utf16_path = cifs_convert_path_to_utf16(name, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; +again: + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); + + memset(rqst, 0, sizeof(rqst)); + memset(resp_buftype, 0, sizeof(resp_buftype)); + memset(rsp_iov, 0, sizeof(rsp_iov)); + + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = ARRAY_SIZE(open_iov); + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES, + FILE_OPEN, CREATE_DELETE_ON_CLOSE | + OPEN_REPARSE_POINT, ACL_NO_MODE); + oparms.fid = &fid; + + if (dentry) { + inode = d_inode(dentry); + if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) { + oplock = SMB2_OPLOCK_LEVEL_LEASE; + server->ops->get_lease_key(inode, &fid); + } + } + + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto err_free; + smb2_set_next_command(tcon, &rqst[0]); + creq = rqst[0].rq_iov[0].iov_base; + creq->ShareAccess = FILE_SHARE_DELETE_LE; + + rqst[1].rq_iov = &close_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_close_init(tcon, server, &rqst[1], + COMPOUND_FID, COMPOUND_FID, false); + smb2_set_related(&rqst[1]); + if (rc) + goto err_free; + + if (retries) { + for (int i = 0; i < ARRAY_SIZE(rqst); i++) + smb2_set_replay(server, &rqst[i]); } + + rc = compound_send_recv(xid, tcon->ses, server, flags, + ARRAY_SIZE(rqst), rqst, + resp_buftype, rsp_iov); + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto again; + + /* Retry compound request without lease */ + if (rc == -EINVAL && dentry) { + dentry = NULL; + retries = 0; + cur_sleep = 1; + goto again; + } + /* + * If dentry (hence, inode) is NULL, lease break is going to + * take care of degrading leases on handles for deleted files. + */ + if (!rc && inode) + cifs_mark_open_handles_for_deleted_file(inode, name); + + return rc; + +err_free: + SMB2_open_free(&rqst[0]); + SMB2_close_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -1438,3 +1528,113 @@ out: cifs_free_open_info(&data); return rc; } + +static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb, + const char *name, size_t namelen) +{ + int len; + + if (*name == '\\' || + (cifs_sb_master_tlink(cifs_sb) && + cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/')) + name++; + return cifs_strndup_to_utf16(name, namelen, &len, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +} + +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry)); + __le16 *utf16_path __free(kfree) = NULL; + __u32 co = file_create_options(dentry); + int cmds[] = { + SMB2_OP_SET_INFO, + SMB2_OP_RENAME, + SMB2_OP_UNLINK, + }; + const int num_cmds = ARRAY_SIZE(cmds); + char *to_name __free(kfree) = NULL; + __u32 attrs = cinode->cifsAttrs; + struct cifs_open_parms oparms; + static atomic_t sillycounter; + struct cifsFileInfo *cfile; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct kvec iov[2]; + const char *ppath; + void *page; + size_t len; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + page = alloc_dentry_path(); + + ppath = build_path_from_dentry(dentry->d_parent, page); + if (IS_ERR(ppath)) { + rc = PTR_ERR(ppath); + goto out; + } + + len = strlen(ppath) + strlen("/.__smb1234") + 1; + to_name = kmalloc(len, GFP_KERNEL); + if (!to_name) { + rc = -ENOMEM; + goto out; + } + + scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb), + atomic_inc_return(&sillycounter) & 0xffff); + + utf16_path = utf16_smb2_path(cifs_sb, to_name, len); + if (!utf16_path) { + rc = -ENOMEM; + goto out; + } + + drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + DELETE | FILE_WRITE_ATTRIBUTES, + FILE_OPEN, co, ACL_NO_MODE); + + attrs &= ~ATTR_READONLY; + if (!attrs) + attrs = ATTR_NORMAL; + if (d_inode(dentry)->i_nlink <= 1) + attrs |= ATTR_HIDDEN; + iov[0].iov_base = &(FILE_BASIC_INFO) { + .Attributes = cpu_to_le32(attrs), + }; + iov[0].iov_len = sizeof(FILE_BASIC_INFO); + iov[1].iov_base = utf16_path; + iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path); + + cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, dentry); + if (rc == -EINVAL) { + cifs_dbg(FYI, "invalid lease key, resending request without lease\n"); + cifs_get_writable_path(tcon, full_path, + FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, NULL); + } + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags); + } else { + cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n", + __func__, full_path, to_name, rc); + rc = -EIO; + } +out: + cifs_put_tlink(tlink); + free_dentry_path(page); + return rc; +} diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index cddf273c14ae..89d933b4a8bc 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) struct cifs_tcon *tcon; struct cifs_pending_open *open; + /* Trace receipt of lease break request from server */ + trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState), + le32_to_cpu(rsp->Flags), + le16_to_cpu(rsp->Epoch), + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId), + *((u64 *)rsp->LeaseKey), + *((u64 *)&rsp->LeaseKey[8])); + cifs_dbg(FYI, "Checking for lease break\n"); /* If server is a channel, select the primary channel */ @@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState), - le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), - le64_to_cpu(rsp->hdr.SessionId), - *((u64 *)rsp->LeaseKey), - *((u64 *)&rsp->LeaseKey[8])); + le32_to_cpu(rsp->Flags), + le16_to_cpu(rsp->Epoch), + le32_to_cpu(rsp->hdr.Id.SyncId.TreeId), + le64_to_cpu(rsp->hdr.SessionId), + *((u64 *)rsp->LeaseKey), + *((u64 *)&rsp->LeaseKey[8])); return false; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ad8947434b71..e586f3f4b5c9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -772,6 +772,13 @@ next_iface: bytes_left -= sizeof(*p); break; } + /* Validate that Next doesn't point beyond the buffer */ + if (next > bytes_left) { + cifs_dbg(VFS, "%s: invalid Next pointer %zu > %zd\n", + __func__, next, bytes_left); + rc = -EINVAL; + goto out; + } p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); bytes_left -= next; } @@ -783,7 +790,9 @@ next_iface: } /* Azure rounds the buffer size up 8, to a 16 byte boundary */ - if ((bytes_left > 8) || p->Next) + if ((bytes_left > 8) || + (bytes_left >= offsetof(struct network_interface_info_ioctl_rsp, Next) + + sizeof(p->Next) && p->Next)) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); ses->iface_last_update = jiffies; @@ -2631,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* SMB headers in a compound are 8 byte aligned. */ - if (!IS_ALIGNED(len, 8)) { - num_padding = 8 - (len & 7); + if (IS_ALIGNED(len, 8)) + goto out; + + num_padding = 8 - (len & 7); + if (smb3_encryption_required(tcon)) { + int i; + + /* + * Flatten request into a single buffer with required padding as + * the encryption layer can't handle the padding iovs. + */ + for (i = 1; i < rqst->rq_nvec; i++) { + memcpy(rqst->rq_iov[0].iov_base + + rqst->rq_iov[0].iov_len, + rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; + } + memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, + 0, num_padding); + rqst->rq_iov[0].iov_len += num_padding; + rqst->rq_nvec = 1; + } else { rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; - len += num_padding; } + len += num_padding; +out: shdr->NextCommand = cpu_to_le32(len); } @@ -4487,7 +4518,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, for (int i = 1; i < num_rqst; i++) { struct smb_rqst *old = &old_rq[i - 1]; struct smb_rqst *new = &new_rq[i]; - struct folio_queue *buffer; + struct folio_queue *buffer = NULL; size_t size = iov_iter_count(&old->rq_iter); orig_len += smb_rqst_len(server, old); @@ -4805,7 +4836,7 @@ static void smb2_decrypt_offload(struct work_struct *work) dw->server->ops->is_network_name_deleted(dw->buf, dw->server); - mid->callback(mid); + mid_execute_callback(mid); } else { spin_lock(&dw->server->srv_lock); if (dw->server->tcpStatus == CifsNeedReconnect) { @@ -4813,7 +4844,7 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->mid_state = MID_RETRY_NEEDED; spin_unlock(&dw->server->mid_queue_lock); spin_unlock(&dw->server->srv_lock); - mid->callback(mid); + mid_execute_callback(mid); } else { spin_lock(&dw->server->mid_queue_lock); mid->mid_state = MID_REQUEST_SUBMITTED; @@ -5367,6 +5398,7 @@ struct smb_version_operations smb20_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ @@ -5472,6 +5504,7 @@ struct smb_version_operations smb21_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb30_operations = { @@ -5588,6 +5621,7 @@ struct smb_version_operations smb30_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb311_operations = { @@ -5704,6 +5738,7 @@ struct smb_version_operations smb311_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 2df93a75e3b8..c3b9d3f6210f 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, please_key_high = (__u64 *)(lease_key+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); - trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, + trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid, ses->Suid, *please_key_low, *please_key_high, rc); cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); } else - trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid, + trace_smb3_lease_ack_done(le32_to_cpu(lease_state), tcon->tid, ses->Suid, *please_key_low, *please_key_high); return rc; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 6e805ece6a7b..b3f1398c9f79 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index ff9ef7fcd010..bc0e92eb2b64 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -771,6 +771,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr, temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); + spin_lock_init(&temp->mid_lock); temp->mid = le64_to_cpu(shdr->MessageId); temp->credits = credits > 0 ? credits : 1; temp->pid = current->pid; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index c628e91c328b..e0fce5033004 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -453,9 +453,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_recv_io *response = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); struct smbdirect_socket *sc = response->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbd_connection *info = container_of(sc, struct smbd_connection, socket); - int data_length = 0; + u32 data_offset = 0; + u32 data_length = 0; + u32 remaining_data_length = 0; log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, sc->recv_io.expected, wc->status, wc->opcode, @@ -487,7 +490,22 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) /* SMBD data transfer packet */ case SMBDIRECT_EXPECT_DATA_TRANSFER: data_transfer = smbdirect_recv_io_payload(response); + + if (wc->byte_len < + offsetof(struct smbdirect_data_transfer, padding)) + goto error; + + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); data_length = le32_to_cpu(data_transfer->data_length); + if (wc->byte_len < data_offset || + (u64)wc->byte_len < (u64)data_offset + data_length) + goto error; + + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) + goto error; if (data_length) { if (sc->recv_io.reassembly.full_packet_received) @@ -1090,8 +1108,10 @@ static int smbd_negotiate(struct smbd_connection *info) log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); - if (rc) + if (rc) { + put_receive_buffer(info, response); return rc; + } init_completion(&info->negotiate_completion); info->negotiate_done = false; @@ -1329,17 +1349,16 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } + log_rdma_event(INFO, "cancelling post_send_credits_work\n"); + disable_work_sync(&info->post_send_credits_work); + log_rdma_event(INFO, "destroying qp\n"); ib_drain_qp(sc->ib.qp); rdma_destroy_qp(sc->rdma.cm_id); sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); - - log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0); + disable_delayed_work_sync(&info->idle_timer_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); @@ -1712,7 +1731,7 @@ allocate_mr_failed: return NULL; negotiation_failed: - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&info->idle_timer_work); destroy_caches_and_workqueue(info); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); @@ -1986,7 +2005,11 @@ int smbd_send(struct TCP_Server_Info *server, */ wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0); + atomic_read(&info->send_pending) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) + rc = -EAGAIN; return rc; } @@ -2067,7 +2090,7 @@ static void destroy_mr_list(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbd_mr *mr, *tmp; - cancel_work_sync(&info->mr_recovery_work); + disable_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 93e5b2bb9f28..fd650e2afc76 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter); @@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done); @@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err); @@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \ __u64 lease_key_high), \ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high)) -DEFINE_SMB3_LEASE_DONE_EVENT(lease_done); -DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found); +DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done); +/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */ +DECLARE_EVENT_CLASS(smb3_lease_enter_class, + TP_PROTO(__u32 lease_state, + __u32 flags, + __u16 epoch, + __u32 tid, + __u64 sesid, + __u64 lease_key_low, + __u64 lease_key_high), + TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high), + TP_STRUCT__entry( + __field(__u32, lease_state) + __field(__u32, flags) + __field(__u16, epoch) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, lease_key_low) + __field(__u64, lease_key_high) + ), + TP_fast_assign( + __entry->lease_state = lease_state; + __entry->flags = flags; + __entry->epoch = epoch; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->lease_key_low = lease_key_low; + __entry->lease_key_high = lease_key_high; + ), + TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u", + __entry->sesid, __entry->tid, __entry->lease_key_high, + __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch) +) + +#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \ + TP_PROTO(__u32 lease_state, \ + __u32 flags, \ + __u16 epoch, \ + __u32 tid, \ + __u64 sesid, \ + __u64 lease_key_low, \ + __u64 lease_key_high), \ + TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high)) + +DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter); +/* Lease not found: reuse lease_enter payload (includes epoch and flags) */ +DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found); DECLARE_EVENT_CLASS(smb3_lease_err_class, TP_PROTO(__u32 lease_state, @@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ int rc), \ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc)) -DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); +DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err); DECLARE_EVENT_CLASS(smb3_connect_class, TP_PROTO(char *hostname, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 32d528b4dd83..a61ba7f3fb86 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1005,15 +1005,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); - spin_lock(&server->mid_queue_lock); + spin_lock(&midQ[i]->mid_lock); midQ[i]->wait_cancelled = true; - if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED || - midQ[i]->mid_state == MID_RESPONSE_RECEIVED) { + if (midQ[i]->callback) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; } - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ[i]->mid_lock); } } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 3f04a2977ba8..67c4f73398df 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -504,7 +504,8 @@ void ksmbd_conn_transport_destroy(void) { mutex_lock(&init_lock); ksmbd_tcp_destroy(); - ksmbd_rdma_destroy(); + ksmbd_rdma_stop_listening(); stop_sessions(); + ksmbd_rdma_destroy(); mutex_unlock(&init_lock); } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 31dd1caac1e8..2aa8084bb593 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -46,7 +46,12 @@ struct ksmbd_conn { struct mutex srv_mutex; int status; unsigned int cli_cap; - __be32 inet_addr; + union { + __be32 inet_addr; +#if IS_ENABLED(CONFIG_IPV6) + u8 inet6_addr[16]; +#endif + }; char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 72b00ca6e455..4a71f46d7020 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void) int ksmbd_workqueue_init(void) { - ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0); + ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0); if (!ksmbd_wq) return -ENOMEM; return 0; diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index d7a8a580d013..a04d5702820d 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1102,8 +1102,10 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, if (!atomic_inc_not_zero(&opinfo->refcount)) continue; - if (ksmbd_conn_releasing(opinfo->conn)) + if (ksmbd_conn_releasing(opinfo->conn)) { + opinfo_put(opinfo); continue; + } oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL); opinfo_put(opinfo); @@ -1139,8 +1141,11 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) if (!atomic_inc_not_zero(&opinfo->refcount)) continue; - if (ksmbd_conn_releasing(opinfo->conn)) + if (ksmbd_conn_releasing(opinfo->conn)) { + opinfo_put(opinfo); continue; + } + oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL); opinfo_put(opinfo); } @@ -1343,8 +1348,10 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, if (!atomic_inc_not_zero(&brk_op->refcount)) continue; - if (ksmbd_conn_releasing(brk_op->conn)) + if (ksmbd_conn_releasing(brk_op->conn)) { + opinfo_put(brk_op); continue; + } if (brk_op->is_lease && (brk_op->o_lease->state & (~(SMB2_LEASE_READ_CACHING_LE | diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 0d92ce49aed7..a565fc36cee6 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2951,18 +2951,19 @@ int smb2_open(struct ksmbd_work *work) } ksmbd_debug(SMB, "converted name = %s\n", name); - if (strchr(name, ':')) { - if (!test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_STREAMS)) { - rc = -EBADF; - goto err_out2; - } - rc = parse_stream_name(name, &stream_name, &s_type); - if (rc < 0) - goto err_out2; - } if (posix_ctxt == false) { + if (strchr(name, ':')) { + if (!test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_STREAMS)) { + rc = -EBADF; + goto err_out2; + } + rc = parse_stream_name(name, &stream_name, &s_type); + if (rc < 0) + goto err_out2; + } + rc = ksmbd_validate_filename(name); if (rc < 0) goto err_out2; @@ -3443,6 +3444,8 @@ int smb2_open(struct ksmbd_work *work) fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE)); + fp->is_posix_ctxt = posix_ctxt; + /* fp should be searchable through ksmbd_inode.m_fp_list * after daccess, saccess, attrib_only, and stream are * initialized. @@ -5988,7 +5991,7 @@ static int smb2_rename(struct ksmbd_work *work, if (IS_ERR(new_name)) return PTR_ERR(new_name); - if (strchr(new_name, ':')) { + if (fp->is_posix_ctxt == false && strchr(new_name, ':')) { int s_type; char *xattr_stream_name, *stream_name = NULL; size_t xattr_stream_size; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8d366db5f605..5df138fc6004 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -148,7 +148,7 @@ struct smb_direct_transport { wait_queue_head_t wait_send_pending; atomic_t send_pending; - struct delayed_work post_recv_credits_work; + struct work_struct post_recv_credits_work; struct work_struct send_immediate_work; struct work_struct disconnect_work; @@ -367,8 +367,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->lock_new_recv_credits); - INIT_DELAYED_WORK(&t->post_recv_credits_work, - smb_direct_post_recv_credits); + INIT_WORK(&t->post_recv_credits_work, + smb_direct_post_recv_credits); INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work); @@ -399,9 +399,9 @@ static void free_transport(struct smb_direct_transport *t) wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); - cancel_work_sync(&t->disconnect_work); - cancel_delayed_work_sync(&t->post_recv_credits_work); - cancel_work_sync(&t->send_immediate_work); + disable_work_sync(&t->disconnect_work); + disable_work_sync(&t->post_recv_credits_work); + disable_work_sync(&t->send_immediate_work); if (t->qp) { ib_drain_qp(t->qp); @@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_length; + u32 remaining_data_length, data_offset, data_length; int avail_recvmsg_count, receive_credits; if (wc->byte_len < @@ -564,15 +564,25 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); data_length = le32_to_cpu(data_transfer->data_length); - if (data_length) { - if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + - (u64)data_length) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); - return; - } + data_offset = le32_to_cpu(data_transfer->data_offset); + if (wc->byte_len < data_offset || + wc->byte_len < (u64)data_offset + data_length) { + put_recvmsg(t, recvmsg); + smb_direct_disconnect_rdma_connection(t); + return; + } + if (remaining_data_length > t->max_fragmented_recv_size || + data_length > t->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > + (u64)t->max_fragmented_recv_size) { + put_recvmsg(t, recvmsg); + smb_direct_disconnect_rdma_connection(t); + return; + } + if (data_length) { if (t->full_packet_received) recvmsg->first_segment = true; @@ -605,8 +615,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) wake_up_interruptible(&t->wait_send_credits); if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) - mod_delayed_work(smb_direct_wq, - &t->post_recv_credits_work, 0); + queue_work(smb_direct_wq, &t->post_recv_credits_work); if (data_length) { enqueue_reassembly(t, recvmsg, (int)data_length); @@ -763,8 +772,7 @@ again: st->count_avail_recvmsg += queue_removed; if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) { spin_unlock(&st->receive_credit_lock); - mod_delayed_work(smb_direct_wq, - &st->post_recv_credits_work, 0); + queue_work(smb_direct_wq, &st->post_recv_credits_work); } else { spin_unlock(&st->receive_credit_lock); } @@ -791,7 +799,7 @@ read_rfc1002_done: static void smb_direct_post_recv_credits(struct work_struct *work) { struct smb_direct_transport *t = container_of(work, - struct smb_direct_transport, post_recv_credits_work.work); + struct smb_direct_transport, post_recv_credits_work); struct smb_direct_recvmsg *recvmsg; int receive_credits, credits = 0; int ret; @@ -1209,78 +1217,130 @@ static int smb_direct_writev(struct ksmbd_transport *t, bool need_invalidate, unsigned int remote_key) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); - int remaining_data_length; - int start, i, j; - int max_iov_size = st->max_send_size - + size_t remaining_data_length; + size_t iov_idx; + size_t iov_ofs; + size_t max_iov_size = st->max_send_size - sizeof(struct smb_direct_data_transfer); int ret; - struct kvec vec; struct smb_direct_send_ctx send_ctx; + int error = 0; if (st->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; //FIXME: skip RFC1002 header.. + if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4)) + return -EINVAL; buflen -= 4; + iov_idx = 1; + iov_ofs = 0; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 1; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen - iov[i].iov_len); - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) + while (remaining_data_length) { + struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */ + size_t possible_bytes = max_iov_size; + size_t possible_vecs; + size_t bytes = 0; + size_t nvecs = 0; + + /* + * For the last message remaining_data_length should be + * have been 0 already! + */ + if (WARN_ON_ONCE(iov_idx >= niovs)) { + error = -EINVAL; + goto done; + } + + /* + * We have 2 factors which limit the arguments we pass + * to smb_direct_post_send_data(): + * + * 1. The number of supported sges for the send, + * while one is reserved for the smbdirect header. + * And we currently need one SGE per page. + * 2. The number of negotiated payload bytes per send. + */ + possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx); + + while (iov_idx < niovs && possible_vecs && possible_bytes) { + struct kvec *v = &vecs[nvecs]; + int page_count; + + v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs; + v->iov_len = min_t(size_t, + iov[iov_idx].iov_len - iov_ofs, + possible_bytes); + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (page_count > possible_vecs) { + /* + * If the number of pages in the buffer + * is to much (because we currently require + * one SGE per page), we need to limit the + * length. + * + * We know possible_vecs is at least 1, + * so we always keep the first page. + * + * We need to calculate the number extra + * pages (epages) we can also keep. + * + * We calculate the number of bytes in the + * first page (fplen), this should never be + * larger than v->iov_len because page_count is + * at least 2, but adding a limitation feels + * better. + * + * Then we calculate the number of bytes (elen) + * we can keep for the extra pages. + */ + size_t epages = possible_vecs - 1; + size_t fpofs = offset_in_page(v->iov_base); + size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len); + size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE); + + v->iov_len = fplen + elen; + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (WARN_ON_ONCE(page_count > possible_vecs)) { + /* + * Something went wrong in the above + * logic... + */ + error = -EINVAL; goto done; - } else { - /* iov[start] is too big, break it */ - int nvec = (buflen + max_iov_size - 1) / - max_iov_size; - - for (j = 0; j < nvec; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j * max_iov_size; - vec.iov_len = - min_t(int, max_iov_size, - buflen - max_iov_size * j); - remaining_data_length -= vec.iov_len; - ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1, - remaining_data_length); - if (ret) - goto done; } - i++; - if (i == niovs) - break; } - start = i; - buflen = 0; - } else { - i++; - if (i == niovs) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) - goto done; - break; + possible_vecs -= page_count; + nvecs += 1; + possible_bytes -= v->iov_len; + bytes += v->iov_len; + + iov_ofs += v->iov_len; + if (iov_ofs >= iov[iov_idx].iov_len) { + iov_idx += 1; + iov_ofs = 0; } } + + remaining_data_length -= bytes; + + ret = smb_direct_post_send_data(st, &send_ctx, + vecs, nvecs, + remaining_data_length); + if (unlikely(ret)) { + error = ret; + goto done; + } } done: ret = smb_direct_flush_send_list(st, &send_ctx, true); + if (unlikely(!ret && error)) + ret = error; /* * As an optimization, we don't wait for individual I/O to finish @@ -1672,7 +1732,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) goto out_err; } - smb_direct_post_recv_credits(&t->post_recv_credits_work.work); + smb_direct_post_recv_credits(&t->post_recv_credits_work); return 0; out_err: put_recvmsg(t, recvmsg); @@ -1744,6 +1804,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } + if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { + pr_err("warning: device max_send_sge = %d too small\n", + device->attrs.max_send_sge); + return -EINVAL; + } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); @@ -1767,7 +1832,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = max_sge_per_wr; + cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; cap->max_rdma_ctxs = t->max_rw_credits; @@ -2177,7 +2242,8 @@ int ksmbd_rdma_init(void) * for lack of credits */ smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", - WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!smb_direct_wq) return -ENOMEM; @@ -2194,7 +2260,7 @@ int ksmbd_rdma_init(void) return 0; } -void ksmbd_rdma_destroy(void) +void ksmbd_rdma_stop_listening(void) { if (!smb_direct_listener.cm_id) return; @@ -2203,7 +2269,10 @@ void ksmbd_rdma_destroy(void) rdma_destroy_id(smb_direct_listener.cm_id); smb_direct_listener.cm_id = NULL; +} +void ksmbd_rdma_destroy(void) +{ if (smb_direct_wq) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index 77aee4e5c9dc..a2291b77488a 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -54,13 +54,15 @@ struct smb_direct_data_transfer { #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); +void ksmbd_rdma_stop_listening(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); void init_smbd_max_io_size(unsigned int sz); unsigned int get_smbd_max_read_write_size(void); #else static inline int ksmbd_rdma_init(void) { return 0; } -static inline int ksmbd_rdma_destroy(void) { return 0; } +static inline void ksmbd_rdma_stop_listening(void) { } +static inline void ksmbd_rdma_destroy(void) { } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } static inline void init_smbd_max_io_size(unsigned int sz) { } static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index b1df02e321b0..4337df97987d 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -85,7 +85,14 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return NULL; } +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) + memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16); + else + conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; +#else conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; +#endif conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops; @@ -229,7 +236,6 @@ static int ksmbd_kthread_fn(void *p) { struct socket *client_sk = NULL; struct interface *iface = (struct interface *)p; - struct inet_sock *csk_inet; struct ksmbd_conn *conn; int ret; @@ -252,13 +258,27 @@ static int ksmbd_kthread_fn(void *p) /* * Limits repeated connections from clients with the same IP. */ - csk_inet = inet_sk(client_sk->sk); down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) - if (csk_inet->inet_daddr == conn->inet_addr) { +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) { + if (memcmp(&client_sk->sk->sk_v6_daddr, + &conn->inet6_addr, 16) == 0) { + ret = -EAGAIN; + break; + } + } else if (inet_sk(client_sk->sk)->inet_daddr == + conn->inet_addr) { + ret = -EAGAIN; + break; + } +#else + if (inet_sk(client_sk->sk)->inet_daddr == + conn->inet_addr) { ret = -EAGAIN; break; } +#endif up_read(&conn_list_lock); if (ret == -EAGAIN) continue; diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 0708155b5caf..78b506c5ef03 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -112,6 +112,8 @@ struct ksmbd_file { bool is_durable; bool is_persistent; bool is_resilient; + + bool is_posix_ctxt; }; static inline void set_ctx_actor(struct dir_context *ctx, diff --git a/fs/splice.c b/fs/splice.c index 4d6df083e0c0..f5094b6d00a0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -739,6 +739,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, sd.pos = kiocb.ki_pos; if (ret <= 0) break; + WARN_ONCE(ret > sd.total_len - left, + "Splice Exceeded! ret=%zd tot=%zu left=%zu\n", + ret, sd.total_len, left); sd.num_spliced += ret; sd.total_len -= ret; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 992ea0e37257..4465cf05603a 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -187,10 +187,15 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) unsigned short flags; unsigned int fragments; u64 lookup_table_start, xattr_id_table_start, next_table; - int err; + int err, devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); TRACE("Entered squashfs_fill_superblock\n"); + if (!devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); @@ -201,12 +206,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); - msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); - if (!msblk->devblksize) { - errorf(fc, "squashfs: unable to set blocksize\n"); - return -EINVAL; - } - + msblk->devblksize = devblksize; msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..f4fa0e93c463 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_bdev); -static int test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - dev_t dev; - - error = lookup_bdev(dev_name, &dev); - if (error) - return ERR_PTR(error); - - flags |= SB_NOSEC; - s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (s->s_root) { - if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return ERR_PTR(-EBUSY); - } - } else { - error = setup_bdev_super(s, flags, NULL); - if (!error) - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= SB_ACTIVE; - } - - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; @@ -1773,26 +1730,6 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) - return ERR_CAST(s); - - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_nodev); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. @@ -2314,17 +2251,20 @@ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", - WQ_MEM_RECLAIM, 0, + WQ_MEM_RECLAIM | WQ_PERCPU, + 0, sb->s_id); if (!wq) return -ENOMEM; + + old = NULL; /* * This has to be atomic as more DIOs can race to create the workqueue */ - old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); - /* Someone created workqueue before us? Free ours... */ - if (old) + if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) { + /* Someone created workqueue before us? Free ours... */ destroy_workqueue(wq); + } return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index fb5ac358077b..0b14d004a095 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { + .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) - + (int)offsetof(struct ubifs_inode, vfs_inode), .legacy_key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..46952a33c4e6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5db45c9e26ee..49e50431741c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb { * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data + * @i_crypt_info: inode's fscrypt information * * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS @@ -416,6 +417,9 @@ struct ubifs_inode { pgoff_t read_in_a_row; int data_len; void *data; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /** diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 503268cf4296..89eccc4becf9 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -284,9 +284,9 @@ static int enable_verity(struct file *filp, /* Successfully enabled verity */ /* - * Readers can start using ->i_verity_info immediately, so it - * can't be rolled back once set. So don't set it until just - * after the filesystem has successfully enabled verity. + * Readers can start using the inode's verity info immediately, + * so it can't be rolled back once set. So don't set it until + * just after the filesystem has successfully enabled verity. */ fsverity_set_info(inode, vi); } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 5fe854a5b9ad..bc1d887c532e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -63,10 +63,11 @@ struct merkle_tree_params { * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated - * and stored in ->i_verity_info; it remains until the inode is evicted. It - * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file digest. The Merkle tree - * pages themselves are not cached here, but the filesystem may cache them. + * and a pointer to it is stored in the file's in-memory inode. It remains + * until the inode is evicted. It caches information about the Merkle tree + * that's needed to efficiently verify data read from the file. It also caches + * the file digest. The Merkle tree pages themselves are not cached here, but + * the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; diff --git a/fs/verity/open.c b/fs/verity/open.c index c561e130cd0c..77b1c977af02 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -244,17 +244,17 @@ fail: void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple tasks may race to set ->i_verity_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in - * fsverity_get_info(). I.e., here we publish ->i_verity_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * Multiple tasks may race to set the inode's verity info pointer, so + * use cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., publish the pointer with a RELEASE + * barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { - /* Lost the race, so free the fsverity_info we allocated. */ + if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) { + /* Lost the race, so free the verity info we allocated. */ fsverity_free_info(vi); /* - * Afterwards, the caller may access ->i_verity_info directly, - * so make sure to ACQUIRE the winning fsverity_info. + * Afterwards, the caller may access the inode's verity info + * directly, so make sure to ACQUIRE the winning verity info. */ (void)fsverity_get_info(inode); } @@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode, return 0; } -/* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); @@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { - fsverity_free_info(inode->i_verity_info); - inode->i_verity_info = NULL; + struct fsverity_info **vi_addr = fsverity_info_addr(inode); + + fsverity_free_info(*vi_addr); + *vi_addr = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b2..f0c47b9afb8c 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -245,7 +245,7 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, unsigned long max_ra_pages) { struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; + struct fsverity_info *vi = *fsverity_info_addr(inode); const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; @@ -355,7 +355,7 @@ void __init fsverity_init_workqueue(void) * latency on ARM64. */ fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue", - WQ_HIGHPRI, + WQ_HIGHPRI | WQ_PERCPU, num_online_cpus()); if (!fsverity_read_workqueue) panic("failed to allocate fsverity_read_queue"); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ae0ca6858496..065953475cf5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -105,6 +105,7 @@ config XFS_POSIX_ACL config XFS_RT bool "XFS Realtime subvolume support" depends on XFS_FS + default BLK_DEV_ZONED help If you say Y here you will be able to mount and use XFS filesystems which contain a realtime subvolume. The realtime subvolume is a diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4c44ce1c8a64..bff3dc226f81 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -435,6 +435,13 @@ xfs_attr_rmtval_get( 0, &bp, &xfs_attr3_rmt_buf_ops); if (xfs_metadata_is_sick(error)) xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK); + /* + * ENODATA from disk implies a disk medium failure; + * ENODATA for xattrs means attribute not found, so + * disambiguate that here. + */ + if (error == -ENODATA) + error = -EIO; if (error) return error; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 17d9e6154f19..723a0643b838 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2833,6 +2833,12 @@ xfs_da_read_buf( &bp, ops); if (xfs_metadata_is_sick(error)) xfs_dirattr_mark_sick(dp, whichfork); + /* + * ENODATA from disk implies a disk medium failure; ENODATA for + * xattrs means attribute not found, so disambiguate that here. + */ + if (error == -ENODATA && whichfork == XFS_ATTR_FORK) + error = -EIO; if (error) goto out_free; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 1e6e9c10cea2..a8187281eb96 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class, __field(xfs_exntst_t, state) ), TP_fast_assign( - __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dev = cursor->sc->mp->m_super->s_dev; __entry->dqtype = cursor->dqtype; __entry->ino = cursor->quota_ip->i_ino; __entry->cur_id = cursor->id; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1ee4f835ac3c..a26f79815533 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -760,6 +760,9 @@ xfs_vm_swap_activate( { struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + if (xfs_is_zoned_inode(ip)) + return -EINVAL; + /* * Swap file activation can race against concurrent shared extent * removal in files that have been cloned. If this happens, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 55a304cb3aef..f96fbf5c54c9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1101,9 +1101,6 @@ xfs_file_write_iter( if (xfs_is_shutdown(ip->i_mount)) return -EIO; - if (IS_DAX(inode)) - return xfs_file_dax_write(iocb, from); - if (iocb->ki_flags & IOCB_ATOMIC) { if (ocount < xfs_get_atomic_write_min(ip)) return -EINVAL; @@ -1116,6 +1113,9 @@ xfs_file_write_iter( return ret; } + if (IS_DAX(inode)) + return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..df8eab11dc48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1035,7 +1035,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 07fbdcc4cbf5..bd6d33557194 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -358,9 +358,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { + if (IS_DAX(VFS_IC(ip))) + return false; + return xfs_inode_buftarg(ip)->bt_awu_max > 0; } +static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip) +{ + if (IS_DAX(VFS_IC(ip))) + return false; + + return xfs_can_sw_atomic_write(ip->i_mount); +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index fe1f74a3b6a3..e1051a530a50 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -219,7 +219,7 @@ xfs_bulk_ireq_setup( else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno) return -EINVAL; - breq->flags |= XFS_IBULK_SAME_AG; + breq->iwalk_flags |= XFS_IWALK_SAME_AG; /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 149b5460fbfd..603effabe1ee 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -616,7 +616,8 @@ xfs_get_atomic_write_min( * write of exactly one single fsblock if the bdev will make that * guarantee for us. */ - if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + if (xfs_inode_can_hw_atomic_write(ip) || + xfs_inode_can_sw_atomic_write(ip)) return mp->m_sb.sb_blocksize; return 0; @@ -633,7 +634,7 @@ xfs_get_atomic_write_max( * write of exactly one single fsblock if the bdev will make that * guarantee for us. */ - if (!xfs_can_sw_atomic_write(mp)) { + if (!xfs_inode_can_sw_atomic_write(ip)) { if (xfs_inode_can_hw_atomic_write(ip)) return mp->m_sb.sb_blocksize; return 0; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c8c9b8d8309f..2aa37a4d2706 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -307,7 +307,6 @@ xfs_bulkstat( .breq = breq, }; struct xfs_trans *tp; - unsigned int iwalk_flags = 0; int error; if (breq->idmap != &nop_mnt_idmap) { @@ -328,10 +327,7 @@ xfs_bulkstat( * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); - if (breq->flags & XFS_IBULK_SAME_AG) - iwalk_flags |= XFS_IWALK_SAME_AG; - - error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, + error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); kfree(bc.buf); @@ -457,7 +453,7 @@ xfs_inumbers( * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); - error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, + error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_inumbers_walk, breq->icount, &ic); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index f10e8f8f2335..2d0612f14d6e 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -13,17 +13,15 @@ struct xfs_ibulk { xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ unsigned int ocount; /* number of records returned */ - unsigned int flags; /* see XFS_IBULK_FLAG_* */ + unsigned int flags; /* XFS_IBULK_FLAG_* */ + unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */ }; -/* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (1U << 0) - /* Fill out the bs_extents64 field if set. */ -#define XFS_IBULK_NREXT64 (1U << 1) +#define XFS_IBULK_NREXT64 (1U << 0) /* Signal that we can return metadata directories. */ -#define XFS_IBULK_METADIR (1U << 2) +#define XFS_IBULK_METADIR (1U << 1) /* * Advance the user buffer pointer by one record of the given size. If the diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a57e21a1d3..0da19472d603 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1489,8 +1489,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2133fbaf1766..dc32c5e34d81 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -779,6 +779,25 @@ xfs_set_max_atomic_write_opt( return -EINVAL; } + if (xfs_has_reflink(mp)) + goto set_limit; + + if (new_max_fsbs == 1) { + if (mp->m_ddev_targp->bt_awu_max || + (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) { + } else { + xfs_warn(mp, + "cannot support atomic writes of size %lluk with no reflink or HW support", + new_max_bytes >> 10); + return -EINVAL; + } + } else { + xfs_warn(mp, + "cannot support atomic writes of size %lluk with no reflink support", + new_max_bytes >> 10); + return -EINVAL; + } + set_limit: error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); if (error) { diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 866c71d9fbae..73b7e72944e4 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -293,7 +293,8 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..70ee2d55e4f1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -578,19 +578,19 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_unwritten; @@ -602,13 +602,14 @@ xfs_init_mount_workqueues( goto out_destroy_reclaim; mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_inodegc_wq) goto out_destroy_blockgc; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", - XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_inodegc; @@ -778,7 +779,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void @@ -2596,8 +2597,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. */ - xfs_alloc_wq = alloc_workqueue("xfsalloc", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 0); if (!xfs_alloc_wq) return -ENOMEM; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1794e3e3156..79b8641880ab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -455,6 +455,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \ xfs_extlen_t len), \ TP_ARGS(oz, rgbno, len)) DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); TRACE_EVENT(xfs_zone_gc_select_victim, @@ -1151,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ece374d622b3..575e7028f423 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -253,8 +253,8 @@ xfs_trans_alloc( * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ retry: - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); tp = __xfs_trans_alloc(mp, flags); + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); error = xfs_trans_reserve(tp, resp, blocks, rtextents); if (error == -ENOSPC && want_retry) { xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 33f7eee521a8..f28214c28ab5 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -166,10 +166,9 @@ xfs_open_zone_mark_full( static void xfs_zone_record_blocks( struct xfs_trans *tp, - xfs_fsblock_t fsbno, - xfs_filblks_t len, struct xfs_open_zone *oz, - bool used) + xfs_fsblock_t fsbno, + xfs_filblks_t len) { struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = oz->oz_rtg; @@ -179,18 +178,37 @@ xfs_zone_record_blocks( xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); - if (used) { - rmapip->i_used_blocks += len; - ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); - } else { - xfs_add_frextents(mp, len); - } + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); oz->oz_written += len; if (oz->oz_written == rtg_blocks(rtg)) xfs_open_zone_mark_full(oz); xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); } +/* + * Called for blocks that have been written to disk, but not actually linked to + * an inode, which can happen when garbage collection races with user data + * writes to a file. + */ +static void +xfs_zone_skip_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t len) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + + trace_xfs_zone_skip_blocks(oz, 0, len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + + xfs_add_frextents(rtg_mount(rtg), len); +} + static int xfs_zoned_map_extent( struct xfs_trans *tp, @@ -250,8 +268,7 @@ xfs_zoned_map_extent( } } - xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, - true); + xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); @@ -259,8 +276,7 @@ xfs_zoned_map_extent( skip: trace_xfs_reflink_cow_remap_skip(ip, new); - xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, - false); + xfs_zone_skip_blocks(oz, new->br_blockcount); return 0; } @@ -358,44 +374,6 @@ xfs_zone_free_blocks( return 0; } -/* - * Check if the zone containing the data just before the offset we are - * writing to is still open and has space. - */ -static struct xfs_open_zone * -xfs_last_used_zone( - struct iomap_ioend *ioend) -{ - struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); - struct xfs_rtgroup *rtg = NULL; - struct xfs_open_zone *oz = NULL; - struct xfs_iext_cursor icur; - struct xfs_bmbt_irec got; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, - &icur, &got)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return NULL; - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); - if (!rtg) - return NULL; - - xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); - oz = READ_ONCE(rtg->rtg_open_zone); - if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) - oz = NULL; - xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); - - xfs_rtgroup_rele(rtg); - return oz; -} - static struct xfs_group * xfs_find_free_zone( struct xfs_mount *mp, @@ -902,12 +880,9 @@ xfs_zone_alloc_and_submit( goto out_error; /* - * If we don't have a cached zone in this write context, see if the - * last extent before the one we are writing to points to an active - * zone. If so, just continue writing to it. + * If we don't have a locally cached zone in this write context, see if + * the inode is still associated with a zone and use that if so. */ - if (!*oz && ioend->io_offset) - *oz = xfs_last_used_zone(ioend); if (!*oz) *oz = xfs_cached_zone(mp, ip); diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index 1313c55b8cbe..9cd38716fd25 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -10,6 +10,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" #include "xfs_zone_alloc.h" #include "xfs_zone_priv.h" #include "xfs_zones.h" @@ -230,6 +231,11 @@ xfs_zoned_space_reserve( error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) { + xfs_inodegc_flush(mp); + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + } if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags); if (error) |