summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/afs/callback.c4
-rw-r--r--fs/afs/dir.c223
-rw-r--r--fs/afs/dir_edit.c18
-rw-r--r--fs/afs/dir_silly.c11
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/internal.h15
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/protocol_yfs.h3
-rw-r--r--fs/afs/rotate.c17
-rw-r--r--fs/afs/server.c3
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/yfsclient.c249
-rw-r--r--fs/aio.c2
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/bcachefs/btree_write_buffer.c2
-rw-r--r--fs/bcachefs/io_read.c8
-rw-r--r--fs/bcachefs/journal_io.c2
-rw-r--r--fs/bcachefs/super.c10
-rw-r--r--fs/btrfs/block-group.c11
-rw-r--r--fs/btrfs/btrfs_inode.h7
-rw-r--r--fs/btrfs/compression.c22
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent_io.c75
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/inode.c111
-rw-r--r--fs/btrfs/qgroup.c9
-rw-r--r--fs/btrfs/ref-verify.c9
-rw-r--r--fs/btrfs/relocation.c19
-rw-r--r--fs/btrfs/space-info.c4
-rw-r--r--fs/btrfs/subpage.c19
-rw-r--r--fs/btrfs/super.c45
-rw-r--r--fs/btrfs/tree-checker.c4
-rw-r--r--fs/btrfs/tree-log.c99
-rw-r--r--fs/btrfs/verity.c2
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/zoned.c143
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/ceph/addr.c9
-rw-r--r--fs/ceph/crypto.c2
-rw-r--r--fs/ceph/debugfs.c14
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/inode.c89
-rw-r--r--fs/ceph/mds_client.c174
-rw-r--r--fs/ceph/mds_client.h18
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/configfs/mount.c2
-rw-r--r--fs/coredump.c8
-rw-r--r--fs/cramfs/inode.c11
-rw-r--r--fs/crypto/bio.c2
-rw-r--r--fs/crypto/crypto.c14
-rw-r--r--fs/crypto/fname.c11
-rw-r--r--fs/crypto/fscrypt_private.h4
-rw-r--r--fs/crypto/hooks.c2
-rw-r--r--fs/crypto/inline_crypt.c12
-rw-r--r--fs/crypto/keysetup.c43
-rw-r--r--fs/crypto/policy.c7
-rw-r--r--fs/dax.c3
-rw-r--r--fs/dcache.c5
-rw-r--r--fs/debugfs/inode.c11
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/efivarfs/super.c6
-rw-r--r--fs/erofs/Kconfig20
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/internal.h1
-rw-r--r--fs/erofs/super.c40
-rw-r--r--fs/erofs/xattr.c13
-rw-r--r--fs/erofs/zdata.c13
-rw-r--r--fs/erofs/zmap.c67
-rw-r--r--fs/eventpoll.c139
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext4/crypto.c2
-rw-r--r--fs/ext4/ext4.h8
-rw-r--r--fs/ext4/fsmap.c23
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/indirect.c4
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/ext4/orphan.c5
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/super.c20
-rw-r--r--fs/ext4/verity.c2
-rw-r--r--fs/f2fs/f2fs.h6
-rw-r--r--fs/f2fs/super.c16
-rw-r--r--fs/f2fs/verity.c2
-rw-r--r--fs/fcntl.c10
-rw-r--r--fs/fhandle.c16
-rw-r--r--fs/file.c5
-rw-r--r--fs/fs-writeback.c17
-rw-r--r--fs/fsopen.c70
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h14
-rw-r--r--fs/fuse/inode.c21
-rw-r--r--fs/fuse/passthrough.c5
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/gfs2/main.c5
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hugetlbfs/inode.c10
-rw-r--r--fs/inode.c118
-rw-r--r--fs/internal.h1
-rw-r--r--fs/ioctl.c5
-rw-r--r--fs/iomap/buffered-io.c18
-rw-r--r--fs/iomap/direct-io.c17
-rw-r--r--fs/iomap/trace.h1
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/kernfs/file.c58
-rw-r--r--fs/kernfs/inode.c4
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/locks.c4
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/mount.h12
-rw-r--r--fs/namei.c22
-rw-r--r--fs/namespace.c394
-rw-r--r--fs/netfs/buffered_read.c10
-rw-r--r--fs/netfs/buffered_write.c2
-rw-r--r--fs/netfs/direct_read.c7
-rw-r--r--fs/netfs/direct_write.c6
-rw-r--r--fs/netfs/internal.h1
-rw-r--r--fs/netfs/misc.c2
-rw-r--r--fs/netfs/objects.c32
-rw-r--r--fs/netfs/read_collect.c4
-rw-r--r--fs/netfs/read_pgpriv2.c2
-rw-r--r--fs/netfs/read_single.c2
-rw-r--r--fs/netfs/write_collect.c10
-rw-r--r--fs/netfs/write_issue.c7
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/file.c40
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c21
-rw-r--r--fs/nfs/inode.c19
-rw-r--r--fs/nfs/internal.h12
-rw-r--r--fs/nfs/io.c13
-rw-r--r--fs/nfs/localio.c21
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs42proc.c35
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c7
-rw-r--r--fs/nfs/nfs4renewd.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/pagelist.c9
-rw-r--r--fs/nfs/write.c82
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/localio.c5
-rw-r--r--fs/nfsd/vfs.c10
-rw-r--r--fs/nilfs2/sysfs.c4
-rw-r--r--fs/nilfs2/sysfs.h8
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/notify/mark.c4
-rw-r--r--fs/nsfs.c211
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c5
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/inode.c3
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/overlayfs/util.c3
-rw-r--r--fs/pidfs.c6
-rw-r--r--fs/pipe.c6
-rw-r--r--fs/pnode.c10
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/generic.c39
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c6
-rw-r--r--fs/proc/root.c98
-rw-r--r--fs/proc/task_mmu.c27
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/read_write.c14
-rw-r--r--fs/resctrl/ctrlmondata.c2
-rw-r--r--fs/resctrl/internal.h4
-rw-r--r--fs/resctrl/monitor.c6
-rw-r--r--fs/smb/client/cifs_debug.c31
-rw-r--r--fs/smb/client/cifs_spnego.c47
-rw-r--r--fs/smb/client/cifs_unicode.c3
-rw-r--r--fs/smb/client/cifsfs.c32
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h34
-rw-r--r--fs/smb/client/cifsproto.h4
-rw-r--r--fs/smb/client/cifstransport.c19
-rw-r--r--fs/smb/client/compress.c71
-rw-r--r--fs/smb/client/connect.c9
-rw-r--r--fs/smb/client/file.c18
-rw-r--r--fs/smb/client/inode.c137
-rw-r--r--fs/smb/client/misc.c38
-rw-r--r--fs/smb/client/reparse.c2
-rw-r--r--fs/smb/client/smb1ops.c4
-rw-r--r--fs/smb/client/smb2glob.h3
-rw-r--r--fs/smb/client/smb2inode.c296
-rw-r--r--fs/smb/client/smb2misc.c19
-rw-r--r--fs/smb/client/smb2ops.c49
-rw-r--r--fs/smb/client/smb2pdu.c4
-rw-r--r--fs/smb/client/smb2proto.h3
-rw-r--r--fs/smb/client/smb2transport.c1
-rw-r--r--fs/smb/client/smbdirect.c43
-rw-r--r--fs/smb/client/trace.h61
-rw-r--r--fs/smb/client/transport.c7
-rw-r--r--fs/smb/server/connection.c3
-rw-r--r--fs/smb/server/connection.h7
-rw-r--r--fs/smb/server/ksmbd_work.c2
-rw-r--r--fs/smb/server/oplock.c13
-rw-r--r--fs/smb/server/smb2pdu.c25
-rw-r--r--fs/smb/server/transport_rdma.c213
-rw-r--r--fs/smb/server/transport_rdma.h4
-rw-r--r--fs/smb/server/transport_tcp.c26
-rw-r--r--fs/smb/server/vfs_cache.h2
-rw-r--r--fs/splice.c3
-rw-r--r--fs/squashfs/super.c14
-rw-r--r--fs/super.c74
-rw-r--r--fs/ubifs/crypto.c2
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/verity/enable.c6
-rw-r--r--fs/verity/fsverity_private.h9
-rw-r--r--fs/verity/open.c23
-rw-r--r--fs/verity/verify.c4
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c7
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/scrub/trace.h2
-rw-r--r--fs/xfs/xfs_aops.c3
-rw-r--r--fs/xfs/xfs_file.c6
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_inode.h11
-rw-r--r--fs/xfs/xfs_ioctl.c2
-rw-r--r--fs/xfs/xfs_iops.c5
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_itable.h10
-rw-r--r--fs/xfs/xfs_log.c3
-rw-r--r--fs/xfs/xfs_mount.c19
-rw-r--r--fs/xfs/xfs_mru_cache.c3
-rw-r--r--fs/xfs/xfs_super.c17
-rw-r--r--fs/xfs/xfs_trace.h3
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_zone_alloc.c87
-rw-r--r--fs/xfs/xfs_zone_space_resv.c6
248 files changed, 3605 insertions, 1781 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 795c6388744c..1581ebac5bb4 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
/*
* in case of non cached mode always drop the
* inode because we want the inode attribute
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 69e1dd55b160..894d2bad6b6c 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -42,7 +42,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
}
}
@@ -90,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
if (reason != afs_cb_break_for_deleted &&
vnode->status.type == AFS_FTYPE_FILE &&
atomic_read(&vnode->cb_nr_mmap))
- queue_work(system_unbound_wq, &vnode->cb_work);
+ queue_work(system_dfl_wq, &vnode->cb_work);
trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true);
} else {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index bfb69e066672..89d36e3e5c79 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1823,7 +1823,8 @@ error:
static void afs_rename_success(struct afs_operation *op)
{
- struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+ struct afs_vnode *vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
_enter("op=%08x", op->debug_id);
@@ -1834,22 +1835,40 @@ static void afs_rename_success(struct afs_operation *op)
op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
}
+ if (op->more_files[0].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[0]);
+ if (op->more_files[1].scb.have_status)
+ afs_vnode_commit_status(op, &op->more_files[1]);
/* If we're moving a subdir between dirs, we need to update
* its DV counter too as the ".." will be altered.
*/
- if (S_ISDIR(vnode->netfs.inode.i_mode) &&
- op->file[0].vnode != op->file[1].vnode) {
- u64 new_dv;
+ if (op->file[0].vnode != op->file[1].vnode) {
+ if (S_ISDIR(vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
- write_seqlock(&vnode->cb_lock);
+ write_seqlock(&vnode->cb_lock);
- new_dv = vnode->status.data_version + 1;
- trace_afs_set_dv(vnode, new_dv);
- vnode->status.data_version = new_dv;
- inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+ new_dv = vnode->status.data_version + 1;
+ trace_afs_set_dv(vnode, new_dv);
+ vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
- write_sequnlock(&vnode->cb_lock);
+ write_sequnlock(&vnode->cb_lock);
+ }
+
+ if ((op->rename.rename_flags & RENAME_EXCHANGE) &&
+ S_ISDIR(new_vnode->netfs.inode.i_mode)) {
+ u64 new_dv;
+
+ write_seqlock(&new_vnode->cb_lock);
+
+ new_dv = new_vnode->status.data_version + 1;
+ new_vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv);
+
+ write_sequnlock(&new_vnode->cb_lock);
+ }
}
}
@@ -1900,8 +1919,8 @@ static void afs_rename_edit_dir(struct afs_operation *op)
if (S_ISDIR(vnode->netfs.inode.i_mode) &&
new_dvnode != orig_dvnode &&
test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
- afs_edit_dir_update_dotdot(vnode, new_dvnode,
- afs_edit_dir_for_rename_sub);
+ afs_edit_dir_update(vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1915,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op)
/* Now we can update d_fsdata on the dentries to reflect their
* new parent's data_version.
- *
- * Note that if we ever implement RENAME_EXCHANGE, we'll have
- * to update both dentries with opposing dir versions.
*/
afs_update_dentry_version(op, new_dvp, op->dentry);
afs_update_dentry_version(op, new_dvp, op->dentry_2);
@@ -1930,6 +1946,67 @@ static void afs_rename_edit_dir(struct afs_operation *op)
fscache_end_operation(&new_cres);
}
+static void afs_rename_exchange_edit_dir(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode *orig_dvnode = orig_dvp->vnode;
+ struct afs_vnode *new_dvnode = new_dvp->vnode;
+ struct afs_vnode *old_vnode = op->more_files[0].vnode;
+ struct afs_vnode *new_vnode = op->more_files[1].vnode;
+ struct dentry *old_dentry = op->dentry;
+ struct dentry *new_dentry = op->dentry_2;
+
+ _enter("op=%08x", op->debug_id);
+
+ if (new_dvnode == orig_dvnode) {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) {
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+ afs_edit_dir_update(orig_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+ }
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&orig_dvnode->validate_lock);
+ } else {
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta)
+ afs_edit_dir_update(orig_dvnode, &old_dentry->d_name,
+ new_vnode, afs_edit_dir_for_rename_0);
+
+ up_write(&orig_dvnode->validate_lock);
+ down_write(&new_dvnode->validate_lock);
+
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta)
+ afs_edit_dir_update(new_dvnode, &new_dentry->d_name,
+ old_vnode, afs_edit_dir_for_rename_1);
+
+ if (S_ISDIR(old_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags))
+ afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ if (S_ISDIR(new_vnode->netfs.inode.i_mode) &&
+ test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags))
+ afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode,
+ afs_edit_dir_for_rename_sub);
+
+ /* Now we can update d_fsdata on the dentries to reflect their
+ * new parents' data_version.
+ */
+ afs_update_dentry_version(op, new_dvp, old_dentry);
+ afs_update_dentry_version(op, orig_dvp, new_dentry);
+
+ d_exchange(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
+ }
+}
+
static void afs_rename_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
@@ -1948,6 +2025,32 @@ static const struct afs_operation_ops afs_rename_operation = {
.put = afs_rename_put,
};
+#if 0 /* Autoswitched in yfs_fs_rename_replace(). */
+static const struct afs_operation_ops afs_rename_replace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_replace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+#endif
+
+static const struct afs_operation_ops afs_rename_noreplace_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_noreplace,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_edit_dir,
+ .put = afs_rename_put,
+};
+
+static const struct afs_operation_ops afs_rename_exchange_operation = {
+ .issue_afs_rpc = NULL,
+ .issue_yfs_rpc = yfs_fs_rename_exchange,
+ .success = afs_rename_success,
+ .edit_dir = afs_rename_exchange_edit_dir,
+ .put = afs_rename_put,
+};
+
/*
* rename a file in an AFS filesystem and/or move it between directories
*/
@@ -1956,10 +2059,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct afs_operation *op;
- struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+ struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL;
int ret;
- if (flags)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
/* Don't allow silly-rename files be moved around. */
@@ -1969,6 +2072,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
+ if (d_is_positive(new_dentry))
+ new_vnode = AFS_FS_I(d_inode(new_dentry));
_enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1989,6 +2094,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (ret < 0)
goto error;
+ ret = -ENOMEM;
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files)
+ goto error;
+
afs_op_set_vnode(op, 0, orig_dvnode);
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
@@ -1997,46 +2107,63 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = vnode;
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = new_vnode;
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old_dentry;
op->dentry_2 = new_dentry;
+ op->rename.rename_flags = flags;
op->rename.new_negative = d_is_negative(new_dentry);
- op->ops = &afs_rename_operation;
- /* For non-directories, check whether the target is busy and if so,
- * make a copy of the dentry and then do a silly-rename. If the
- * silly-rename succeeds, the copied dentry is hashed and becomes the
- * new target.
- */
- if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
- /* To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ if (flags & RENAME_NOREPLACE) {
+ op->ops = &afs_rename_noreplace_operation;
+ } else if (flags & RENAME_EXCHANGE) {
+ op->ops = &afs_rename_exchange_operation;
+ d_drop(new_dentry);
+ } else {
+ /* If we might displace the target, we might need to do silly
+ * rename.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- op->rename.rehash = new_dentry;
- }
+ op->ops = &afs_rename_operation;
- if (d_count(new_dentry) > 2) {
- /* copy the target dentry's name */
- op->rename.tmp = d_alloc(new_dentry->d_parent,
- &new_dentry->d_name);
- if (!op->rename.tmp) {
- afs_op_nomem(op);
- goto error;
+ /* For non-directories, check whether the target is busy and if
+ * so, make a copy of the dentry and then do a silly-rename.
+ * If the silly-rename succeeds, the copied dentry is hashed
+ * and becomes the new target.
+ */
+ if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
+ /* To prevent any new references to the target during
+ * the rename, we unhash the dentry in advance.
+ */
+ if (!d_unhashed(new_dentry)) {
+ d_drop(new_dentry);
+ op->rename.rehash = new_dentry;
}
- ret = afs_sillyrename(new_dvnode,
- AFS_FS_I(d_inode(new_dentry)),
- new_dentry, op->key);
- if (ret) {
- afs_op_set_error(op, ret);
- goto error;
+ if (d_count(new_dentry) > 2) {
+ /* copy the target dentry's name */
+ op->rename.tmp = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!op->rename.tmp) {
+ afs_op_nomem(op);
+ goto error;
+ }
+
+ ret = afs_sillyrename(new_dvnode,
+ AFS_FS_I(d_inode(new_dentry)),
+ new_dentry, op->key);
+ if (ret) {
+ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+ op->dentry_2 = op->rename.tmp;
+ op->rename.rehash = NULL;
+ op->rename.new_negative = true;
}
-
- op->dentry_2 = op->rename.tmp;
- op->rename.rehash = NULL;
- op->rename.new_negative = true;
}
}
@@ -2052,6 +2179,8 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
d_drop(old_dentry);
ret = afs_do_sync_operation(op);
+ if (ret == -ENOTSUPP)
+ ret = -EINVAL;
out:
afs_dir_unuse_cookie(orig_dvnode, ret);
if (new_dvnode != orig_dvnode)
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 60a549f1d9c5..4b1342c72089 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -522,11 +522,11 @@ error:
}
/*
- * Edit a subdirectory that has been moved between directories to update the
- * ".." entry.
+ * Edit an entry in a directory to update the vnode it refers to. This is also
+ * used to update the ".." entry in a directory.
*/
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why)
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why)
{
union afs_xdr_dir_block *block;
union afs_xdr_dirent *de;
@@ -557,7 +557,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto already_invalidated;
- slot = afs_dir_scan_block(block, &dotdot_name, b);
+ slot = afs_dir_scan_block(block, name, b);
if (slot >= 0)
goto found_dirent;
@@ -566,7 +566,7 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
/* Didn't find the dirent to clobber. Download the directory again. */
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
goto out;
@@ -576,7 +576,7 @@ found_dirent:
de->u.unique = htonl(new_dvnode->fid.unique);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
- ntohl(de->u.vnode), ntohl(de->u.unique), "..");
+ ntohl(de->u.vnode), ntohl(de->u.unique), name->name);
kunmap_local(block);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
@@ -589,12 +589,12 @@ out:
already_invalidated:
kunmap_local(block);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
- 0, 0, 0, 0, "..");
+ 0, 0, 0, 0, name->name);
goto out;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 0b80eb93fa40..014495d4b868 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
if (IS_ERR(op))
return PTR_ERR(op);
+ op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL);
+ if (!op->more_files) {
+ afs_put_operation(op);
+ return -ENOMEM;
+ }
+
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, dvnode);
op->file[0].dv_delta = 1;
@@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
+ op->more_files[0].vnode = AFS_FS_I(d_inode(old));
+ op->more_files[0].speculative = true;
+ op->more_files[1].vnode = AFS_FS_I(d_inode(new));
+ op->more_files[1].speculative = true;
+ op->nr_files = 4;
op->dentry = old;
op->dentry_2 = new;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e9538e91f848..e1cb17b85791 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode)
_enter("");
if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
- return generic_delete_inode(inode);
+ return inode_just_drop(inode);
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1124ea4000cb..444a3ea4fdf6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -562,6 +562,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
+#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
@@ -891,9 +892,10 @@ struct afs_operation {
bool need_rehash;
} unlink;
struct {
- struct dentry *rehash;
- struct dentry *tmp;
- bool new_negative;
+ struct dentry *rehash;
+ struct dentry *tmp;
+ unsigned int rename_flags;
+ bool new_negative;
} rename;
struct {
struct netfs_io_subrequest *subreq;
@@ -1100,8 +1102,8 @@ int afs_single_writepages(struct address_space *mapping,
extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
-void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
- enum afs_edit_dir_reason why);
+void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
+ struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
/*
@@ -1693,6 +1695,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *);
extern void yfs_fs_link(struct afs_operation *);
extern void yfs_fs_symlink(struct afs_operation *);
extern void yfs_fs_rename(struct afs_operation *);
+void yfs_fs_rename_replace(struct afs_operation *op);
+void yfs_fs_rename_noreplace(struct afs_operation *op);
+void yfs_fs_rename_exchange(struct afs_operation *op);
extern void yfs_fs_store_data(struct afs_operation *);
extern void yfs_fs_setattr(struct afs_operation *);
extern void yfs_fs_get_volume_status(struct afs_operation *);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 02475d415d88..e6bb8237db98 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -169,13 +169,13 @@ static int __init afs_init(void)
printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
- afs_wq = alloc_workqueue("afs", 0, 0);
+ afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0);
if (!afs_wq)
goto error_afs_wq;
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!afs_async_calls)
goto error_async;
- afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+ afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!afs_lock_manager)
goto error_lockmgr;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 8f2b3a177690..c8a7f266080d 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -131,6 +131,7 @@ int afs_abort_to_error(u32 abort_code)
case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG;
case RXGEN_OPCODE: return -ENOTSUPP;
+ case RX_INVALID_OPERATION: return -ENOTSUPP;
default: return -EREMOTEIO;
}
diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h
index e4cd89c44c46..b2f06c1917c2 100644
--- a/fs/afs/protocol_yfs.h
+++ b/fs/afs/protocol_yfs.h
@@ -50,6 +50,9 @@ enum YFS_FS_Operations {
YFSREMOVEACL = 64171,
YFSREMOVEFILE2 = 64173,
YFSSTOREOPAQUEACL2 = 64174,
+ YFSRENAME_REPLACE = 64176,
+ YFSRENAME_NOREPLACE = 64177,
+ YFSRENAME_EXCHANGE = 64187,
YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */
YFSFETCHDATA64 = 64537, /* YFS Fetch file data */
YFSSTOREDATA64 = 64538, /* YFS Store file data */
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a1c24f589d9e..6a4e7da10fc4 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -432,6 +432,16 @@ bool afs_select_fileserver(struct afs_operation *op)
afs_op_set_error(op, -EDQUOT);
goto failed_but_online;
+ case RX_INVALID_OPERATION:
+ case RXGEN_OPCODE:
+ /* Handle downgrading to an older operation. */
+ afs_op_set_error(op, -ENOTSUPP);
+ if (op->flags & AFS_OPERATION_DOWNGRADE) {
+ op->flags &= ~AFS_OPERATION_DOWNGRADE;
+ goto go_again;
+ }
+ goto failed_but_online;
+
default:
afs_op_accumulate_error(op, error, abort_code);
failed_but_online:
@@ -620,12 +630,13 @@ iterate_address:
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
- op->volsync.creation = TIME64_MIN;
- op->volsync.update = TIME64_MIN;
- op->call_responded = false;
_debug("address [%u] %u/%u %pISp",
op->server_index, addr_index, alist->nr_addrs,
rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
+go_again:
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
_leave(" = t");
return true;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index a97562f831eb..c4428ebddb1d 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -331,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int a, debug_id = server->debug_id;
+ unsigned int a, debug_id;
bool zero;
int r;
if (!server)
return;
+ debug_id = server->debug_id;
a = atomic_read(&server->active);
zero = __refcount_dec_and_test(&server->ref, &r);
trace_afs_server(debug_id, r - 1, a, reason);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2e7526ea883a..93ad86ff3345 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -172,7 +172,7 @@ static void afs_issue_write_worker(struct work_struct *work)
void afs_issue_write(struct netfs_io_subrequest *subreq)
{
subreq->work.func = afs_issue_write_worker;
- if (!queue_work(system_unbound_wq, &subreq->work))
+ if (!queue_work(system_dfl_wq, &subreq->work))
WARN_ON_ONCE(1);
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 257af259c04a..febf13a49f0b 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1042,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op)
_enter("");
+ if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags))
+ return yfs_fs_rename_replace(op);
+
call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename,
sizeof(__be32) +
sizeof(struct yfs_xdr_RPCFlags) +
@@ -1071,6 +1074,252 @@ void yfs_fs_rename(struct afs_operation *op)
}
/*
+ * Deliver reply data to a YFS.Rename_NoReplace operation. This does not
+ * return the status of a displaced target inode as there cannot be one.
+ */
+static int yfs_deliver_fs_rename_1(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange
+ * operation. These return the status of the displaced target inode if there
+ * was one.
+ */
+static int yfs_deliver_fs_rename_2(struct afs_call *call)
+{
+ struct afs_operation *op = call->op;
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ struct afs_vnode_param *old_vp = &op->more_files[0];
+ struct afs_vnode_param *new_vp = &op->more_files[1];
+ const __be32 *bp;
+ int ret;
+
+ _enter("{%u}", call->unmarshall);
+
+ ret = afs_transfer_reply(call);
+ if (ret < 0)
+ return ret;
+
+ bp = call->buffer;
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb);
+ xdr_decode_YFSFid(&bp, &old_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb);
+ xdr_decode_YFSFid(&bp, &new_vp->fid);
+ xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb);
+ xdr_decode_YFSVolSync(&bp, &op->volsync);
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+static void yfs_done_fs_rename_replace(struct afs_call *call)
+{
+ if (call->error == -ECONNABORTED &&
+ (call->abort_code == RX_INVALID_OPERATION ||
+ call->abort_code == RXGEN_OPCODE)) {
+ set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags);
+ call->op->flags |= AFS_OPERATION_DOWNGRADE;
+ }
+}
+
+/*
+ * YFS.Rename_Replace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Replace = {
+ .name = "FS.Rename_Replace",
+ .op = yfs_FS_Rename_Replace,
+ .deliver = yfs_deliver_fs_rename_2,
+ .done = yfs_done_fs_rename_replace,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_NoReplace operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_NoReplace = {
+ .name = "FS.Rename_NoReplace",
+ .op = yfs_FS_Rename_NoReplace,
+ .deliver = yfs_deliver_fs_rename_1,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * YFS.Rename_Exchange operation type
+ */
+static const struct afs_call_type yfs_RXYFSRename_Exchange = {
+ .name = "FS.Rename_Exchange",
+ .op = yfs_FS_Rename_Exchange,
+ .deliver = yfs_deliver_fs_rename_2,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * Rename a file or directory, replacing the target if it exists. The status
+ * of a displaced target is returned.
+ */
+void yfs_fs_rename_replace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_REPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Rename a file or directory, failing if the target dirent exists.
+ */
+void yfs_fs_rename_noreplace(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
+ * Exchange a pair of files directories.
+ */
+void yfs_fs_rename_exchange(struct afs_operation *op)
+{
+ struct afs_vnode_param *orig_dvp = &op->file[0];
+ struct afs_vnode_param *new_dvp = &op->file[1];
+ const struct qstr *orig_name = &op->dentry->d_name;
+ const struct qstr *new_name = &op->dentry_2->d_name;
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter("");
+
+ call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange,
+ sizeof(__be32) +
+ sizeof(struct yfs_xdr_RPCFlags) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(orig_name->len) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ xdr_strlen(new_name->len),
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSFid) +
+ sizeof(struct yfs_xdr_YFSFetchStatus) +
+ sizeof(struct yfs_xdr_YFSVolSync));
+ if (!call)
+ return afs_op_nomem(op);
+
+ /* Marshall the parameters. */
+ bp = call->request;
+ bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE);
+ bp = xdr_encode_u32(bp, 0); /* RPC flags */
+ bp = xdr_encode_YFSFid(bp, &orig_dvp->fid);
+ bp = xdr_encode_name(bp, orig_name);
+ bp = xdr_encode_YFSFid(bp, &new_dvp->fid);
+ bp = xdr_encode_name(bp, new_name);
+ yfs_check_req(call, bp);
+
+ call->fid = orig_dvp->fid;
+ trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
+ afs_make_op_call(op, call, GFP_NOFS);
+}
+
+/*
* YFS.StoreData64 operation type.
*/
static const struct afs_call_type yfs_RXYFSStoreData64 = {
diff --git a/fs/aio.c b/fs/aio.c
index 7fc7b6221312..6002617f078c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -636,7 +636,7 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
/* Synchronize against RCU protected table->table[] dereferences */
INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
- queue_rcu_work(system_wq, &ctx->free_rwork);
+ queue_rcu_work(system_percpu_wq, &ctx->free_rwork);
}
/*
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 1d847a939f29..180a458fc4f7 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -129,7 +129,7 @@ struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *n
}
return inode;
}
-EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
+EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 4b095235a0d2..0afb44ce1a85 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -827,7 +827,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_
if (bch2_btree_write_buffer_should_flush(c) &&
__enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
- !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+ !queue_work(system_dfl_wq, &c->btree_write_buffer.flush_work))
enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
if (dst->wb == &wb->flushing)
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index e0874ad9a6cf..460e2e6341f1 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -684,7 +684,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio,
if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ RBIO_CONTEXT_UNBOUND, system_dfl_wq);
} else {
rbio = bch2_rbio_free(rbio);
@@ -921,10 +921,10 @@ csum_err:
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
goto out;
decompression_err:
- bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq);
goto out;
decrypt_err:
- bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_dfl_wq);
goto out;
}
@@ -963,7 +963,7 @@ static void bch2_read_endio(struct bio *bio)
rbio->promote ||
crc_is_compressed(rbio->pick.crc) ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+ context = RBIO_CONTEXT_UNBOUND, wq = system_dfl_wq;
else if (rbio->pick.crc.csum_type)
context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 9e028dbcc3d0..29bea8e0e495 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1362,7 +1362,7 @@ int bch2_journal_read(struct bch_fs *c,
BCH_DEV_READ_REF_journal_read))
closure_call(&ca->journal.read,
bch2_journal_read_device,
- system_unbound_wq,
+ system_dfl_wq,
&jlist.cl);
else
degraded = true;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index c46b1053a02c..f2417d298b84 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -801,13 +801,13 @@ int bch2_fs_init_rw(struct bch_fs *c)
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_PERCPU, 1)) ||
!(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
- WQ_FREEZABLE, 0)))
+ WQ_FREEZABLE|WQ_PERCPU, 0)))
return bch_err_throw(c, ENOMEM_fs_other_alloc);
int ret = bch2_fs_btree_interior_update_init(c) ?:
@@ -975,7 +975,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
sizeof(struct sort_iter_set);
if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_PERCPU, 512)) ||
enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
bch2_writes_disabled) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9bf282d2453c..fcd274d83fd7 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
- return bg1->used > bg2->used;
+ /*
+ * Some other task may be updating the ->used field concurrently, but it
+ * is not serious if we get a stale value or load/store tearing issues,
+ * as sorting the list of block groups to reclaim is not critical and an
+ * occasional imperfect order is ok. So silence KCSAN and avoid the
+ * overhead of locking or any other synchronization.
+ */
+ return data_race(bg1->used > bg2->used);
}
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -2031,7 +2038,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
- queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b99fb0273292..3bb504c1e32a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -248,7 +248,7 @@ struct btrfs_inode {
u64 new_delalloc_bytes;
/*
* The offset of the last dir index key that was logged.
- * This is used only for directories.
+ * This is used only for directories. Protected by 'log_mutex'.
*/
u64 last_dir_index_offset;
};
@@ -338,6 +338,11 @@ struct btrfs_inode {
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d09d622016ef..35e3071cec06 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1616,25 +1616,29 @@ out:
}
/*
- * Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level. Negative level
- * numbers are allowed.
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to level.
+ *
+ * If the resulting level exceeds the algo's supported levels, it will be clamped.
+ *
+ * Return <0 if no valid string can be found.
+ * Return 0 if everything is fine.
*/
-int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
{
int level = 0;
int ret;
- if (!type)
+ if (!type) {
+ *level_ret = btrfs_compress_set_level(type, level);
return 0;
+ }
if (str[0] == ':') {
ret = kstrtoint(str + 1, 10, &level);
if (ret)
- level = 0;
+ return ret;
}
- level = btrfs_compress_set_level(type, level);
-
- return level;
+ *level_ret = btrfs_compress_set_level(type, level);
+ return 0;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1b38e707bbd9..7b41b2b5ff44 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -102,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret);
struct folio *btrfs_alloc_compr_folio(void);
void btrfs_free_compr_folio(struct folio *folio);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0f8d8e275143..c0c1ddd46b67 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
struct inode *vfs_inode = &inode->vfs_inode;
@@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 70fc4e7cc5a0..aa4393eba997 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1958,7 +1958,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
+ unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 835b0deef9bb..b21cb72835cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -111,6 +111,24 @@ struct btrfs_bio_ctrl {
*/
unsigned long submit_bitmap;
struct readahead_control *ractl;
+
+ /*
+ * The start offset of the last used extent map by a read operation.
+ *
+ * This is for proper compressed read merge.
+ * U64_MAX means we are starting the read and have made no progress yet.
+ *
+ * The current btrfs_bio_is_contig() only uses disk_bytenr as
+ * the condition to check if the read can be merged with previous
+ * bio, which is not correct. E.g. two file extents pointing to the
+ * same extent but with different offset.
+ *
+ * So here we need to do extra checks to only merge reads that are
+ * covered by the same extent map.
+ * Just extent_map::start will be enough, as they are unique
+ * inside the same inode.
+ */
+ u64 last_em_start;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (compress_type != BTRFS_COMPRESS_NONE &&
- prev_em_start && *prev_em_start != (u64)-1 &&
- *prev_em_start != em->start)
+ bio_ctrl->last_em_start != U64_MAX &&
+ bio_ctrl->last_em_start != em->start)
force_bio_submit = true;
- if (prev_em_start)
- *prev_em_start = em->start;
+ bio_ctrl->last_em_start = em->start;
btrfs_free_extent_map(em);
em = NULL;
@@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
const u64 start = folio_pos(folio);
const u64 end = start + folio_size(folio) - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
+ .last_em_start = U64_MAX,
+ };
struct extent_map *em_cached = NULL;
int ret;
lock_extents_for_read(inode, start, end, &cached_state);
- ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_free_extent_map(em_cached);
@@ -1512,7 +1532,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors, and the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1535,8 +1555,17 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ /*
+ * When submission failed, we should still clear the folio dirty.
+ * Or the folio will be written back again but without any
+ * ordered extent.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
return PTR_ERR(em);
+ }
extent_offset = filepos - em->start;
em_end = btrfs_extent_map_end(em);
@@ -1609,8 +1638,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
folio_unlock(folio);
return 1;
}
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_folio_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
return ret;
+ }
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
@@ -1666,8 +1699,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
- * If we hit any error, the corresponding sector will still be dirty
- * thus no need to clear PAGECACHE_TAG_DIRTY.
+ * If we hit any error, the corresponding sector will have its dirty
+ * flag cleared and writeback finished, thus no need to handle the error case.
*/
if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
@@ -1813,6 +1846,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
xas_load(&xas);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
@@ -2569,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = {
.opf = REQ_OP_READ | REQ_RAHEAD,
- .ractl = rac
+ .ractl = rac,
+ .last_em_start = U64_MAX,
};
struct folio *folio;
struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
@@ -2577,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac)
const u64 end = start + readahead_length(rac) - 1;
struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
- u64 prev_em_start = (u64)-1;
lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
- btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl);
btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
@@ -4331,15 +4365,18 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
int ret;
- xa_lock_irq(&fs_info->buffer_tree);
+ rcu_read_lock();
xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
/*
* The same as try_release_extent_buffer(), to ensure the eb
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
+ rcu_read_unlock();
+
if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
+ rcu_read_lock();
continue;
}
@@ -4358,11 +4395,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
- xa_unlock_irq(&fs_info->buffer_tree);
release_extent_buffer(eb);
- xa_lock_irq(&fs_info->buffer_tree);
+ rcu_read_lock();
}
- xa_unlock_irq(&fs_info->buffer_tree);
+ rcu_read_unlock();
/*
* Finally to check if we have cleared folio private, as if we have
@@ -4375,7 +4411,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
ret = 0;
spin_unlock(&folio->mapping->i_private_lock);
return ret;
-
}
int try_release_extent_buffer(struct folio *folio)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 57f52585a6dd..9a5a497edc97 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1372,7 +1372,7 @@ void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
- queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+ queue_work(system_dfl_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b77dd22b8cdb..27942d0ad9de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -401,10 +401,12 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
while (index <= end_index) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
- index++;
- if (IS_ERR(folio))
+ if (IS_ERR(folio)) {
+ index++;
continue;
+ }
+ index = folio_end(folio) >> PAGE_SHIFT;
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
@@ -2013,7 +2015,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio
* cleaered by the caller.
*/
if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, file_pos, end);
+ btrfs_cleanup_ordered_extents(inode, file_pos, len);
return ret;
}
@@ -3883,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(inode);
- if (ret)
- goto out;
-
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3918,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(inode, 0,
- round_up(i_size_read(vfs_inode), fs_info->sectorsize));
inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
@@ -3951,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path
btrfs_set_inode_mapping_order(inode);
cache_index:
+ ret = btrfs_init_file_extent_tree(inode);
+ if (ret)
+ goto out;
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
@@ -4187,6 +4188,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+ struct timespec64 now;
+
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+ return;
+
+ now = inode_set_ctime_current(&dir->vfs_inode);
+ inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
@@ -4287,7 +4305,7 @@ skip_backref:
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ update_time_after_link_or_unlink(dir);
return btrfs_update_inode(trans, dir);
}
@@ -4538,7 +4556,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- if (atomic_read(&inode->vfs_inode.i_count) > 1)
+ if (icount_read(&inode->vfs_inode) > 1)
d_prune_aliases(&inode->vfs_inode);
min_ino = btrfs_ino(inode) + 1;
@@ -5677,7 +5695,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
bool empty = false;
xa_lock(&root->inodes);
- entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ /*
+ * This btrfs_inode is being freed and has already been unhashed at this
+ * point. It's possible that another btrfs_inode has already been
+ * allocated for the same inode and inserted itself into the root, so
+ * don't delete it in that case.
+ *
+ * Note that this shouldn't need to allocate memory, so the gfp flags
+ * don't really matter.
+ */
+ entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
+ GFP_ATOMIC);
if (entry == inode)
empty = xa_empty(&root->inodes);
xa_unlock(&root->inodes);
@@ -6681,15 +6709,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- /*
- * If we are replaying a log tree, we do not want to update the mtime
- * and ctime of the parent directory with the current time, since the
- * log replay procedure is responsible for setting them to their correct
- * values (the ones it had when the fsync was done).
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- inode_set_mtime_to_ts(&parent_inode->vfs_inode,
- inode_set_ctime_current(&parent_inode->vfs_inode));
+ update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)
@@ -6794,7 +6814,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct fscrypt_name fname;
u64 index;
int ret;
- int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6826,44 +6845,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
- inc_nlink(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
+ if (ret)
+ goto fail;
+ /* Link added now we update the inode item with the new link count. */
+ inc_nlink(inode);
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
- drop_inode = 1;
- } else {
- struct dentry *parent = dentry->d_parent;
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
- if (ret)
+ if (inode->i_nlink == 1) {
+ /*
+ * If the new hard link count is 1, it's a file created with the
+ * open(2) O_TMPFILE flag.
+ */
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto fail;
- if (inode->i_nlink == 1) {
- /*
- * If new hard link count is 1, it's a file created
- * with open(2) O_TMPFILE flag.
- */
- ret = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (ret)
- goto fail;
}
- d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
+ /* Grab reference for the new dentry passed to d_instantiate(). */
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
+
fail:
fscrypt_free_filename(&fname);
if (trans)
btrfs_end_transaction(trans);
- if (drop_inode) {
- inode_dec_link_count(inode);
- iput(inode);
- }
btrfs_btree_balance_dirty(fs_info);
return ret;
}
@@ -7819,6 +7838,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ /* new_delalloc_bytes and last_dir_index_offset are in a union. */
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
@@ -7953,7 +7973,7 @@ int btrfs_drop_inode(struct inode *inode)
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
static void init_once(void *foo)
@@ -7961,6 +7981,9 @@ static void init_once(void *foo)
struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
void __cold btrfs_destroy_cachep(void)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1a5972178b3a..da102da169fd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1453,9 +1453,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup *cur;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
+ u64 num_bytes_cmpr = src->excl_cmpr;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1463,15 +1463,16 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
goto out;
qgroup_iterator_add(&qgroup_list, qgroup);
- list_for_each_entry(cur, &qgroup_list, iterator) {
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes_cmpr;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes_cmpr;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 3871c3a6c743..9f1858b42c0e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
+ extent_root = btrfs_extent_root(fs_info, 0);
+ /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */
+ if (IS_ERR(extent_root)) {
+ btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling");
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- extent_root = btrfs_extent_root(fs_info, 0);
eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e58151933844..7256f6748c8f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -602,6 +602,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
+ objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
+ ret = -EUCLEAN;
+ goto fail;
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 0481c693ac2e..c573d80550ad 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1830,7 +1830,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq, async_work);
+ queue_work(system_dfl_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1847,7 +1847,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
need_preemptive_reclaim(fs_info, space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
- queue_work(system_unbound_wq,
+ queue_work(system_dfl_wq,
&fs_info->preempt_reclaim_work);
}
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index c9b3821957f7..cb4f97833dc3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&bfs->lock, flags);
bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+
+ /*
+ * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+ * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+ * assume writeback is complete, and exit too early — violating sync
+ * ordering guarantees.
+ */
if (!folio_test_writeback(folio))
- folio_start_writeback(folio);
+ __folio_start_writeback(folio, true);
+ if (!folio_test_dirty(folio)) {
+ struct address_space *mapping = folio_mapping(folio);
+ XA_STATE(xas, &mapping->i_pages, folio->index);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+ }
spin_unlock_irqrestore(&bfs->lock, flags);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 68e35a3700ff..b06b8f325537 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -88,6 +88,9 @@ struct btrfs_fs_context {
refcount_t refs;
};
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old);
+
enum {
Opt_acl,
Opt_clear_cache,
@@ -273,6 +276,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
const struct fs_parameter *param, int opt)
{
const char *string = param->string;
+ int ret;
/*
* Provide the same semantics as older kernels that don't use fs
@@ -291,21 +295,30 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zlib", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ } else if (btrfs_match_compress_type(string, "lzo", true)) {
ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
+ if (string[3] == ':' && string[4])
+ btrfs_warn(NULL, "Compression level ignored for LZO");
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
} else if (btrfs_match_compress_type(string, "zstd", true)) {
ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- string + 4);
+ ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4,
+ &ctx->compress_level);
+ if (ret < 0)
+ goto error;
btrfs_set_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, NODATACOW);
btrfs_clear_opt(ctx->mount_opt, NODATASUM);
@@ -316,10 +329,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
btrfs_clear_opt(ctx->mount_opt, COMPRESS);
btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
} else {
- btrfs_err(NULL, "unrecognized compression value %s", string);
- return -EINVAL;
+ ret = -EINVAL;
+ goto error;
}
return 0;
+error:
+ btrfs_err(NULL, "failed to parse compression option '%s'", string);
+ return ret;
+
}
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -698,12 +715,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
- btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
- if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
- btrfs_info(info, "using free-space-tree");
}
return ret;
@@ -980,6 +994,8 @@ static int btrfs_fill_super(struct super_block *sb,
return ret;
}
+ btrfs_emit_options(fs_info, NULL);
+
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -1077,7 +1093,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
- if (info->compress_level)
+ if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
@@ -1437,7 +1453,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
- btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1459,10 +1475,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+ btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
- btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 0f556f4de3f9..a997c7cc35a2 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1756,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (unlikely(ptr + sizeof(iref) > end)) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
- ptr, end, sizeof(iref));
+ ptr, end, sizeof(*iref));
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2186e87fb61b..7a63afedd01e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
search_key.objectid = log_key.objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = key->objectid;
+ search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len);
ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
if (ret < 0) {
goto out;
@@ -2605,14 +2605,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
/*
* Correctly adjust the reserved bytes occupied by a log tree extent buffer
*/
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
+ return -ENOENT;
}
spin_lock(&cache->space_info->lock);
@@ -2623,27 +2623,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
+
+ return 0;
}
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
-
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
wait_on_extent_buffer_writeback(eb);
btrfs_tree_unlock(eb);
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb);
- if (ret)
- return ret;
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
- }
+ if (trans)
+ return btrfs_pin_reserved_extent(trans, eb);
- return 0;
+ return unaccount_log_buffer(eb->fs_info, eb->start);
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -3345,6 +3340,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
+static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ bool ret = false;
+
+ /*
+ * Do this only if ->logged_trans is still 0 to prevent races with
+ * concurrent logging as we may see the inode not logged when
+ * inode_logged() is called but it gets logged after inode_logged() did
+ * not find it in the log tree and we end up setting ->logged_trans to a
+ * value less than trans->transid after the concurrent logging task has
+ * set it to trans->transid. As a consequence, subsequent rename, unlink
+ * and link operations may end up not logging new names and removing old
+ * names from the log.
+ */
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == 0)
+ inode->logged_trans = trans->transid - 1;
+ else if (inode->logged_trans == trans->transid)
+ ret = true;
+ spin_unlock(&inode->lock);
+
+ return ret;
+}
+
/*
* Check if an inode was logged in the current transaction. This correctly deals
* with the case where the inode was logged but has a logged_trans of 0, which
@@ -3362,15 +3382,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- if (inode->logged_trans == trans->transid)
+ /*
+ * Quick lockless call, since once ->logged_trans is set to the current
+ * transaction, we never set it to a lower value anywhere else.
+ */
+ if (data_race(inode->logged_trans) == trans->transid)
return 1;
/*
- * If logged_trans is not 0, then we know the inode logged was not logged
- * in this transaction, so we can return false right away.
+ * If logged_trans is not 0 and not trans->transid, then we know the
+ * inode was not logged in this transaction, so we can return false
+ * right away. We take the lock to avoid a race caused by load/store
+ * tearing with a concurrent btrfs_log_inode() call or a concurrent task
+ * in this function further below - an update to trans->transid can be
+ * teared into two 32 bits updates for example, in which case we could
+ * see a positive value that is not trans->transid and assume the inode
+ * was not logged when it was.
*/
- if (inode->logged_trans > 0)
+ spin_lock(&inode->lock);
+ if (inode->logged_trans == trans->transid) {
+ spin_unlock(&inode->lock);
+ return 1;
+ } else if (inode->logged_trans > 0) {
+ spin_unlock(&inode->lock);
return 0;
+ }
+ spin_unlock(&inode->lock);
/*
* If no log tree was created for this root in this transaction, then
@@ -3379,10 +3416,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* transaction's ID, to avoid the search below in a future call in case
* a log tree gets created after this.
*/
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
- inode->logged_trans = trans->transid - 1;
- return 0;
- }
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return mark_inode_as_not_logged(trans, inode);
/*
* We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3436,8 +3471,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* Set logged_trans to a value greater than 0 and less then the
* current transaction to avoid doing the search in future calls.
*/
- inode->logged_trans = trans->transid - 1;
- return 0;
+ return mark_inode_as_not_logged(trans, inode);
}
/*
@@ -3445,20 +3479,9 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
* the current transacion's ID, to avoid future tree searches as long as
* the inode is not evicted again.
*/
+ spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
-
- /*
- * If it's a directory, then we must set last_dir_index_offset to the
- * maximum possible value, so that the next attempt to log the inode does
- * not skip checking if dir index keys found in modified subvolume tree
- * leaves have been logged before, otherwise it would result in attempts
- * to insert duplicate dir index keys in the log tree. This must be done
- * because last_dir_index_offset is an in-memory only field, not persisted
- * in the inode item or any other on-disk structure, so its value is lost
- * once the inode is evicted.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode))
- inode->last_dir_index_offset = (u64)-1;
+ spin_unlock(&inode->lock);
return 1;
}
@@ -4050,7 +4073,7 @@ done:
/*
* If the inode was logged before and it was evicted, then its
- * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * last_dir_index_offset is 0, so we don't know the value of the last index
* key offset. If that's the case, search for it and update the inode. This
* is to avoid lookups in the log tree every time we try to insert a dir index
* key from a leaf changed in the current transaction, and to allow us to always
@@ -4066,7 +4089,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,
lockdep_assert_held(&inode->log_mutex);
- if (inode->last_dir_index_offset != (u64)-1)
+ if (inode->last_dir_index_offset != 0)
return 0;
if (!ctx->logged_before) {
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index b7a96a005487..4633cbcfcdb9 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations btrfs_verityops = {
+ .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) -
+ (int)offsetof(struct btrfs_inode, vfs_inode),
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fa7a929a0461..c6e3efd6f602 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error;
}
+ if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = true;
down_write(&sb->s_umount);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 245e813ecd78..71cf9be75c42 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -42,6 +43,9 @@
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum of active zones we need:
*
@@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -507,6 +514,11 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (max_active_zones) {
if (nactive > max_active_zones) {
+ if (bdev_max_active_zones(bdev) == 0) {
+ max_active_zones = 0;
+ zone_info->max_active_zones = 0;
+ goto validate;
+ }
btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
nactive, rcu_dereference(device->name),
@@ -519,6 +531,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
}
+validate:
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -2168,10 +2181,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2230,7 +2248,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
struct extent_buffer *eb;
- unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@@ -2245,6 +2263,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2329,31 +2381,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (!device->bdev)
- continue;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2488,7 +2521,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
- queue_work(system_unbound_wq, &bg->zone_finish_work);
+ queue_work(system_dfl_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2504,12 +2537,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
+ struct btrfs_space_info *space_info = data_sinfo;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *bg;
struct list_head *bg_list;
u64 alloc_flags;
- bool initial = false;
+ bool first = true;
bool did_chunk_alloc = false;
int index;
int ret;
@@ -2523,21 +2556,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
if (sb_rdonly(fs_info->sb))
return;
- ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
index = btrfs_bg_flags_to_raid_index(alloc_flags);
- bg_list = &data_sinfo->block_groups[index];
+ /* Scan the data space_info to find empty block groups. Take the second one. */
again:
+ bg_list = &space_info->block_groups[index];
list_for_each_entry(bg, bg_list, list) {
- if (bg->used > 0)
+ if (bg->alloc_offset != 0)
continue;
- if (!initial) {
- initial = true;
+ if (first) {
+ first = false;
continue;
}
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+ /* We can assume this as we choose the second empty one. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ space_info->disk_total -= bg->zone_unusable;
+ /* There is no allocation ever happened. */
+ ASSERT(bg->used == 0);
+ /* No super block in a block group on the zoned setup. */
+ ASSERT(bg->bytes_super == 0);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
fs_info->data_reloc_bg = bg->start;
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
btrfs_zone_activate(bg);
@@ -2552,11 +2616,18 @@ again:
if (IS_ERR(trans))
return;
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
did_chunk_alloc = true;
- bg_list = &space_info->block_groups[index];
goto again;
}
}
@@ -2650,7 +2721,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
diff --git a/fs/buffer.c b/fs/buffer.c
index ead4dc85debd..6a8752f7bbed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -157,8 +157,8 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
- __end_buffer_read_notouch(bh, uptodate);
put_bh(bh);
+ __end_buffer_read_notouch(bh, uptodate);
}
EXPORT_SYMBOL(end_buffer_read_sync);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b202d789e93..322ed268f14a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
0,
gfp_flags);
if (IS_ERR(pages[index])) {
- if (PTR_ERR(pages[index]) == -EINVAL) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
pr_err_client(cl, "inode->i_blkbits=%hhu\n",
inode->i_blkbits);
}
@@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
BUG_ON(ceph_wbc->locked_pages == 0);
pages[index] = NULL;
- return PTR_ERR(pages[index]);
+ return err;
}
} else {
pages[index] = &folio->page;
@@ -1687,6 +1689,7 @@ get_more_pages:
process_folio_batch:
rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
if (rc)
goto release_folios;
@@ -1695,8 +1698,6 @@ process_folio_batch:
goto release_folios;
if (ceph_wbc.processed_in_fbatch) {
- ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
-
if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
ceph_wbc.locked_pages < ceph_wbc.max_pages) {
doutc(cl, "reached end fbatch, trying for more\n");
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index cab722619207..7026e794813c 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) -
+ (int)offsetof(struct ceph_inode_info, netfs.inode),
.needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8478e7e75df6..32973c62c1a2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c02f100f8552..978acd3d4b32 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fc543075b827..949f0badc944 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
+ ci->i_crypt_info = NULL;
ci->fscrypt_auth = NULL;
ci->fscrypt_auth_len = 0;
#endif
@@ -1523,6 +1570,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1584,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1603,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1619,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1631,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1650,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1665,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1794,6 +1849,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
@@ -2487,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0f497c39ff82..73da2648fa0f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
int count;
dput(dentry);
d_prune_aliases(inode);
- count = atomic_read(&inode->i_count);
+ count = icount_read(inode);
if (count == 1)
(*remaining)--;
doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
@@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* ceph_mdsc_build_path - build a path string to a given dentry
* @mdsc: mds client
* @dentry: dentry to which path should be built
- * @plen: returned length of string
- * @pbase: returned base inode number
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
* @for_wire: is this path going to be sent to the MDS?
*
* Build a string that represents the path to the dentry. This is mostly called
@@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2810,16 +2809,28 @@ retry:
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4706,7 +4744,7 @@ encode_again:
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4730,7 +4768,7 @@ encode_again:
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4741,17 +4779,17 @@ out_freeflocks:
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c3eb651862c5..db6c2db68f96 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -862,7 +862,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
if (!fsc->inode_wq)
goto fail_client;
- fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1);
+ fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1);
if (!fsc->cap_wq)
goto fail_inode_wq;
@@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cf176aab0f82..25d8bacbcf44 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -463,6 +463,7 @@ struct ceph_inode_info {
unsigned long i_work_mask;
#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
u32 fscrypt_auth_len;
u32 fscrypt_file_len;
u8 *fscrypt_auth;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 740f18b60c9d..456c4a2efb53 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode)
static const struct super_operations configfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.free_inode = configfs_free_inode,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index fedbead956ed..0d9a5d07a75d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -345,7 +345,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
was_space = false;
err = cn_printf(cn, "%c", '\0');
if (err)
- return err;
+ return false;
(*argv)[(*argc)++] = cn->used;
}
}
@@ -635,7 +635,7 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
/*
* Usermode helpers are childen of either
- * system_unbound_wq or of kthreadd. So we know that
+ * system_dfl_wq or of kthreadd. So we know that
* we're starting off with a clean file descriptor
* table. So we should always be able to use
* COREDUMP_PIDFD_NUMBER as our file descriptor value.
@@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
ssize_t retval;
char old_core_pattern[CORENAME_MAX_SIZE];
+ if (write)
+ return proc_dostring(table, write, buffer, lenp, ppos);
+
retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
error = proc_dostring(table, write, buffer, lenp, ppos);
if (error)
return error;
+
if (!check_coredump_socket()) {
strscpy(core_pattern, old_core_pattern, retval + 1);
return -EINVAL;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b002e9b734f9..12daa85ed941 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
- default:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
init_special_inode(inode, cramfs_inode->mode,
old_decode_dev(cramfs_inode->size));
+ break;
+ default:
+ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
inode->i_mode = cramfs_inode->mode;
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 486fcb2ecf13..0d746de4cd10 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -113,7 +113,7 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index b6ccab524fde..07f9cbfe3ea4 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
size_t len, size_t offs, gfp_t gfp_flags)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
@@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
- lblk_num, page, page, len, offs);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_ENCRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
@@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
@@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
- return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
- lblk_num, page, page, len, offs);
+ return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode),
+ FS_DECRYPT, lblk_num, page, page, len,
+ offs);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index f9f6713e144f..fb77ad1ca74a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -94,7 +94,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
@@ -138,7 +138,7 @@ static int fname_decrypt(const struct inode *inode,
const struct fscrypt_str *iname,
struct fscrypt_str *oname)
{
- const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
@@ -274,8 +274,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
u32 max_len, u32 *encrypted_len_ret)
{
- return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
- orig_len, max_len,
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
+
+ return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len,
encrypted_len_ret);
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
@@ -543,7 +544,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_inode_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir);
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index d8b485b9881c..245e6b84aa17 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -249,8 +249,8 @@ struct fscrypt_prepared_key {
* fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
- * allocated and stored in ->i_crypt_info. Once created, it remains until the
- * inode is evicted.
+ * allocated and a pointer to it is stored in the file's in-memory inode. Once
+ * created, it remains until the inode is evicted.
*/
struct fscrypt_inode_info {
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index e0b32ac841f7..7a5d4c168c49 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -199,7 +199,7 @@ int fscrypt_prepare_setflags(struct inode *inode,
err = fscrypt_require_key(inode);
if (err)
return err;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
mk = ci->ci_master_key;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index caaff809765b..5dee7c498bc8 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb,
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
- return inode->i_crypt_info->ci_inlinecrypt;
+ return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt;
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
@@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
if (!fscrypt_inode_uses_inline_crypto(inode))
return;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
fscrypt_generate_dun(ci, first_lblk, dun);
bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
@@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
u64 next_lblk)
{
const struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ const struct fscrypt_inode_info *ci;
u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!!bc != fscrypt_inode_uses_inline_crypto(inode))
return false;
if (!bc)
return true;
+ ci = fscrypt_get_inode_info_raw(inode);
/*
* Comparing the key pointers is good enough, as all I/O for each key
* uses the same pointer. I.e., there's currently no need to support
* merging requests where the keys are the same but the pointers differ.
*/
- if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
+ if (bc->bc_key != ci->ci_enc_key.blk_key)
return false;
- fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
+ fscrypt_generate_dun(ci, next_lblk, next_dun);
return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio);
@@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
if (nr_blocks <= 1)
return nr_blocks;
- ci = inode->i_crypt_info;
+ ci = fscrypt_get_inode_info_raw(inode);
if (!(fscrypt_policy_flags(&ci->ci_policy) &
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
return nr_blocks;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 4f3b9ecbfe4e..c1f85715c276 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -642,15 +642,16 @@ fscrypt_setup_encryption_info(struct inode *inode,
goto out;
/*
- * For existing inodes, multiple tasks may race to set ->i_crypt_info.
- * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
- * a RELEASE barrier so that other tasks can ACQUIRE it.
+ * For existing inodes, multiple tasks may race to set the inode's
+ * fscrypt info pointer. So use cmpxchg_release(). This pairs with the
+ * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the
+ * pointer with a RELEASE barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
+ if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) ==
+ NULL) {
/*
- * We won the race and set ->i_crypt_info to our crypt_info.
- * Now link it into the master key's inode list.
+ * We won the race and set the inode's fscrypt info to our
+ * crypt_info. Now link it into the master key's inode list.
*/
if (mk) {
crypt_info->ci_master_key = mk;
@@ -681,13 +682,13 @@ out:
* %false unless the operation being performed is needed in
* order for files (or directories) to be deleted.
*
- * Set up ->i_crypt_info, if it hasn't already been done.
+ * Set up the inode's encryption key, if it hasn't already been done.
*
- * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So
* generally this shouldn't be called from within a filesystem transaction.
*
- * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
- * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * Return: 0 if the key is now set up, *or* if it couldn't be set up because the
+ * needed master key is absent. (Use fscrypt_has_encryption_key() to
* distinguish these cases.) Also can return another -errno code.
*/
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
@@ -741,9 +742,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* ->i_ino doesn't need to be set yet.
* @encrypt_ret: (output) set to %true if the new inode will be encrypted
*
- * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * If the directory is encrypted, set up its encryption key in preparation for
* encrypting the name of the new file. Also, if the new inode will be
- * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ * encrypted, set up its encryption key too and set *encrypt_ret=true.
*
* This isn't %GFP_NOFS-safe, and therefore it should be called before starting
* any filesystem transaction to create the inode. For this reason, ->i_ino
@@ -752,8 +753,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
* This doesn't persist the new inode's encryption context. That still needs to
* be done later by calling fscrypt_set_context().
*
- * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
- * -errno code
+ * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode
+ * but the needed master key is absent, or another -errno code
*/
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
bool *encrypt_ret)
@@ -800,8 +801,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
- put_crypt_info(inode->i_crypt_info);
- inode->i_crypt_info = NULL;
+ /*
+ * Ideally we'd start with a lightweight IS_ENCRYPTED() check here
+ * before proceeding to retrieve and check the pointer. However, during
+ * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If
+ * an error occurs, it needs to be cleaned up regardless.
+ */
+ struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode);
+
+ put_crypt_info(*ci_addr);
+ *ci_addr = NULL;
}
EXPORT_SYMBOL(fscrypt_put_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6ad30ae07c06..9d51f3500de3 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
err = fscrypt_require_key(dir);
if (err)
return ERR_PTR(err);
- return &dir->i_crypt_info->ci_policy;
+ return &fscrypt_get_inode_info_raw(dir)->ci_policy;
}
return fscrypt_get_dummy_policy(dir->i_sb);
@@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode);
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ctxsize;
@@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
* This may be the first time the inode number is available, so do any
* delayed key setup that requires the inode number.
*/
+ ci = fscrypt_get_inode_info_raw(inode);
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
(ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
fscrypt_hash_inode_number(ci, ci->ci_master_key);
diff --git a/fs/dax.c b/fs/dax.c
index 4229513806be..20ecf652c129 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1743,6 +1743,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t done = 0;
int ret;
+ if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
+ return -EIO;
+
if (!iomi.len)
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 60046ae23d51..65cc11939654 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir)
{
preempt_disable_nested();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = READ_ONCE(dir->i_dir_seq);
+ if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1))
return n;
cpu_relax();
}
@@ -2922,6 +2922,7 @@ void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
write_sequnlock(&rename_lock);
}
+EXPORT_SYMBOL(d_exchange);
/**
* d_ancestor - search for an ancestor
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a0357b0cf362..c12d649df6a5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc)
struct debugfs_fs_info *sb_opts = sb->s_fs_info;
struct debugfs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
@@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
static int debugfs_get_tree(struct fs_context *fc)
{
+ int err;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return -EPERM;
- return get_tree_single(fc, debugfs_fill_super);
+ err = get_tree_single(fc, debugfs_fill_super);
+ if (err)
+ return err;
+
+ return debugfs_reconfigure(fc);
}
static void debugfs_free_fc(struct fs_context *fc)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e4373bce1bc2..9a0b6c2b6b01 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1703,7 +1703,7 @@ static int work_start(void)
return -ENOMEM;
}
- process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH, 0);
+ process_workqueue = alloc_workqueue("dlm_process", WQ_HIGHPRI | WQ_BH | WQ_PERCPU, 0);
if (!process_workqueue) {
log_print("can't start dlm_process");
destroy_workqueue(io_workqueue);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 4887c8a05318..a44d16da7187 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -52,7 +52,7 @@ static int __init init_dlm(void)
if (error)
goto out_user;
- dlm_wq = alloc_workqueue("dlm_wq", 0, 0);
+ dlm_wq = alloc_workqueue("dlm_wq", WQ_PERCPU, 0);
if (!dlm_wq) {
error = -ENOMEM;
goto out_plock;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index c4a139911356..1f4d8ce56667 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb);
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
@@ -152,6 +152,10 @@ static int efivarfs_d_compare(const struct dentry *dentry,
{
int guid = len - EFI_VARIABLE_GUID_LEN;
+ /* Parallel lookups may produce a temporary invalid filename */
+ if (guid <= 0)
+ return 1;
+
if (name->len != len)
return 1;
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 7b26efc271ee..d81f3318417d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,18 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CACHEFILES if EROFS_FS_ONDEMAND
select CRC32
+ select CRYPTO if EROFS_FS_ZIP_ACCEL
+ select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL
select FS_IOMAP
+ select LZ4_DECOMPRESS if EROFS_FS_ZIP
+ select NETFS_SUPPORT if EROFS_FS_ONDEMAND
+ select XXHASH if EROFS_FS_XATTR
+ select XZ_DEC if EROFS_FS_ZIP_LZMA
+ select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA
+ select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE
+ select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
@@ -38,7 +48,6 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
- select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -94,7 +103,6 @@ config EROFS_FS_BACKED_BY_FILE
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
- select LZ4_DECOMPRESS
default y
help
Enable transparent compression support for EROFS file systems.
@@ -104,8 +112,6 @@ config EROFS_FS_ZIP
config EROFS_FS_ZIP_LZMA
bool "EROFS LZMA compressed data support"
depends on EROFS_FS_ZIP
- select XZ_DEC
- select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
containing LZMA compressed data, specifically called microLZMA. It
@@ -117,7 +123,6 @@ config EROFS_FS_ZIP_LZMA
config EROFS_FS_ZIP_DEFLATE
bool "EROFS DEFLATE compressed data support"
depends on EROFS_FS_ZIP
- select ZLIB_INFLATE
help
Saying Y here includes support for reading EROFS file systems
containing DEFLATE compressed data. It gives better compression
@@ -132,7 +137,6 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ZIP_ZSTD
bool "EROFS Zstandard compressed data support"
depends on EROFS_FS_ZIP
- select ZSTD_DECOMPRESS
help
Saying Y here includes support for reading EROFS file systems
containing Zstandard compressed data. It gives better compression
@@ -147,8 +151,6 @@ config EROFS_FS_ZIP_ZSTD
config EROFS_FS_ZIP_ACCEL
bool "EROFS hardware decompression support"
depends on EROFS_FS_ZIP
- select CRYPTO
- select CRYPTO_DEFLATE
help
Saying Y here includes hardware accelerator support for reading
EROFS file systems containing compressed data. It gives better
@@ -163,9 +165,7 @@ config EROFS_FS_ZIP_ACCEL
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
- select NETFS_SUPPORT
select FSCACHE
- select CACHEFILES
select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 377ee12b8b96..3d5738f80072 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,10 +12,12 @@
/* to allow for x86 boot sectors and other oddities. */
#define EROFS_SUPER_OFFSET 1024
-#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
-#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
-#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
+#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ccc5f0ee8df..9319c66e86c3 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -234,6 +234,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX)
static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
{
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index e1020aa60771..db13b40a78e0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -174,6 +174,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (!erofs_is_fileio_mode(sbi)) {
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
&dif->dax_part_off, NULL, NULL);
+ if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) {
+ erofs_info(sb, "DAX unsupported by %s. Turning off DAX.",
+ dif->path);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
} else if (!S_ISREG(file_inode(file)->i_mode)) {
fput(file);
return -EINVAL;
@@ -210,8 +215,13 @@ static int erofs_scan_devices(struct super_block *sb,
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
}
- if (!ondisk_extradevs)
+ if (!ondisk_extradevs) {
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) {
+ erofs_info(sb, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
return 0;
+ }
if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb))
sbi->devs->flatdev = true;
@@ -313,8 +323,8 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
- sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
- le16_to_cpu(dsb->rb.blocks_hi);
+ sbi->dif0.blocks = sbi->dif0.blocks |
+ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
} else {
sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
}
@@ -338,7 +348,6 @@ static int erofs_read_superblock(struct super_block *sb)
if (ret < 0)
goto out;
- /* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
if (erofs_sb_has_48bit(sbi))
@@ -671,14 +680,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return invalfc(fc, "cannot use fsoffset in fscache mode");
}
- if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dif0.dax_dev) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- } else if (sbi->blkszbits != PAGE_SHIFT) {
- errorfc(fc, "unsupported blocksize for DAX");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- }
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) {
+ erofs_info(sb, "unsupported blocksize for DAX");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_time_gran = 1;
@@ -1014,10 +1018,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static void erofs_evict_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+#endif
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
+ .evict_inode = erofs_evict_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index eaa9efd766ee..396536d9a862 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2;
struct erofs_xattr_prefix_item *pfs;
int ret = 0, i, len;
+ bool plain = erofs_sb_has_plain_xattr_pfx(sbi);
if (!sbi->xattr_prefix_count)
return 0;
@@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (!pfs)
return -ENOMEM;
- if (sbi->packed_inode)
- buf.mapping = sbi->packed_inode->i_mapping;
- else
+ if (!plain) {
+ if (erofs_sb_has_metabox(sbi))
+ (void)erofs_init_metabuf(&buf, sb, true);
+ else if (sbi->packed_inode)
+ buf.mapping = sbi->packed_inode->i_mapping;
+ else
+ plain = true;
+ }
+ if (plain)
(void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 792f20888a8f..2d73297003d2 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1432,6 +1432,16 @@ static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
}
#endif
+/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */
+static inline bool z_erofs_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
int bios)
{
@@ -1446,8 +1456,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use (kthread_)work and sync decompression for atomic contexts only */
- if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
+ if (z_erofs_in_atomic()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
struct kthread_worker *worker;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a93efd95c555..798223e6da9c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
.map = map,
.in_mbox = erofs_inode_in_metabox(inode),
};
- int err = 0;
- unsigned int endoff, afmt;
+ unsigned int endoff;
unsigned long initial_lcn;
unsigned long long ofs, end;
+ int err;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
@@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
- Z_EROFS_COMPRESSION_INTERLACED :
- Z_EROFS_COMPRESSION_SHIFTED;
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
} else {
- afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
- vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
- if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
- afmt, vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
- }
+ map->m_algorithmformat = vi->z_algorithmtype[0];
}
- map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err, headnr;
- erofs_off_t pos;
struct z_erofs_map_header *h;
+ erofs_off_t pos;
+ int err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
/*
@@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
- err = 0;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
@@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- headnr = 0;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
- vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
- headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
- err = -EOPNOTSUPP;
- goto out_unlock;
- }
-
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -726,6 +711,30 @@ out_unlock:
return err;
}
+static int z_erofs_map_sanity_check(struct inode *inode,
+ struct erofs_map_blocks *map)
+{
+ struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+ if (!(map->m_flags & EROFS_MAP_ENCODED))
+ return 0;
+ if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) {
+ erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel",
+ map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid);
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX &&
+ !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ map->m_algorithmformat, EROFS_I(inode)->nid);
+ return -EFSCORRUPTED;
+ }
+ if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
@@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
else
err = z_erofs_map_blocks_fo(inode, map, flags);
}
- if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
- unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
- map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
- err = -EOPNOTSUPP;
+ if (!err)
+ err = z_erofs_map_sanity_check(inode, map);
if (err)
map->m_llen = 0;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b22d6f819f78..ee7c4b683ec3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_get(struct eventpoll *ep)
@@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
#endif /* CONFIG_KCMP */
/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
-/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
@@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}
diff --git a/fs/exec.c b/fs/exec.c
index 2a1e5e4042a1..e861a4b7ffda 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
{
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!error)
+ if (!error && !write)
validate_coredump_safety();
return error;
}
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 0a056d97e640..cf0a0970c095 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
}
const struct fscrypt_operations ext4_cryptops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01a6e2de7fc3..6cb784a56b3b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1182,6 +1182,14 @@ struct ext4_inode_info {
__u32 i_csum_seed;
kprojid_t i_projid;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
+
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info;
+#endif
};
/*
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 383c6edea6dd..91185c40f755 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Reserved GDT blocks */
if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+
+ /*
+ * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases,
+ * check for that.
+ */
+ if (!len)
+ return 0;
+
error = ext4_getfsmap_fill(meta_list, fsb, len,
EXT4_FMR_OWN_RESV_GDT);
if (error)
@@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_group_t end_ag;
ext4_grpblk_t first_cluster;
ext4_grpblk_t last_cluster;
+ struct ext4_fsmap irec;
int error = 0;
bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
goto err;
}
- /* Report any gaps at the end of the bg */
+ /*
+ * The dummy record below will cause ext4_getfsmap_helper() to report
+ * any allocated blocks at the end of the range.
+ */
+ irec.fmr_device = 0;
+ irec.fmr_physical = end_fsb + 1;
+ irec.fmr_length = 0;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
- 0, info);
+ error = ext4_getfsmap_helper(sb, info, &irec);
if (error)
goto err;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df4051613b29..ba4fd9aba1c1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
"nonexistent device\n", __func__, __LINE__);
return;
}
- if (atomic_read(&inode->i_count) > 1) {
+ if (icount_read(inode) > 1) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
__func__, __LINE__, inode->i_ino,
- atomic_read(&inode->i_count));
+ icount_read(inode));
return;
}
if (inode->i_nlink) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7de327fa7b1c..d45124318200 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- int count = 0;
+ u64 count = 0;
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
@@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, count);
+ map->m_len = umin(map->m_len, count);
goto cleanup;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ed54c4d0f2f9..5b7a15db4953 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -146,7 +146,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ if (!ext4_has_feature_ea_inode(inode->i_sb)) {
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -3155,7 +3155,7 @@ retry:
folio_unlock(folio);
folio_put(folio);
/*
- * block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold inode lock.
*/
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5898d92ba19f..8b18802e83eb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3995,7 +3995,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
list_splice_tail(&freed_data_list, &sbi->s_discard_list);
spin_unlock(&sbi->s_md_lock);
if (wake)
- queue_work(system_unbound_wq, &sbi->s_discard_work);
+ queue_work(system_dfl_wq, &sbi->s_discard_work);
} else {
list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
kmem_cache_free(ext4_free_data_cachep, entry);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d83f91b62317..2cd36f59c9e3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2965,7 +2965,6 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
- struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
int err;
@@ -2982,10 +2981,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
- de = (struct ext4_dir_entry_2 *)dir_block->b_data;
err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
- if (err)
- goto out;
out:
brelse(dir_block);
return err;
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 7c7f792ad6ab..524d4658fa40 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -589,8 +589,9 @@ int ext4_init_orphan_info(struct super_block *sb)
}
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
- GFP_KERNEL);
+ oi->of_binfo = kmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
if (!oi->of_binfo) {
ret = -ENOMEM;
goto out_put;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3d8b0f6d2dea..39abfeec5f36 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -547,7 +547,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
* first page of the bio. Otherwise it can deadlock.
*/
if (io->io_bio)
- gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_flags = GFP_NOWAIT;
retry_encrypt:
bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
enc_bytes, 0, gfp_flags);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c7d39da7e733..7f2d4014d128 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -268,7 +268,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
- sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
+ sb->s_blocksize, GFP_NOWAIT);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
static int ext4_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -1470,6 +1470,12 @@ static void init_once(void *foo)
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
ext4_fc_init_inode(&ei->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ ei->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ ei->i_verity_info = NULL;
+#endif
}
static int __init init_inodecache(void)
@@ -1998,6 +2004,9 @@ int ext4_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &ext4_context_ops;
+ /* i_version is always enabled now */
+ fc->sb_flags |= SB_I_VERSION;
+
return 0;
}
@@ -2975,6 +2984,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (nodefs && sb->s_flags & SB_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -5314,9 +5325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
- /* i_version is always enabled now */
- sb->s_flags |= SB_I_VERSION;
-
/* HSM events are allowed by default. */
sb->s_iflags |= SB_I_ALLOW_HSM;
@@ -5414,6 +5422,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
+ if (bdev_read_only(sb->s_bdev))
+ needs_recovery = 0;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index d9203228ce97..b0acb0c50313 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations ext4_verityops = {
+ .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) -
+ (int)offsetof(struct ext4_inode_info, vfs_inode),
.begin_enable_verity = ext4_begin_enable_verity,
.end_enable_verity = ext4_end_enable_verity,
.get_verity_descriptor = ext4_get_verity_descriptor,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 46be7560548c..6e465bbc85ee 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -907,6 +907,12 @@ struct f2fs_inode_info {
unsigned int atomic_write_cnt;
loff_t original_i_size; /* original i_size before atomic write */
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */
+#endif
+#ifdef CONFIG_FS_VERITY
+ struct fsverity_info *i_verity_info; /* filesystem verity info */
+#endif
};
static inline void get_read_extent_info(struct extent_info *ext,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e16c4e2830c2..2619cbbd7d2d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -480,6 +480,12 @@ static void init_once(void *foo)
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
inode_init_once(&fi->vfs_inode);
+#ifdef CONFIG_FS_ENCRYPTION
+ fi->i_crypt_info = NULL;
+#endif
+#ifdef CONFIG_FS_VERITY
+ fi->i_verity_info = NULL;
+#endif
}
#ifdef CONFIG_QUOTA
@@ -1744,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode)
if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
- atomic_inc(&inode->i_count);
+ __iget(inode);
spin_unlock(&inode->i_lock);
/* should remain fi->extent_tree for writepage */
@@ -1763,12 +1769,12 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
spin_lock(&inode->i_lock);
- atomic_dec(&inode->i_count);
+ iput(inode);
}
trace_f2fs_drop_inode(inode, 0);
return 0;
}
- ret = generic_drop_inode(inode);
+ ret = inode_generic_drop(inode);
if (!ret)
ret = fscrypt_drop_inode(inode);
trace_f2fs_drop_inode(inode, ret);
@@ -3570,6 +3576,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
}
static const struct fscrypt_operations f2fs_cryptops = {
+ .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) -
+ (int)offsetof(struct f2fs_inode_info, vfs_inode),
.needs_bounce_pages = 1,
.has_32bit_inodes = 1,
.supports_subblock_data_units = 1,
@@ -3581,7 +3589,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.has_stable_inodes = f2fs_has_stable_inodes,
.get_devices = f2fs_get_devices,
};
-#endif
+#endif /* CONFIG_FS_ENCRYPTION */
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 2287f238ae09..f0ab9a3c7a82 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
}
const struct fsverity_operations f2fs_verityops = {
+ .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) -
+ (int)offsetof(struct f2fs_inode_info, vfs_inode),
.begin_enable_verity = f2fs_begin_enable_verity,
.end_enable_verity = f2fs_end_enable_verity,
.get_verity_descriptor = f2fs_get_verity_descriptor,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5598e4d57422..72f8433d9109 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint)
}
}
-static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
return 0;
}
-static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_set_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
- err = fcntl_get_rw_hint(filp, cmd, arg);
+ err = fcntl_get_rw_hint(filp, arg);
break;
case F_SET_RW_HINT:
- err = fcntl_set_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, arg);
break;
default:
break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 7c236f64cdea..052f9c9368fb 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -11,6 +11,7 @@
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
+#include <linux/nsfs.h>
#include "internal.h"
#include "mount.h"
@@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root)
return 0;
}
+ if (fd == FD_NSFS_ROOT) {
+ nsfs_get_root(root);
+ return 0;
+ }
+
return -EBADF;
}
@@ -208,6 +214,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
return 1;
/*
+ * Verify that the decoded dentry itself has a valid id mapping.
+ * In case the decoded dentry is the mountfd root itself, this
+ * verifies that the mountfd inode itself has a valid id mapping.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry)))
+ return 0;
+
+ /*
* It's racy as we're not taking rename_lock but we're able to ignore
* permissions and we just need an approximation whether we were able
* to follow a path to the file.
@@ -402,7 +416,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
if (retval)
return retval;
- CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ CLASS(get_unused_fd, fd)(open_flag);
if (fd < 0)
return fd;
diff --git a/fs/file.c b/fs/file.c
index 6d2275c3be9c..28743b742e3c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
err = expand_files(files, fd);
if (unlikely(err < 0))
goto out_unlock;
- return do_dup2(files, file, fd, flags);
+ err = do_dup2(files, file, fd, flags);
+ if (err < 0)
+ return err;
+ return 0;
out_unlock:
spin_unlock(&files->file_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 52129267e3bd..2b35e80037fe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1162,7 +1162,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
dirty = dirty * 10 / 8;
/* issue the writeback work */
- work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+ work = kzalloc(sizeof(*work), GFP_NOWAIT);
if (work) {
work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
@@ -1219,7 +1219,7 @@ void cgroup_writeback_umount(struct super_block *sb)
static int __init cgroup_writeback_init(void)
{
- isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ isw_wq = alloc_workqueue("inode_switch_wbs", WQ_PERCPU, 0);
if (!isw_wq)
return -ENOMEM;
return 0;
@@ -1806,7 +1806,7 @@ static int writeback_single_inode(struct inode *inode,
int ret = 0;
spin_lock(&inode->i_lock);
- if (!atomic_read(&inode->i_count))
+ if (!icount_read(inode))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
WARN_ON(inode->i_state & I_WILL_FREE);
@@ -2481,7 +2481,7 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
- mod_delayed_work(system_wq, &dirtytime_work, 0);
+ mod_delayed_work(system_percpu_wq, &dirtytime_work, 0);
return ret;
}
@@ -2647,10 +2647,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
- spin_unlock(&wb->list_lock);
- spin_unlock(&inode->i_lock);
- trace_writeback_dirty_inode_enqueue(inode);
-
/*
* If this is the first dirty inode for this bdi,
* we have to wake-up the corresponding bdi thread
@@ -2660,6 +2656,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (wakeup_bdi &&
(wb->bdi->capabilities & BDI_CAP_WRITEBACK))
wb_wakeup_delayed(wb);
+
+ spin_unlock(&wb->list_lock);
+ spin_unlock(&inode->i_lock);
+ trace_writeback_dirty_inode_enqueue(inode);
+
return;
}
}
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 1aaf4cb2afb2..f645c99204eb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -18,50 +18,56 @@
#include "internal.h"
#include "mount.h"
+static inline const char *fetch_message_locked(struct fc_log *log, size_t len,
+ bool *need_free)
+{
+ const char *p;
+ int index;
+
+ if (unlikely(log->head == log->tail))
+ return ERR_PTR(-ENODATA);
+
+ index = log->tail & (ARRAY_SIZE(log->buffer) - 1);
+ p = log->buffer[index];
+ if (unlikely(strlen(p) > len))
+ return ERR_PTR(-EMSGSIZE);
+
+ log->buffer[index] = NULL;
+ *need_free = log->need_free & (1 << index);
+ log->need_free &= ~(1 << index);
+ log->tail++;
+
+ return p;
+}
+
/*
* Allow the user to read back any error, warning or informational messages.
+ * Only one message is returned for each read(2) call.
*/
static ssize_t fscontext_read(struct file *file,
char __user *_buf, size_t len, loff_t *pos)
{
struct fs_context *fc = file->private_data;
- struct fc_log *log = fc->log.log;
- unsigned int logsize = ARRAY_SIZE(log->buffer);
- ssize_t ret;
- char *p;
+ ssize_t err;
+ const char *p __free(kfree) = NULL, *message;
bool need_free;
- int index, n;
+ int n;
- ret = mutex_lock_interruptible(&fc->uapi_mutex);
- if (ret < 0)
- return ret;
-
- if (log->head == log->tail) {
- mutex_unlock(&fc->uapi_mutex);
- return -ENODATA;
- }
-
- index = log->tail & (logsize - 1);
- p = log->buffer[index];
- need_free = log->need_free & (1 << index);
- log->buffer[index] = NULL;
- log->need_free &= ~(1 << index);
- log->tail++;
+ err = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (err < 0)
+ return err;
+ message = fetch_message_locked(fc->log.log, len, &need_free);
mutex_unlock(&fc->uapi_mutex);
+ if (IS_ERR(message))
+ return PTR_ERR(message);
- ret = -EMSGSIZE;
- n = strlen(p);
- if (n > len)
- goto err_free;
- ret = -EFAULT;
- if (copy_to_user(_buf, p, n) != 0)
- goto err_free;
- ret = n;
-
-err_free:
if (need_free)
- kfree(p);
- return ret;
+ p = message;
+
+ n = strlen(message);
+ if (copy_to_user(_buf, message, n))
+ return -EFAULT;
+ return n;
}
static int fscontext_release(struct inode *inode, struct file *file)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e80cd8f2c049..66a1ba8c56b5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -119,7 +119,7 @@ void fuse_check_timeout(struct work_struct *work)
goto abort_conn;
out:
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
return;
@@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num) {
+ while (num && ap->num_folios < num_pages) {
struct folio *folio;
unsigned int folio_offset;
unsigned int nr_bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2d817d7cab26..5c569c3cb53f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = inode->i_sb->s_blocksize_bits;
+ blkbits = fc->blkbits;
stat->blksize = 1 << blkbits;
}
@@ -1377,6 +1377,7 @@ retry:
generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ stat->blksize = 1 << fi->cached_i_blkbits;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
stat->btime = fi->i_btime;
stat->result_mask |= STATX_BTIME;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5525a4520b0f..4adcf09d4b01 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
- .len = len,
+ .len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
.flags = flags
};
struct fuse_write_out outarg;
@@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
}
+ if (!err && outarg.size > len)
+ err = -EIO;
+
if (err)
goto out;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ec248d13c8bf..cc428d04be3e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -210,6 +210,12 @@ struct fuse_inode {
/** Reference to backing file in passthrough mode */
struct fuse_backing *fb;
#endif
+
+ /*
+ * The underlying inode->i_blkbits value will not be modified,
+ * so preserve the blocksize specified by the server.
+ */
+ u8 cached_i_blkbits;
};
/** FUSE inode state bits */
@@ -969,6 +975,14 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
+
+ /*
+ * This is a workaround until fuse uses iomap for reads.
+ * For fuseblk servers, this represents the blocksize passed in at
+ * mount time and for regular fuse servers, this is equivalent to
+ * inode->i_blkbits.
+ */
+ u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ecb869e895ab..7485a41af892 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -289,10 +289,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
- if (attr->blksize != 0)
- inode->i_blkbits = ilog2(attr->blksize);
+ if (attr->blksize)
+ fi->cached_i_blkbits = ilog2(attr->blksize);
else
- inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+ fi->cached_i_blkbits = fc->blkbits;
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
@@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = {
.free_inode = fuse_free_inode,
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
@@ -1273,7 +1273,7 @@ static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
{
fc->timeout.req_timeout = secs_to_jiffies(timeout);
INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
- queue_delayed_work(system_wq, &fc->timeout.work,
+ queue_delayed_work(system_percpu_wq, &fc->timeout.work,
fuse_timeout_timer_freq);
}
@@ -1810,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
+ /*
+ * This is a workaround until fuse hooks into iomap for reads.
+ * Use PAGE_SIZE for the blocksize else if the writeback cache
+ * is enabled, buffered writes go through iomap and a read may
+ * overwrite partially written data if blocksize < PAGE_SIZE
+ */
+ fc->blkbits = sb->s_blocksize_bits;
+ if (ctx->blksize != PAGE_SIZE &&
+ !sb_set_blocksize(sb, PAGE_SIZE))
+ goto err;
#endif
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 607ef735ad4a..eb97ac009e75 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
if (!file)
goto out;
+ /* read/write/splice/mmap passthrough only relevant for regular files */
+ res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL;
+ if (!d_is_reg(file->f_path.dentry))
+ goto out_fput;
+
backing_sb = file_inode(file)->i_sb;
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index c826e7ca49f5..76c8fd0bfc75 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = fs->window_phys_addr + offset;
+ *pfn = PHYS_PFN(fs->window_phys_addr + offset);
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0727f60ad028..9d65719353fa 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -151,7 +151,8 @@ static int __init init_gfs2_fs(void)
error = -ENOMEM;
gfs2_recovery_wq = alloc_workqueue("gfs2_recovery",
- WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU,
+ 0);
if (!gfs2_recovery_wq)
goto fail_wq1;
@@ -160,7 +161,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_wq2;
- gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0);
+ gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", WQ_PERCPU, 0);
if (!gfs2_freeze_wq)
goto fail_wq3;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index efe99b732551..aa15183f9a16 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1193,13 +1193,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
error = -ENOMEM;
sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s",
- WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0,
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE | WQ_PERCPU,
+ 0,
sdp->sd_fsname);
if (!sdp->sd_glock_wq)
goto fail_iput;
sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU, 0,
+ sdp->sd_fsname);
if (!sdp->sd_delete_wq)
goto fail_glock_wq;
@@ -1754,7 +1756,7 @@ static void gfs2_evict_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
continue;
}
- atomic_inc(&inode->i_count);
+ __iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b42e2110084b..644b2d1e7276 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode)
if (test_bit(SDF_EVICTING, &sdp->sd_flags))
return 1;
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/**
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 01e516175bcd..1e1acf5775ab 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations hostfs_sbops = {
.alloc_inode = hostfs_alloc_inode,
.free_inode = hostfs_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = hostfs_evict_inode,
.statfs = hostfs_statfs,
.show_options = hostfs_show_options,
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index a59e8fa630db..34008442ee26 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i)
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct inode *parent;
if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
- if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) {
+ if (hpfs_inode->i_rddir_off && !icount_read(i)) {
if (*hpfs_inode->i_rddir_off)
pr_err("write_inode: some position still there\n");
kfree(hpfs_inode->i_rddir_off);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 09d4baef29cf..be4be99304bc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -517,14 +517,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
/*
* If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) while holding
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
+ * unmapped in caller or hugetlb_vmdelete_list() skips
+ * unmapping it due to fail to grab lock. Unmap (again)
+ * while holding the fault mutex. The mutex will prevent
+ * faults until we finish removing the folio. Hold folio
+ * lock to guarantee no concurrent migration.
*/
+ folio_lock(folio);
if (unlikely(folio_mapped(folio)))
hugetlb_unmap_file_folio(h, mapping, folio, index);
- folio_lock(folio);
/*
* We must remove the folio from page cache before removing
* the region/ reserve map (hugetlb_unreserve_pages). In
diff --git a/fs/inode.c b/fs/inode.c
index 01ebdc40021e..ec9339024ac3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
{
if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
return;
- if (atomic_read(&inode->i_count))
+ if (icount_read(inode))
return;
if (!(inode->i_sb->s_flags & SB_ACTIVE))
return;
@@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
struct inode *inode, u32 bit)
{
- void *bit_address;
+ void *bit_address;
- bit_address = inode_state_wait_address(inode, bit);
- init_wait_var_entry(wqe, bit_address, 0);
- return __var_waitqueue(bit_address);
+ bit_address = inode_state_wait_address(inode, bit);
+ init_wait_var_entry(wqe, bit_address, 0);
+ return __var_waitqueue(bit_address);
}
EXPORT_SYMBOL(inode_bit_waitqueue);
@@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb)
again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (atomic_read(&inode->i_count))
+ if (icount_read(inode))
continue;
spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_count)) {
+ if (icount_read(inode)) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* unreclaimable for a while. Remove them lazily here; iput,
* sync, or the last page cache deletion will requeue them.
*/
- if (atomic_read(&inode->i_count) ||
+ if (icount_read(inode) ||
(inode->i_state & ~I_REFERENCED) ||
!mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
@@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
+ might_sleep();
+
again:
spin_lock(&inode_hash_lock);
old = find_inode(inode->i_sb, head, test, data, true);
@@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *new;
+ might_sleep();
+
again:
inode = find_inode(sb, head, test, data, false);
if (inode) {
@@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+
+ might_sleep();
+
again:
inode = find_inode_fast(sb, head, ino, false);
if (inode) {
@@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *inode;
+
+ might_sleep();
+
again:
inode = ilookup5_nowait(sb, hashval, test, data);
if (inode) {
@@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+
+ might_sleep();
+
again:
inode = find_inode_fast(sb, head, ino, false);
@@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode)
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ might_sleep();
+
while (1) {
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
@@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
{
struct inode *old;
+ might_sleep();
+
inode->i_state |= I_CREATING;
old = inode_insert5(inode, hashval, test, NULL, data);
@@ -1838,11 +1855,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
EXPORT_SYMBOL(insert_inode_locked4);
-int generic_delete_inode(struct inode *inode)
+int inode_just_drop(struct inode *inode)
{
return 1;
}
-EXPORT_SYMBOL(generic_delete_inode);
+EXPORT_SYMBOL(inode_just_drop);
/*
* Called when we're dropping the last reference
@@ -1866,7 +1883,7 @@ static void iput_final(struct inode *inode)
if (op->drop_inode)
drop = op->drop_inode(inode);
else
- drop = generic_drop_inode(inode);
+ drop = inode_generic_drop(inode);
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
@@ -1908,20 +1925,45 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
- if (!inode)
+ might_sleep();
+ if (unlikely(!inode))
return;
- BUG_ON(inode->i_state & I_CLEAR);
+
retry:
- if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
- if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
- atomic_inc(&inode->i_count);
- spin_unlock(&inode->i_lock);
- trace_writeback_lazytime_iput(inode);
- mark_inode_dirty_sync(inode);
- goto retry;
- }
- iput_final(inode);
+ lockdep_assert_not_held(&inode->i_lock);
+ VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode);
+ /*
+ * Note this assert is technically racy as if the count is bogusly
+ * equal to one, then two CPUs racing to further drop it can both
+ * conclude it's fine.
+ */
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode);
+
+ if (atomic_add_unless(&inode->i_count, -1, 1))
+ return;
+
+ if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) {
+ trace_writeback_lazytime_iput(inode);
+ mark_inode_dirty_sync(inode);
+ goto retry;
+ }
+
+ spin_lock(&inode->i_lock);
+ if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) {
+ spin_unlock(&inode->i_lock);
+ goto retry;
}
+
+ if (!atomic_dec_and_test(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ /*
+ * iput_final() drops ->i_lock, we can't assert on it as the inode may
+ * be deallocated by the time the call returns.
+ */
+ iput_final(inode);
}
EXPORT_SYMBOL(iput);
@@ -2189,7 +2231,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
-int file_remove_privs_flags(struct file *file, unsigned int flags)
+static int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2214,7 +2256,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2519,21 +2560,28 @@ void __init inode_init(void)
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
- if (S_ISCHR(mode)) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFCHR:
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
- } else if (S_ISBLK(mode)) {
+ break;
+ case S_IFBLK:
if (IS_ENABLED(CONFIG_BLOCK))
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
- } else if (S_ISFIFO(mode))
+ break;
+ case S_IFIFO:
inode->i_fop = &pipefifo_fops;
- else if (S_ISSOCK(mode))
- ; /* leave it no_open_fops */
- else
+ break;
+ case S_IFSOCK:
+ /* leave it no_open_fops */
+ break;
+ default:
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
+ break;
+ }
}
EXPORT_SYMBOL(init_special_inode);
@@ -2911,10 +2959,18 @@ EXPORT_SYMBOL(mode_strip_sgid);
*
* TODO: add a proper inode dumping routine, this is a stub to get debug off the
* ground.
+ *
+ * TODO: handle getting to fs type with get_kernel_nofault()?
+ * See dump_mapping() above.
*/
void dump_inode(struct inode *inode, const char *reason)
{
- pr_warn("%s encountered for inode %px", reason, inode);
+ struct super_block *sb = inode->i_sb;
+
+ pr_warn("%s encountered for inode %px\n"
+ "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
+ reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
+ inode->i_flags, inode->i_state, atomic_read(&inode->i_count));
}
EXPORT_SYMBOL(dump_inode);
diff --git a/fs/internal.h b/fs/internal.h
index 38e8aab27bbd..a33d18ee5b74 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
void pidfs_get_root(struct path *path);
+void nsfs_get_root(struct path *path);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0248cb8db2d3..1c152c2b1b67 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
out:
return error;
}
-EXPORT_SYMBOL(vfs_ioctl);
static int ioctl_fibmap(struct file *filp, int __user *p)
{
@@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out;
}
- size = offsetof(struct file_dedupe_range, info[count]);
+ size = struct_size(same, info, count);
if (size > PAGE_SIZE) {
ret = -ENOMEM;
goto out;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fd827398afd2..8b847a1e27f1 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
size_t size = i_size_read(iter->inode) - iomap->offset;
size_t offset = offset_in_folio(folio, iomap->offset);
+ if (WARN_ON_ONCE(!iomap->inline_data))
+ return -EIO;
+
if (folio_test_uptodate(folio))
return 0;
@@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
return true;
}
-static void iomap_write_end_inline(const struct iomap_iter *iter,
+static bool iomap_write_end_inline(const struct iomap_iter *iter,
struct folio *folio, loff_t pos, size_t copied)
{
const struct iomap *iomap = &iter->iomap;
@@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter,
WARN_ON_ONCE(!folio_test_uptodate(folio));
BUG_ON(!iomap_inline_data_valid(iomap));
+ if (WARN_ON_ONCE(!iomap->inline_data))
+ return false;
+
flush_dcache_folio(folio);
addr = kmap_local_folio(folio, pos);
memcpy(iomap_inline_data(iomap, pos), addr, copied);
kunmap_local(addr);
mark_inode_dirty(iter->inode);
+ return true;
}
/*
@@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
- if (srcmap->type == IOMAP_INLINE) {
- iomap_write_end_inline(iter, folio, pos, copied);
- return true;
- }
+ if (srcmap->type == IOMAP_INLINE)
+ return iomap_write_end_inline(iter, folio, pos, copied);
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;
@@ -1396,6 +1401,9 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
+ trace_iomap_zero_iter(iter->inode, folio_pos(folio) + offset,
+ bytes);
+
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 6f25d4cfea9f..46aa85af13dc 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -363,14 +363,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
if (iomap->flags & IOMAP_F_SHARED)
dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW) {
+ if (iomap->flags & IOMAP_F_NEW)
need_zeroout = true;
- } else if (iomap->type == IOMAP_MAPPED) {
- if (iomap_dio_can_use_fua(iomap, dio))
- bio_opf |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- }
+ else if (iomap->type == IOMAP_MAPPED &&
+ iomap_dio_can_use_fua(iomap, dio))
+ bio_opf |= REQ_FUA;
+
+ if (!(bio_opf & REQ_FUA))
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
/*
* We can only do deferred completion for pure overwrites that
@@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
loff_t pos = iomi->pos;
u64 copied;
+ if (WARN_ON_ONCE(!inline_data))
+ return -EIO;
+
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 6ad66e6ba653..a61c1dae4742 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -84,6 +84,7 @@ DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
+DEFINE_RANGE_EVENT(iomap_zero_iter);
#define IOMAP_TYPE_STRINGS \
{ IOMAP_HOLE, "HOLE" }, \
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index b3971e91e8eb..38861ca04899 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -285,6 +285,7 @@ restart:
retry:
if (batch_count)
__flush_batch(journal, &batch_count);
+ cond_resched();
spin_lock(&journal->j_list_lock);
goto restart;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index a6c692cac616..9adf36e6364b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
!list_empty(&of->list));
}
+/* Get active reference to kernfs node for an open file */
+static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of)
+{
+ /* Skip if file was already released */
+ if (unlikely(of->released))
+ return NULL;
+
+ if (!kernfs_get_active(of->kn))
+ return NULL;
+
+ return of;
+}
+
+static void kernfs_put_active_of(struct kernfs_open_file *of)
+{
+ return kernfs_put_active(of->kn);
+}
+
/**
* kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
*
@@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
if (ops->seq_stop)
ops->seq_stop(sf, v);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
@@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return ERR_PTR(-ENODEV);
ops = kernfs_ops(of->kn);
@@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
len = -ENODEV;
mutex_unlock(&of->mutex);
goto out_free;
@@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len < 0)
@@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
len = -ENODEV;
goto out_free;
@@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
else
len = -EINVAL;
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
if (len > 0)
@@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
if (!of->vm_ops)
return;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return;
if (of->vm_ops->open)
of->vm_ops->open(vma);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
}
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
@@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
if (of->vm_ops->fault)
ret = of->vm_ops->fault(vmf);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return VM_FAULT_SIGBUS;
ret = 0;
@@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
else
file_update_time(file);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
if (!of->vm_ops)
return -EINVAL;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
return -EINVAL;
ret = -EINVAL;
if (of->vm_ops->access)
ret = of->vm_ops->access(vma, addr, buf, len, write);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
mutex_lock(&of->mutex);
rc = -ENODEV;
- if (!kernfs_get_active(of->kn))
+ if (!kernfs_get_active_of(of))
goto out_unlock;
ops = kernfs_ops(of->kn);
@@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
}
vma->vm_ops = &kernfs_vm_ops;
out_put:
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
out_unlock:
mutex_unlock(&of->mutex);
@@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
__poll_t ret;
- if (!kernfs_get_active(kn))
+ if (!kernfs_get_active_of(of))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
if (kn->attr.ops->poll)
@@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
else
ret = kernfs_generic_poll(of, wait);
- kernfs_put_active(kn);
+ kernfs_put_active_of(of);
return ret;
}
@@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
+ if (!kernfs_get_active_of(of)) {
mutex_unlock(&of->mutex);
return -ENODEV;
}
@@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
else
ret = generic_file_llseek(file, offset, whence);
- kernfs_put_active(of->kn);
+ kernfs_put_active_of(of);
mutex_unlock(&of->mutex);
return ret;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3c293a5a21b1..457f91c412d4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_iattrs *attrs;
- attrs = kernfs_iattrs_noalloc(kn);
+ attrs = kernfs_iattrs(kn);
if (!attrs)
- return -ENODATA;
+ return -ENOMEM;
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e384a69fbece..76eaf64b9d9e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
const struct super_operations kernfs_sops = {
.statfs = kernfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = kernfs_evict_inode,
.show_options = kernfs_sop_show_options,
diff --git a/fs/locks.c b/fs/locks.c
index 559f02aa4172..04a3f0e20724 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2328,8 +2328,8 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
- * flags need to be set.
+ * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags
+ * need to be set.
*
* Callers expecting ->lock() to return asynchronously will only use F_SETLK,
* not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df9d11479caf..32db676127a9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_op = &minix_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
init_special_inode(inode, inode->i_mode, rdev);
+ } else {
+ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ make_bad_inode(inode);
+ }
}
/*
diff --git a/fs/mount.h b/fs/mount.h
index 97737051a8b9..79c85639a7ba 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -17,11 +17,7 @@ struct mnt_namespace {
};
struct user_namespace *user_ns;
struct ucounts *ucounts;
- u64 seq; /* Sequence number to prevent loops */
- union {
- wait_queue_head_t poll;
- struct rcu_head mnt_ns_rcu;
- };
+ wait_queue_head_t poll;
u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
#ifdef CONFIG_FSNOTIFY
@@ -30,8 +26,6 @@ struct mnt_namespace {
#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
- struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
- struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
@@ -149,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry)
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
- refcount_inc(&ns->ns.count);
+ ns_ref_inc(ns);
}
extern seqlock_t mount_lock;
@@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->seq == 0;
+ return ns->ns.ns_id == 0;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa..44856b70ea3b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags
dentry->d_inode)
return -EISDIR;
+ /* No need to trigger automounts if mountpoint crossing is disabled. */
+ if (lookup_flags & LOOKUP_NO_XDEV)
+ return -EXDEV;
+
if (count && (*count)++ >= MAXSYMLINKS)
return -ELOOP;
@@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
/* Allow the filesystem to manage the transit without i_rwsem
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
+ if (lookup_flags & LOOKUP_NO_XDEV) {
+ ret = -EXDEV;
+ break;
+ }
ret = path->dentry->d_op->d_manage(path, false);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
@@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
// here we know it's positive
flags = path->dentry->d_flags;
need_mntput = true;
+ if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
+ ret = -EXDEV;
+ break;
+ }
continue;
}
}
@@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
- if (jumped) {
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- ret = -EXDEV;
- else
- nd->state |= ND_JUMPED;
- }
+ if (jumped)
+ nd->state |= ND_JUMPED;
if (unlikely(ret)) {
dput(path->dentry);
if (path->mnt != nd->path.mnt)
@@ -4828,7 +4836,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
return -EPERM;
/*
* Updating the link count will likely cause i_uid and i_gid to
- * be writen back improperly if their true value is unknown to
+ * be written back improperly if their true value is unknown to
* the vfs.
*/
if (HAS_UNMAPPED_ID(idmap, inode))
diff --git a/fs/namespace.c b/fs/namespace.c
index ddfd4457d338..dc01b14c58cd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -33,6 +33,7 @@
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
+#include <linux/nstree.h>
#include "pnode.h"
#include "internal.h"
@@ -65,6 +66,15 @@ static int __init set_mphash_entries(char *str)
}
__setup("mphash_entries=", set_mphash_entries);
+static char * __initdata initramfs_options;
+static int __init initramfs_options_setup(char *str)
+{
+ initramfs_options = str;
+ return 1;
+}
+
+__setup("initramfs_options=", initramfs_options_setup);
+
static u64 event;
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);
@@ -80,13 +90,10 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
-static DEFINE_SEQLOCK(mnt_ns_tree_lock);
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
-static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
-static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
enum mount_kattr_flags_t {
MOUNT_KATTR_RECURSE = (1 << 0),
@@ -119,59 +126,18 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
+ struct ns_common *ns;
+
if (!node)
return NULL;
- return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
-}
-
-static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
-{
- struct mnt_namespace *ns_a = node_to_mnt_ns(a);
- struct mnt_namespace *ns_b = node_to_mnt_ns(b);
- u64 seq_a = ns_a->seq;
- u64 seq_b = ns_b->seq;
-
- if (seq_a < seq_b)
- return -1;
- if (seq_a > seq_b)
- return 1;
- return 0;
-}
-
-static inline void mnt_ns_tree_write_lock(void)
-{
- write_seqlock(&mnt_ns_tree_lock);
-}
-
-static inline void mnt_ns_tree_write_unlock(void)
-{
- write_sequnlock(&mnt_ns_tree_lock);
-}
-
-static void mnt_ns_tree_add(struct mnt_namespace *ns)
-{
- struct rb_node *node, *prev;
-
- mnt_ns_tree_write_lock();
- node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->mnt_ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
- else
- list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
- mnt_ns_tree_write_unlock();
-
- WARN_ON_ONCE(node);
+ ns = rb_entry(node, struct ns_common, ns_tree_node);
+ return container_of(ns, struct mnt_namespace, ns);
}
static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
- if (refcount_dec_and_test(&ns->passive)) {
+ if (ns && refcount_dec_and_test(&ns->passive)) {
fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
@@ -181,32 +147,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
- mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
}
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
- if (!is_anon_ns(ns)) {
- mnt_ns_tree_write_lock();
- rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
- list_bidir_del_rcu(&ns->mnt_ns_list);
- mnt_ns_tree_write_unlock();
- }
-
- call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
-}
+ if (ns_tree_active(ns))
+ ns_tree_remove(ns);
-static int mnt_ns_find(const void *key, const struct rb_node *node)
-{
- const u64 mnt_ns_id = *(u64 *)key;
- const struct mnt_namespace *ns = node_to_mnt_ns(node);
-
- if (mnt_ns_id < ns->seq)
- return -1;
- if (mnt_ns_id > ns->seq)
- return 1;
- return 0;
+ call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu);
}
/*
@@ -225,28 +175,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node)
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
- struct rb_node *node;
- unsigned int seq;
+ struct mnt_namespace *mnt_ns;
+ struct ns_common *ns;
guard(rcu)();
- do {
- seq = read_seqbegin(&mnt_ns_tree_lock);
- node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
- if (node)
- break;
- } while (read_seqretry(&mnt_ns_tree_lock, seq));
-
- if (!node)
+ ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
+ if (!ns)
return NULL;
/*
* The last reference count is put with RCU delay so we can
* unconditonally acquire a reference here.
*/
- ns = node_to_mnt_ns(node);
- refcount_inc(&ns->passive);
- return ns;
+ mnt_ns = container_of(ns, struct mnt_namespace, ns);
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
static inline void lock_mount_hash(void)
@@ -1017,7 +960,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt)
return false;
seq = mnt->mnt_ns->seq_origin;
- return !seq || (seq == current->nsproxy->mnt_ns->seq);
+ return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
}
/*
@@ -1197,10 +1140,7 @@ static void commit_tree(struct mount *mnt)
if (!mnt_ns_attached(mnt)) {
for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
- if (unlikely(mnt_ns_attached(m)))
- m = skip_mnt_tree(m);
- else
- mnt_add_to_ns(n, m);
+ mnt_add_to_ns(n, m);
n->nr_mounts += n->pending_mounts;
n->pending_mounts = 0;
}
@@ -2155,19 +2095,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
+ struct ns_common *ns;
+
guard(rcu)();
for (;;) {
- struct list_head *list;
-
- if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
- else
- list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
- if (list_is_head(list, &mnt_ns_list))
- return ERR_PTR(-ENOENT);
+ ns = ns_tree_adjoined_rcu(mntns, previous);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
- mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+ mntns = to_mnt_ns(ns);
/*
* The last passive reference count is put with RCU
@@ -2182,7 +2119,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
* the mount namespace and it might already be on its
* deathbed.
*/
- if (!refcount_inc_not_zero(&mntns->ns.count))
+ if (!ns_ref_get(mntns))
continue;
return mntns;
@@ -2207,7 +2144,7 @@ static bool mnt_ns_loop(struct dentry *dentry)
if (!mnt_ns)
return false;
- return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+ return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
}
struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
@@ -2458,7 +2395,7 @@ struct vfsmount *clone_private_mount(const struct path *path)
return ERR_PTR(-EINVAL);
}
- if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (__has_locked_children(old_mnt, path->dentry))
@@ -2704,6 +2641,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
lock_mnt_tree(child);
q = __lookup_mnt(&child->mnt_parent->mnt,
child->mnt_mountpoint);
+ commit_tree(child);
if (q) {
struct mountpoint *mp = root.mp;
struct mount *r = child;
@@ -2713,7 +2651,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
mp = shorter;
mnt_change_mountpoint(r, mp, q);
}
- commit_tree(child);
}
unpin_mountpoint(&root);
unlock_mount_hash();
@@ -2862,6 +2799,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
return attach_recursive_mnt(mnt, p, mp);
}
+static int may_change_propagation(const struct mount *m)
+{
+ struct mnt_namespace *ns = m->mnt_ns;
+
+ // it must be mounted in some namespace
+ if (IS_ERR_OR_NULL(ns)) // is_mounted()
+ return -EINVAL;
+ // and the caller must be admin in userns of that namespace
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+ return 0;
+}
+
/*
* Sanity check the flags to change_mnt_propagation.
*/
@@ -2898,10 +2848,10 @@ static int do_change_type(struct path *path, int ms_flags)
return -EINVAL;
namespace_lock();
- if (!check_mnt(mnt)) {
- err = -EINVAL;
+ err = may_change_propagation(mnt);
+ if (err)
goto out_unlock;
- }
+
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
@@ -3070,7 +3020,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
if (is_anon_ns(src_mnt_ns))
ns->seq_origin = src_mnt_ns->seq_origin;
else
- ns->seq_origin = src_mnt_ns->seq;
+ ns->seq_origin = src_mnt_ns->ns.ns_id;
}
mnt = __do_loopback(path, recursive);
@@ -3279,7 +3229,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
-static int do_remount(struct path *path, int ms_flags, int sb_flags,
+static int do_remount(struct path *path, int sb_flags,
int mnt_flags, void *data)
{
int err;
@@ -3347,18 +3297,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
namespace_lock();
- err = -EINVAL;
- /* To and From must be mounted */
- if (!is_mounted(&from->mnt))
- goto out;
- if (!is_mounted(&to->mnt))
- goto out;
-
- err = -EPERM;
- /* We should be allowed to modify mount namespaces of both mounts */
- if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ err = may_change_propagation(from);
+ if (err)
goto out;
- if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ err = may_change_propagation(to);
+ if (err)
goto out;
err = -EINVAL;
@@ -3724,8 +3667,10 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
int error;
error = security_sb_kern_mount(sb);
- if (!error && mount_too_revealing(sb, &mnt_flags))
+ if (!error && mount_too_revealing(sb, &mnt_flags)) {
+ errorfcp(fc, "VFS", "Mount too revealing");
error = -EPERM;
+ }
if (unlikely(error)) {
fc_drop_locked(fc);
@@ -4109,7 +4054,7 @@ int path_mount(const char *dev_name, struct path *path,
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
return do_reconfigure_mnt(path, mnt_flags);
if (flags & MS_REMOUNT)
- return do_remount(path, flags, sb_flags, mnt_flags, data_page);
+ return do_remount(path, sb_flags, mnt_flags, data_page);
if (flags & MS_BIND)
return do_loopback(path, dev_name, flags & MS_REC);
if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
@@ -4148,20 +4093,11 @@ static void dec_mnt_namespaces(struct ucounts *ucounts)
static void free_mnt_ns(struct mnt_namespace *ns)
{
if (!is_anon_ns(ns))
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(ns->ucounts);
mnt_ns_tree_remove(ns);
}
-/*
- * Assign a sequence number so we can detect when we attempt to bind
- * mount a reference to an older mount namespace into the current
- * mount namespace, preventing reference counting loops. A 64bit
- * number incrementing at 10Ghz will take 12,427 years to wrap which
- * is effectively never, so we can ignore the possibility.
- */
-static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
-
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
@@ -4177,22 +4113,20 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
- if (!anon) {
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- dec_mnt_namespaces(ucounts);
- return ERR_PTR(ret);
- }
+
+ if (anon)
+ ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
+ else
+ ret = ns_common_init(new_ns);
+ if (ret) {
+ kfree(new_ns);
+ dec_mnt_namespaces(ucounts);
+ return ERR_PTR(ret);
}
- new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
- refcount_set(&new_ns->ns.count, 1);
+ ns_tree_gen_id(&new_ns->ns);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
- INIT_LIST_HEAD(&new_ns->mnt_ns_list);
- RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -4200,7 +4134,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
__latent_entropy
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
@@ -4231,7 +4165,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- ns_free_inum(&new_ns->ns);
+ ns_common_free(ns);
dec_mnt_namespaces(new_ns->ucounts);
mnt_ns_release(new_ns);
return ERR_CAST(new);
@@ -4278,7 +4212,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
- mnt_ns_tree_add(new_ns);
+ ns_tree_add_raw(new_ns);
return new_ns;
}
@@ -4441,7 +4375,7 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
ret = -EPERM;
if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
- pr_warn("VFS: Mount too revealing\n");
+ errorfcp(fc, "VFS", "Mount too revealing");
goto err_unlock;
}
@@ -4551,20 +4485,10 @@ SYSCALL_DEFINE5(move_mount,
if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
- lflags = 0;
- if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
- if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
uflags = 0;
- if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
- from_name = getname_maybe_null(from_pathname, uflags);
- if (IS_ERR(from_name))
- return PTR_ERR(from_name);
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH)
+ uflags = AT_EMPTY_PATH;
- lflags = 0;
- if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
- if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- uflags = 0;
- if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
to_name = getname_maybe_null(to_pathname, uflags);
if (IS_ERR(to_name))
return PTR_ERR(to_name);
@@ -4577,11 +4501,24 @@ SYSCALL_DEFINE5(move_mount,
to_path = fd_file(f_to)->f_path;
path_get(&to_path);
} else {
+ lflags = 0;
+ if (flags & MOVE_MOUNT_T_SYMLINKS)
+ lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
+ lflags |= LOOKUP_AUTOMOUNT;
ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
if (ret)
return ret;
}
+ uflags = 0;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH)
+ uflags = AT_EMPTY_PATH;
+
+ from_name = getname_maybe_null(from_pathname, uflags);
+ if (IS_ERR(from_name))
+ return PTR_ERR(from_name);
+
if (!from_name && from_dfd >= 0) {
CLASS(fd_raw, f_from)(from_dfd);
if (fd_empty(f_from))
@@ -4590,6 +4527,11 @@ SYSCALL_DEFINE5(move_mount,
return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
}
+ lflags = 0;
+ if (flags & MOVE_MOUNT_F_SYMLINKS)
+ lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
+ lflags |= LOOKUP_AUTOMOUNT;
ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
if (ret)
return ret;
@@ -4996,7 +4938,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
/*
@@ -5176,7 +5118,8 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
int ret;
struct mount_kattr kattr = {};
- kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & OPEN_TREE_CLONE)
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
if (flags & AT_RECURSIVE)
kattr.kflags |= MOUNT_KATTR_RECURSE;
@@ -5388,7 +5331,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
- s->sm.mnt_ns_id = ns->seq;
+ s->sm.mnt_ns_id = ns->ns.ns_id;
}
static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
@@ -5699,7 +5642,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
- struct path root __free(path_put) = {};
struct mount *m;
int err;
@@ -5711,7 +5653,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!s->mnt)
return -ENOENT;
- err = grab_requested_root(ns, &root);
+ err = grab_requested_root(ns, &s->root);
if (err)
return err;
@@ -5720,7 +5662,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
* mounts to show users.
*/
m = real_mount(s->mnt);
- if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -5728,8 +5670,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (err)
return err;
- s->root = root;
-
/*
* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
* can change concurrently as we only hold the read-side of the
@@ -5898,7 +5838,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
@@ -5951,28 +5891,40 @@ retry:
if (!ret)
ret = copy_statmount_to_user(ks);
kvfree(ks->seq.buf);
+ path_put(&ks->root);
if (retry_statmount(ret, &seq_size))
goto retry;
return ret;
}
-static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
- u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
- bool reverse)
+struct klistmount {
+ u64 last_mnt_id;
+ u64 mnt_parent_id;
+ u64 *kmnt_ids;
+ u32 nr_mnt_ids;
+ struct mnt_namespace *ns;
+ struct path root;
+};
+
+static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
- struct path root __free(path_put) = {};
+ struct mnt_namespace *ns = kls->ns;
+ u64 mnt_parent_id = kls->mnt_parent_id;
+ u64 last_mnt_id = kls->last_mnt_id;
+ u64 *mnt_ids = kls->kmnt_ids;
+ size_t nr_mnt_ids = kls->nr_mnt_ids;
struct path orig;
struct mount *r, *first;
ssize_t ret;
rwsem_assert_held(&namespace_sem);
- ret = grab_requested_root(ns, &root);
+ ret = grab_requested_root(ns, &kls->root);
if (ret)
return ret;
if (mnt_parent_id == LSMT_ROOT) {
- orig = root;
+ orig = kls->root;
} else {
orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
if (!orig.mnt)
@@ -5984,7 +5936,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -6017,14 +5969,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
return ret;
}
+static void __free_klistmount_free(const struct klistmount *kls)
+{
+ path_put(&kls->root);
+ kvfree(kls->kmnt_ids);
+ mnt_ns_release(kls->ns);
+}
+
+static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
+ size_t nr_mnt_ids)
+{
+
+ u64 last_mnt_id = kreq->param;
+
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
+
+ kls->last_mnt_id = last_mnt_id;
+
+ kls->nr_mnt_ids = nr_mnt_ids;
+ kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
+ GFP_KERNEL_ACCOUNT);
+ if (!kls->kmnt_ids)
+ return -ENOMEM;
+
+ kls->ns = grab_requested_mnt_ns(kreq);
+ if (!kls->ns)
+ return -ENOENT;
+
+ kls->mnt_parent_id = kreq->mnt_id;
+ return 0;
+}
+
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
- u64 *kmnt_ids __free(kvfree) = NULL;
+ struct klistmount kls __free(klistmount_free) = {};
const size_t maxcount = 1000000;
- struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
- u64 last_mnt_id;
ssize_t ret;
if (flags & ~LISTMOUNT_REVERSE)
@@ -6045,22 +6028,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- last_mnt_id = kreq.param;
- /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
- if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
- return -EINVAL;
-
- kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
- GFP_KERNEL_ACCOUNT);
- if (!kmnt_ids)
- return -ENOMEM;
-
- ns = grab_requested_mnt_ns(&kreq);
- if (!ns)
- return -ENOENT;
+ ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids);
+ if (ret)
+ return ret;
- if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
- !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
/*
@@ -6068,39 +6041,43 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* listmount() doesn't care about any mount properties.
*/
scoped_guard(rwsem_read, &namespace_sem)
- ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
- nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+ ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
- if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+ if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids)))
return -EFAULT;
return ret;
}
+struct mnt_namespace init_mnt_ns = {
+ .ns.inum = ns_init_inum(&init_mnt_ns),
+ .ns.ops = &mntns_operations,
+ .user_ns = &init_user_ns,
+ .ns.__ns_ref = REFCOUNT_INIT(1),
+ .ns.ns_type = ns_common_type(&init_mnt_ns),
+ .passive = REFCOUNT_INIT(1),
+ .mounts = RB_ROOT,
+ .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
+};
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mount *m;
- struct mnt_namespace *ns;
struct path root;
- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
+ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = alloc_mnt_ns(&init_user_ns, true);
- if (IS_ERR(ns))
- panic("Can't allocate initial namespace");
- ns->seq = atomic64_inc_return(&mnt_ns_seq);
- ns->ns.inum = PROC_MNT_INIT_INO;
m = real_mount(mnt);
- ns->root = m;
- ns->nr_mounts = 1;
- mnt_add_to_ns(ns, m);
- init_task.nsproxy->mnt_ns = ns;
- get_mnt_ns(ns);
+ init_mnt_ns.root = m;
+ init_mnt_ns.nr_mounts = 1;
+ mnt_add_to_ns(&init_mnt_ns, m);
+ init_task.nsproxy->mnt_ns = &init_mnt_ns;
+ get_mnt_ns(&init_mnt_ns);
root.mnt = mnt;
root.dentry = mnt->mnt_root;
@@ -6108,7 +6085,7 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
- mnt_ns_tree_add(ns);
+ ns_tree_add(&init_mnt_ns);
}
void __init mnt_init(void)
@@ -6148,7 +6125,7 @@ void __init mnt_init(void)
void put_mnt_ns(struct mnt_namespace *ns)
{
- if (!refcount_dec_and_test(&ns->ns.count))
+ if (!ns_ref_put(ns))
return;
namespace_lock();
emptied_ns = ns;
@@ -6397,7 +6374,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns)
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
- .type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 18b3dc74c70e..37ab6f28b5ad 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -369,7 +369,7 @@ void netfs_readahead(struct readahead_control *ractl)
return netfs_put_request(rreq, netfs_rreq_trace_put_return);
cleanup_free:
- return netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ return netfs_put_failed_request(rreq);
}
EXPORT_SYMBOL(netfs_readahead);
@@ -472,7 +472,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
alloc_error:
folio_unlock(folio);
return ret;
@@ -532,7 +532,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
alloc_error:
folio_unlock(folio);
return ret;
@@ -699,7 +699,7 @@ have_folio_no_wait:
return 0;
error_put:
- netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(rreq);
error:
if (folio) {
folio_unlock(folio);
@@ -754,7 +754,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
return ret < 0 ? ret : 0;
error_put:
- netfs_put_request(rreq, netfs_rreq_trace_put_discard);
+ netfs_put_failed_request(rreq);
error:
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index f27ea5099a68..09394ac2c180 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -347,7 +347,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_put(folio);
ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
if (ret < 0)
- goto error_folio_unlock;
+ goto out;
continue;
copied:
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index a05e13472baf..a498ee8d6674 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -131,6 +131,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
return -EIO;
}
@@ -205,7 +206,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
if (user_backed_iter(iter)) {
ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0);
if (ret < 0)
- goto out;
+ goto error_put;
rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec;
rreq->direct_bv_count = ret;
rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
@@ -238,6 +239,10 @@ out:
if (ret > 0)
orig_count -= ret;
return ret;
+
+error_put:
+ netfs_put_failed_request(rreq);
+ return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index a16660ab7f83..a9d1c3b2c084 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -57,7 +57,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
if (n < 0) {
ret = n;
- goto out;
+ goto error_put;
}
wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
wreq->direct_bv_count = n;
@@ -101,6 +101,10 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
out:
netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
+
+error_put:
+ netfs_put_failed_request(wreq);
+ return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index d4f16fefd965..4319611f5354 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -87,6 +87,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
void netfs_clear_subrequests(struct netfs_io_request *rreq);
void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
+void netfs_put_failed_request(struct netfs_io_request *rreq);
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
static inline void netfs_see_request(struct netfs_io_request *rreq,
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 20748bcfbf59..486166460e17 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -321,7 +321,7 @@ void netfs_wake_collector(struct netfs_io_request *rreq)
{
if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
!test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
- queue_work(system_unbound_wq, &rreq->work);
+ queue_work(system_dfl_wq, &rreq->work);
} else {
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
wake_up(&rreq->waitq);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e8c99738b5bb..b8c4918d3dcd 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -116,10 +116,8 @@ static void netfs_free_request_rcu(struct rcu_head *rcu)
netfs_stat_d(&netfs_n_rh_rreq);
}
-static void netfs_free_request(struct work_struct *work)
+static void netfs_deinit_request(struct netfs_io_request *rreq)
{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, cleanup_work);
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
@@ -149,6 +147,14 @@ static void netfs_free_request(struct work_struct *work)
if (atomic_dec_and_test(&ictx->io_count))
wake_up_var(&ictx->io_count);
+}
+
+static void netfs_free_request(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, cleanup_work);
+
+ netfs_deinit_request(rreq);
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
@@ -163,11 +169,29 @@ void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
dead = __refcount_dec_and_test(&rreq->ref, &r);
trace_netfs_rreq_ref(debug_id, r - 1, what);
if (dead)
- WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work));
+ WARN_ON(!queue_work(system_dfl_wq, &rreq->cleanup_work));
}
}
/*
+ * Free a request (synchronously) that was just allocated but has
+ * failed before it could be submitted.
+ */
+void netfs_put_failed_request(struct netfs_io_request *rreq)
+{
+ int r = refcount_read(&rreq->ref);
+
+ /* new requests have two references (see
+ * netfs_alloc_request(), and this function is only allowed on
+ * new request objects
+ */
+ WARN_ON_ONCE(r != 2);
+
+ trace_netfs_rreq_ref(rreq->debug_id, r, netfs_rreq_trace_put_failed);
+ netfs_free_request(&rreq->cleanup_work);
+}
+
+/*
* Allocate and partially initialise an I/O request structure.
*/
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index 3e804da1e1eb..a95e7aadafd0 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -281,8 +281,10 @@ reassess:
} else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) {
notes |= MADE_PROGRESS;
} else {
- if (!stream->failed)
+ if (!stream->failed) {
stream->transferred += transferred;
+ stream->transferred_valid = true;
+ }
if (front->transferred < front->len)
set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags);
notes |= MADE_PROGRESS;
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index 8097bc069c1d..a1489aa29f78 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -118,7 +118,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
return creq;
cancel_put:
- netfs_put_request(creq, netfs_rreq_trace_put_return);
+ netfs_put_failed_request(creq);
cancel:
rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index fa622a6cd56d..5c0dc4efc792 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -189,7 +189,7 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite
return ret;
cleanup_free:
- netfs_put_request(rreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(rreq);
return ret;
}
EXPORT_SYMBOL(netfs_read_single);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 0f3a36852a4d..cbf3d9194c7b 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -254,6 +254,7 @@ reassess_streams:
if (front->start + front->transferred > stream->collected_to) {
stream->collected_to = front->start + front->transferred;
stream->transferred = stream->collected_to - wreq->start;
+ stream->transferred_valid = true;
notes |= MADE_PROGRESS;
}
if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
@@ -356,6 +357,7 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
{
struct netfs_inode *ictx = netfs_inode(wreq->inode);
size_t transferred;
+ bool transferred_valid = false;
int s;
_enter("R=%x", wreq->debug_id);
@@ -376,12 +378,16 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
continue;
if (!list_empty(&stream->subrequests))
return false;
- if (stream->transferred < transferred)
+ if (stream->transferred_valid &&
+ stream->transferred < transferred) {
transferred = stream->transferred;
+ transferred_valid = true;
+ }
}
/* Okay, declare that all I/O is complete. */
- wreq->transferred = transferred;
+ if (transferred_valid)
+ wreq->transferred = transferred;
trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
if (wreq->io_streams[1].active &&
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 50bee2c4130d..dd8743bc8d7f 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -118,12 +118,12 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
wreq->io_streams[0].issue_write = ictx->ops->issue_write;
wreq->io_streams[0].collected_to = start;
- wreq->io_streams[0].transferred = LONG_MAX;
+ wreq->io_streams[0].transferred = 0;
wreq->io_streams[1].stream_nr = 1;
wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
wreq->io_streams[1].collected_to = start;
- wreq->io_streams[1].transferred = LONG_MAX;
+ wreq->io_streams[1].transferred = 0;
if (fscache_resources_valid(&wreq->cache_resources)) {
wreq->io_streams[1].avail = true;
wreq->io_streams[1].active = true;
@@ -133,8 +133,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
return wreq;
nomem:
- wreq->error = -ENOMEM;
- netfs_put_request(wreq, netfs_rreq_trace_put_failed);
+ netfs_put_failed_request(wreq);
return ERR_PTR(-ENOMEM);
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8fb4a950dd55..4e3dcc157a83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (fsinfo->xattr_support)
server->caps |= NFS_CAP_XATTR;
+ else
+ server->caps &= ~NFS_CAP_XATTR;
#endif
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 86e36c630f09..8059ece82468 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/gfp.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/compaction.h>
@@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL_GPL(nfs_file_fsync);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to)
+{
+ struct folio *folio;
+
+ if (from >= to)
+ return;
+
+ folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+
+ if (folio_test_uptodate(folio)) {
+ loff_t fpos = folio_pos(folio);
+ size_t offset = from - fpos;
+ size_t end = folio_size(folio);
+
+ if (to - fpos < end)
+ end = to - fpos;
+ folio_zero_segment(folio, offset, end);
+ trace_nfs_size_truncate_folio(mapping->host, to);
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(nfs_truncate_last_folio);
+
/*
* Decide whether a read/modify/write cycle may be more efficient
* then a modify/write/read cycle when writing to a page in the
@@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb,
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
+ nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos);
fgp |= fgf_set_order(len);
start:
@@ -442,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
- if (offset != 0 || length < folio_size(folio))
- return;
/* Cancel any unstarted writes on this page */
- nfs_wb_folio_cancel(inode, folio);
+ if (offset != 0 || length < folio_size(folio))
+ nfs_wb_folio(inode, folio);
+ else
+ nfs_wb_folio_cancel(inode, folio);
folio_wait_private_2(folio); /* [DEPRECATED] */
trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length);
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 8dc921d83538..9edb5f9b0c4e 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
struct pnfs_layout_segment *l2)
{
const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
- const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
u32 i;
if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
@@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
if (check_device &&
- nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
+ nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) {
+ // reinitialize the error state in case if this is the last iteration
+ ds = ERR_PTR(-EINVAL);
continue;
+ }
*best_idx = idx;
break;
@@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
- if (ds)
+ if (!IS_ERR(ds))
return ds;
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}
@@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
best_idx);
- if (ds || !pgio->pg_mirror_idx)
+ if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}
@@ -868,7 +871,7 @@ retry:
req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
+ struct nfs4_pnfs_ds *ds;
- if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
- ff_layout_send_layouterror(hdr->lseg);
- else
+ ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
+ if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
+ else
+ ff_layout_send_layouterror(hdr->lseg);
pnfs_read_resend_pnfs(hdr, new_idx);
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 338ef77ae423..9bdaf7f38bed 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
int nfs_drop_inode(struct inode *inode)
{
- return NFS_STALE(inode) || generic_drop_inode(inode);
+ return NFS_STALE(inode) || inode_generic_drop(inode);
}
EXPORT_SYMBOL_GPL(nfs_drop_inode);
@@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode),
nfs_display_fhandle_hash(fh),
- atomic_read(&inode->i_count));
+ icount_read(inode));
out:
return inode;
@@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
+ loff_t oldsize = i_size_read(inode);
int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (error)
return error;
- if (attr->ia_size == i_size_read(inode))
+ if (attr->ia_size == oldsize)
attr->ia_valid &= ~ATTR_SIZE;
}
@@ -767,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_nfs_setattr_enter(inode);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ nfs_file_block_o_direct(NFS_I(inode));
nfs_sync_inode(inode);
+ }
fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
if (fattr == NULL) {
@@ -777,8 +780,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
- if (error == 0)
+ if (error == 0) {
+ if (attr->ia_valid & ATTR_SIZE)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ attr->ia_size);
error = nfs_refresh_inode(inode, fattr);
+ }
nfs_free_fattr(fattr);
out:
trace_nfs_setattr_exit(inode, error);
@@ -2229,7 +2236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
nfs_display_fhandle_hash(NFS_FH(inode)),
- atomic_read(&inode->i_count), fattr->valid);
+ icount_read(inode), fattr->valid);
if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
/* Only a mounted-on-fileid? Just exit */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 74d712b58423..c0a44f389f8f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
int nfs_check_flags(int);
+void nfs_truncate_last_folio(struct address_space *mapping, loff_t from,
+ loff_t to);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
@@ -530,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
}
+/* Must be called with exclusively locked inode->i_rwsem */
+static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(&nfsi->vfs_inode);
+ }
+}
+
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
index 3388faf2acb9..d275b0a250bf 100644
--- a/fs/nfs/io.c
+++ b/fs/nfs/io.c
@@ -14,15 +14,6 @@
#include "internal.h"
-/* Call with exclusively locked inode->i_rwsem */
-static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
-{
- if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
- clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
- inode_dio_wait(inode);
- }
-}
-
/**
* nfs_start_io_read - declare the file is being used for buffered reads
* @inode: file inode
@@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (err)
return err;
- nfs_block_o_direct(nfsi, inode);
+ nfs_file_block_o_direct(nfsi);
downgrade_write(&inode->i_rwsem);
return 0;
@@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode)
err = down_write_killable(&inode->i_rwsem);
if (!err)
- nfs_block_o_direct(NFS_I(inode), inode);
+ nfs_file_block_o_direct(NFS_I(inode));
return err;
}
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index bd5fca285899..97abf62f109d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp)
return;
}
- if (nfs_client_is_local(clp)) {
- /* If already enabled, disable and re-enable */
- nfs_localio_disable_client(clp);
- }
+ if (nfs_client_is_local(clp))
+ return;
if (!nfs_uuid_begin(&clp->cl_uuid))
return;
@@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
case -ENOMEM:
case -ENXIO:
case -ENOENT:
- /* Revalidate localio, will disable if unsupported */
+ /* Revalidate localio */
+ nfs_localio_disable_client(clp);
nfs_local_probe(clp);
}
}
@@ -453,12 +452,13 @@ static void nfs_local_call_read(struct work_struct *work)
nfs_local_iter_init(&iter, iocb, READ);
status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+
+ revert_creds(save_cred);
+
if (status != -EIOCBQUEUED) {
nfs_local_read_done(iocb, status);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
}
static int
@@ -648,14 +648,15 @@ static void nfs_local_call_write(struct work_struct *work)
file_start_write(filp);
status = filp->f_op->write_iter(&iocb->kiocb, &iter);
file_end_write(filp);
+
+ revert_creds(save_cred);
+ current->flags = old_flags;
+
if (status != -EIOCBQUEUED) {
nfs_local_write_done(iocb, status);
nfs_local_vfs_getattr(iocb);
nfs_local_pgio_release(iocb);
}
-
- revert_creds(save_cred);
- current->flags = old_flags;
}
static int
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7f1ec9c67ff2..f9a3a1fbf44c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -335,7 +335,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
num *= HZ;
*((int *)kp->arg) = num;
if (!list_empty(&nfs_automount_list))
- mod_delayed_work(system_wq, &nfs_automount_task, num);
+ mod_delayed_work(system_percpu_wq, &nfs_automount_task, num);
} else {
*((int *)kp->arg) = -1*HZ;
cancel_delayed_work(&nfs_automount_task);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 01c01f45358b..6a0b5871ba3b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
exception.inode = inode;
exception.state = lock->open_context->state;
+ nfs_file_block_o_direct(NFS_I(inode));
err = nfs_sync_inode(inode);
if (err)
goto out;
@@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
@@ -145,7 +147,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == -EOPNOTSUPP)
+
+ if (err == 0)
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
+ else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
NFS_CAP_ZERO_RANGE);
@@ -183,6 +189,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
};
struct inode *inode = file_inode(filep);
+ loff_t oldsize = i_size_read(inode);
int err;
if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
@@ -191,9 +198,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
- if (err == 0)
+ if (err == 0) {
+ nfs_truncate_last_folio(inode->i_mapping, oldsize,
+ offset + len);
truncate_pagecache_range(inode, offset, (offset + len) -1);
- if (err == -EOPNOTSUPP)
+ } else if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
inode_unlock(inode);
@@ -354,22 +363,27 @@ out:
/**
* nfs42_copy_dest_done - perform inode cache updates after clone/copy offload
- * @inode: pointer to destination inode
+ * @file: pointer to destination file
* @pos: destination offset
* @len: copy length
+ * @oldsize: length of the file prior to clone/copy
*
* Punch a hole in the inode page cache, so that the NFS client will
* know to retrieve new data.
* Update the file size if necessary, and then mark the inode as having
* invalid cached values for change attribute, ctime, mtime and space used.
*/
-static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len)
+static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len,
+ loff_t oldsize)
{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
loff_t newsize = pos + len;
loff_t end = newsize - 1;
- WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_SHIFT, end >> PAGE_SHIFT));
+ nfs_truncate_last_folio(mapping, oldsize, pos);
+ WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT));
spin_lock(&inode->i_lock);
if (newsize > i_size_read(inode))
@@ -402,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
struct nfs_server *src_server = NFS_SERVER(src_inode);
loff_t pos_src = args->src_pos;
loff_t pos_dst = args->dst_pos;
+ loff_t oldsize_dst = i_size_read(dst_inode);
size_t count = args->count;
ssize_t status;
@@ -430,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
+ nfs_file_block_o_direct(NFS_I(dst_inode));
status = nfs_sync_inode(dst_inode);
if (status)
return status;
@@ -475,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
goto out;
}
- nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count);
+ nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst);
nfs_invalidate_atime(src_inode);
status = res->write_res.count;
out:
@@ -1242,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
struct nfs42_clone_res res = {
.server = server,
};
+ loff_t oldsize_dst = i_size_read(dst_inode);
int status;
msg->rpc_argp = &args;
@@ -1276,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
/* a zero-length count means clone to EOF in src */
if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE)
count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset;
- nfs42_copy_dest_done(dst_inode, dst_offset, count);
+ nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst);
status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1d6b5f4230c9..c9a0d1e420c6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
lock_two_nondirectories(src_inode, dst_inode);
/* flush all pending writes on both src and dst so that server
* has the latest data */
+ nfs_file_block_o_direct(NFS_I(src_inode));
ret = nfs_sync_inode(src_inode);
if (ret)
goto out_unlock;
+ nfs_file_block_o_direct(NFS_I(dst_inode));
ret = nfs_sync_inode(dst_inode);
if (ret)
goto out_unlock;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7d2b67e06cc3..ce61253efd45 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4013,8 +4013,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
res.attr_bitmask[2];
}
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
- server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
- NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+ server->caps &=
+ ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS |
+ NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS |
+ NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME);
server->fattr_valid = NFS_ATTR_FATTR_V4;
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
@@ -4092,7 +4094,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
};
int err;
- nfs_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index db3811af0796..18ae614e5a6c 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
timeout = 5 * HZ;
dprintk("%s: requeueing work. Lease period = %ld\n",
__func__, (timeout + HZ - 1) / HZ);
- mod_delayed_work(system_wq, &clp->cl_renewd, timeout);
+ mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout);
set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
spin_unlock(&clp->cl_lock);
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 96b1323318c2..627115179795 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class,
TP_ARGS(inode, new_size))
DEFINE_NFS_UPDATE_SIZE_EVENT(truncate);
+DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio);
DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
DEFINE_NFS_UPDATE_SIZE_EVENT(update);
DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 11968dcb7243..6e69ce43a13f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req)
nfs_page_clear_headlock(req);
}
-/*
- * nfs_page_group_sync_on_bit_locked
+/**
+ * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set
+ * @req: request in page group
+ * @bit: PG_* bit that is used to sync page group
*
* must be called with page group lock held
*/
-static bool
-nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
{
struct nfs_page *head = req->wb_head;
struct nfs_page *tmp;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fa5c41d0989a..647c53d1418a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
}
}
-static int
-nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
{
- int ret;
-
- if (!test_bit(PG_REMOVE, &req->wb_flags))
- return 0;
- ret = nfs_page_group_lock(req);
- if (ret)
- return ret;
if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
nfs_page_set_inode_ref(req, inode);
- nfs_page_group_unlock(req);
- return 0;
}
/**
@@ -247,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error)
}
/*
- * nfs_page_group_search_locked
- * @head - head request of page group
- * @page_offset - offset into page
- *
- * Search page group with head @head to find a request that contains the
- * page offset @page_offset.
- *
- * Returns a pointer to the first matching nfs request, or NULL if no
- * match is found.
- *
- * Must be called with the page group lock held
- */
-static struct nfs_page *
-nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
-{
- struct nfs_page *req;
-
- req = head;
- do {
- if (page_offset >= req->wb_pgbase &&
- page_offset < (req->wb_pgbase + req->wb_bytes))
- return req;
-
- req = req->wb_this_page;
- } while (req != head);
-
- return NULL;
-}
-
-/*
- * nfs_page_group_covers_page
- * @head - head request of page group
+ * nfs_page_covers_folio
+ * @req: struct nfs_page
*
- * Return true if the page group with head @head covers the whole page,
- * returns false otherwise
+ * Return true if the request covers the whole folio.
+ * Note that the caller should ensure all subrequests have been joined
*/
static bool nfs_page_group_covers_page(struct nfs_page *req)
{
unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
- struct nfs_page *tmp;
- unsigned int pos = 0;
-
- nfs_page_group_lock(req);
- for (;;) {
- tmp = nfs_page_group_search_locked(req->wb_head, pos);
- if (!tmp)
- break;
- pos = tmp->wb_pgbase + tmp->wb_bytes;
- }
-
- nfs_page_group_unlock(req);
- return pos >= len;
+ return req->wb_pgbase == 0 && req->wb_bytes == len;
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -585,19 +533,18 @@ retry:
}
}
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ goto out_unlock;
+
/* Ensure that nobody removed the request before we locked it */
if (head != folio->private) {
+ nfs_page_group_unlock(head);
nfs_unlock_and_release_request(head);
goto retry;
}
- ret = nfs_cancel_remove_inode(head, inode);
- if (ret < 0)
- goto out_unlock;
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto out_unlock;
+ nfs_cancel_remove_inode(head, inode);
/* lock each request in the page group */
for (subreq = head->wb_this_page;
@@ -786,7 +733,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
- if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ nfs_page_group_lock(req);
+ if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio->mapping;
@@ -798,6 +746,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
spin_unlock(&mapping->i_private_lock);
}
+ nfs_page_group_unlock(req);
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
atomic_long_dec(&nfsi->nrequests);
@@ -2054,6 +2003,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
* release it */
nfs_inode_remove_request(req);
nfs_unlock_and_release_request(req);
+ folio_cancel_dirty(folio);
}
return ret;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 732abf6b92a5..85ca663c052c 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -113,7 +113,7 @@ static void
nfsd_file_schedule_laundrette(void)
{
if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
- queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette,
+ queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette,
NFSD_LAUNDRETTE_DELAY);
}
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index 4f6468eb2adf..cb237f1b902a 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -103,10 +103,11 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
if (nfsd_file_get(new) == NULL)
goto again;
/*
- * Drop the ref we were going to install and the
- * one we were going to return.
+ * Drop the ref we were going to install (both file and
+ * net) and the one we were going to return (only file).
*/
nfsd_file_put(localio);
+ nfsd_net_put(net);
nfsd_file_put(localio);
localio = new;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 98ab55ba3ced..edf050766e57 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -470,7 +470,15 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
if (!iap->ia_valid)
return 0;
- iap->ia_valid |= ATTR_CTIME;
+ /*
+ * If ATTR_DELEG is set, then this is an update from a client that
+ * holds a delegation. If this is an update for only the atime, the
+ * ctime should not be changed. If the update contains the mtime
+ * too, then ATTR_CTIME should already be set.
+ */
+ if (!(iap->ia_valid & ATTR_DELEG))
+ iap->ia_valid |= ATTR_CTIME;
+
return notify_change(&nop_mnt_idmap, dentry, iap, NULL);
}
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 14868a3dd592..bc52afbfc5c7 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
************************************************************************/
static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d.%d\n",
NILFS_CURRENT_REV, NILFS_MINOR_REV);
@@ -1087,7 +1087,7 @@ static const char features_readme_str[] =
"(1) revision\n\tshow current revision of NILFS file system driver.\n";
static ssize_t nilfs_feature_README_show(struct kobject *kobj,
- struct attribute *attr,
+ struct kobj_attribute *attr,
char *buf)
{
return sysfs_emit(buf, features_readme_str);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 78a87a016928..d370cd5cce3f 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups {
struct completion sg_segments_kobj_unregister;
};
-#define NILFS_COMMON_ATTR_STRUCT(name) \
+#define NILFS_KOBJ_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
struct attribute attr; \
- ssize_t (*show)(struct kobject *, struct attribute *, \
+ ssize_t (*show)(struct kobject *, struct kobj_attribute *, \
char *); \
- ssize_t (*store)(struct kobject *, struct attribute *, \
+ ssize_t (*store)(struct kobject *, struct kobj_attribute *, \
const char *, size_t); \
}
-NILFS_COMMON_ATTR_STRUCT(feature);
+NILFS_KOBJ_ATTR_STRUCT(feature);
#define NILFS_DEV_ATTR_STRUCT(name) \
struct nilfs_##name##_attr { \
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 079b868552c2..46bfc543f946 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* removed all zero refcount inodes, in any case. Test to
* be sure.
*/
- if (!atomic_read(&inode->i_count)) {
+ if (!icount_read(inode)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 798340db69d7..55a03bb05aa1 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -428,7 +428,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
conn->destroy_next = connector_destroy_list;
connector_destroy_list = conn;
spin_unlock(&destroy_lock);
- queue_work(system_unbound_wq, &connector_reaper_work);
+ queue_work(system_dfl_wq, &connector_reaper_work);
}
/*
* Note that we didn't update flags telling whether inode cares about
@@ -439,7 +439,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
+ queue_delayed_work(system_dfl_wq, &reaper_work,
FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 59aa801347a7..e7fd8a790aaa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -13,12 +13,26 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <linux/exportfs.h>
+#include <linux/nstree.h>
+#include <net/net_namespace.h>
#include "mount.h"
#include "internal.h"
static struct vfsmount *nsfs_mnt;
+static struct path nsfs_root_path = {};
+
+void nsfs_get_root(struct path *path)
+{
+ *path = nsfs_root_path;
+ path_get(path);
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = {
@@ -139,7 +153,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
* the size value will be set to the size the kernel knows about.
*/
kinfo->size = min(usize, sizeof(*kinfo));
- kinfo->mnt_ns_id = mnt_ns->seq;
+ kinfo->mnt_ns_id = mnt_ns->ns.ns_id;
kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
/* Subtract the root mount of the mount namespace. */
if (kinfo->nr_mounts)
@@ -163,15 +177,18 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
case NS_GET_TGID_FROM_PIDNS:
case NS_GET_PID_IN_PIDNS:
case NS_GET_TGID_IN_PIDNS:
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ case NS_GET_ID:
+ return true;
}
/* Extensible ioctls require some extra handling. */
switch (_IOC_NR(cmd)) {
case _IOC_NR(NS_MNT_GET_INFO):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_NEXT):
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0);
case _IOC_NR(NS_MNT_GET_PREV):
- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0);
}
return false;
@@ -202,26 +219,14 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
return -EINVAL;
return open_related_ns(ns, ns->ops->get_parent);
case NS_GET_NSTYPE:
- return ns->ops->type;
+ return ns->ns_type;
case NS_GET_OWNER_UID:
- if (ns->ops->type != CLONE_NEWUSER)
+ if (ns->ns_type != CLONE_NEWUSER)
return -EINVAL;
user_ns = container_of(ns, struct user_namespace, ns);
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
- case NS_GET_MNTNS_ID: {
- __u64 __user *idp;
- __u64 id;
-
- if (ns->ops->type != CLONE_NEWNS)
- return -EINVAL;
-
- mnt_ns = container_of(ns, struct mnt_namespace, ns);
- idp = (__u64 __user *)arg;
- id = mnt_ns->seq;
- return put_user(id, idp);
- }
case NS_GET_PID_FROM_PIDNS:
fallthrough;
case NS_GET_TGID_FROM_PIDNS:
@@ -229,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
case NS_GET_PID_IN_PIDNS:
fallthrough;
case NS_GET_TGID_IN_PIDNS: {
- if (ns->ops->type != CLONE_NEWPID)
+ if (ns->ns_type != CLONE_NEWPID)
return -EINVAL;
ret = -ESRCH;
@@ -267,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
ret = -ESRCH;
return ret;
}
+ case NS_GET_MNTNS_ID:
+ if (ns->ns_type != CLONE_NEWNS)
+ return -EINVAL;
+ fallthrough;
+ case NS_GET_ID: {
+ __u64 __user *idp;
+ __u64 id;
+
+ idp = (__u64 __user *)arg;
+ id = ns->ns_id;
+ return put_user(id, idp);
+ }
}
/* extensible ioctls */
@@ -276,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (!uinfo)
@@ -297,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct file *f __free(fput) = NULL;
size_t usize = _IOC_SIZE(ioctl);
- if (ns->ops->type != CLONE_NEWNS)
+ if (ns->ns_type != CLONE_NEWNS)
return -EINVAL;
if (usize < MNT_NS_INFO_SIZE_VER0)
@@ -415,12 +432,164 @@ static const struct stashed_operations nsfs_stashed_ops = {
.put_data = nsfs_put_data,
};
+#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32))
+#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32))
+
+static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct ns_common *ns = inode->i_private;
+ int len = *max_len;
+
+ if (parent)
+ return FILEID_INVALID;
+
+ if (len < NSFS_FID_SIZE_U32_VER0) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ return FILEID_INVALID;
+ } else if (len > NSFS_FID_SIZE_U32_LATEST) {
+ *max_len = NSFS_FID_SIZE_U32_LATEST;
+ }
+
+ fid->ns_id = ns->ns_id;
+ fid->ns_type = ns->ns_type;
+ fid->ns_inum = inode->i_ino;
+ return FILEID_NSFS;
+}
+
+static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ struct path path __free(path_put) = {};
+ struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh;
+ struct user_namespace *owning_ns = NULL;
+ struct ns_common *ns;
+ int ret;
+
+ if (fh_len < NSFS_FID_SIZE_U32_VER0)
+ return NULL;
+
+ /* Check that any trailing bytes are zero. */
+ if ((fh_len > NSFS_FID_SIZE_U32_LATEST) &&
+ memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0,
+ fh_len - NSFS_FID_SIZE_U32_LATEST))
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_NSFS:
+ break;
+ default:
+ return NULL;
+ }
+
+ scoped_guard(rcu) {
+ ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
+ if (!ns)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
+ VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
+ VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum);
+
+ if (!__ns_ref_get(ns))
+ return NULL;
+ }
+
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ if (!current_in_namespace(to_cg_ns(ns)))
+ owning_ns = to_cg_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ if (!current_in_namespace(to_ipc_ns(ns)))
+ owning_ns = to_ipc_ns(ns)->user_ns;
+ break;
+#endif
+ case CLONE_NEWNS:
+ if (!current_in_namespace(to_mnt_ns(ns)))
+ owning_ns = to_mnt_ns(ns)->user_ns;
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ if (!current_in_namespace(to_net_ns(ns)))
+ owning_ns = to_net_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ if (!current_in_namespace(to_pid_ns(ns))) {
+ owning_ns = to_pid_ns(ns)->user_ns;
+ } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ if (!current_in_namespace(to_time_ns(ns)))
+ owning_ns = to_time_ns(ns)->user_ns;
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ if (!current_in_namespace(to_user_ns(ns)))
+ owning_ns = to_user_ns(ns);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ if (!current_in_namespace(to_uts_ns(ns)))
+ owning_ns = to_uts_ns(ns)->user_ns;
+ break;
+#endif
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) {
+ ns->ops->put(ns);
+ return ERR_PTR(-EPERM);
+ }
+
+ /* path_from_stashed() unconditionally consumes the reference. */
+ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return no_free_ptr(path.dentry);
+}
+
+static int nsfs_export_permission(struct handle_to_path_ctx *ctx,
+ unsigned int oflags)
+{
+ /* nsfs_fh_to_dentry() performs all permission checks. */
+ return 0;
+}
+
+static struct file *nsfs_export_open(struct path *path, unsigned int oflags)
+{
+ return file_open_root(path, "", oflags, 0);
+}
+
+static const struct export_operations nsfs_export_operations = {
+ .encode_fh = nsfs_encode_fh,
+ .fh_to_dentry = nsfs_fh_to_dentry,
+ .open = nsfs_export_open,
+ .permission = nsfs_export_permission,
+};
+
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
ctx->ops = &nsfs_ops;
+ ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
@@ -438,4 +607,6 @@ void __init nsfs_init(void)
if (IS_ERR(nsfs_mnt))
panic("can't set nsfs up\n");
nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
+ nsfs_root_path.mnt = nsfs_mnt;
+ nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2018501b2249..2347a50f079b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1876,7 +1876,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
dlm_debug_init(dlm);
snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
- dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0);
+ dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 5130ec44e5e1..cccaa1d6fbba 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = {
.alloc_inode = dlmfs_alloc_inode,
.free_inode = dlmfs_free_inode,
.evict_inode = dlmfs_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct inode_operations dlmfs_file_inode_operations = {
@@ -595,7 +595,8 @@ static int __init init_dlmfs_fs(void)
}
cleanup_inode = 1;
- user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
+ user_dlm_worker = alloc_workqueue("user_dlm",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 930150ed5db1..ef147e8b3271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -706,6 +706,8 @@ out:
* it not only handles the fiemap for inlined files, but also deals
* with the fast symlink, cause they have no difference for extent
* mapping per se.
+ *
+ * Must be called with ip_alloc_sem semaphore held.
*/
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
@@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
u64 phys;
u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ lockdep_assert_held_read(&oi->ip_alloc_sem);
di = (struct ocfs2_dinode *)di_bh->b_data;
if (ocfs2_inode_is_fast_symlink(inode))
@@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
phys += offsetof(struct ocfs2_dinode,
id2.i_data.id_data);
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0)
return ret;
}
@@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
-
+ /* Release the ip_alloc_sem to prevent deadlock on page fault */
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
len_bytes, fe_flags);
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret)
break;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 14bf440ea4df..6c4f78f473fb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode)
* the journal is flushed before journal shutdown. Thus it is safe to
* have inodes get cleaned up after journal shutdown.
*/
+ if (!osb->journal)
+ return;
+
jbd2_journal_release_jbd_inode(osb->journal->j_journal,
&oi->ip_jinode);
}
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index f3da840758e7..b46100a4f529 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = {
.free_inode = orangefs_free_inode,
.destroy_inode = orangefs_destroy_inode,
.write_inode = orangefs_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.statfs = orangefs_statfs,
.show_options = orangefs_show_options,
};
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 70b8687dc45e..dbd63a74df4b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -225,7 +225,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr)
{
struct dentry *ret;
- inode_lock(workdir->d_inode);
+ inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT);
ret = ovl_create_real(ofs, workdir,
ovl_lookup_temp(ofs, workdir), attr);
inode_unlock(workdir->d_inode);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index df85a76597e9..bd3d7ba8fb95 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = {
.alloc_inode = ovl_alloc_inode,
.free_inode = ovl_free_inode,
.destroy_inode = ovl_destroy_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = ovl_put_super,
.sync_fs = ovl_sync_fs,
.statfs = ovl_statfs,
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index a33115e7384c..41033bac96cb 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1552,7 +1552,8 @@ void ovl_copyattr(struct inode *inode)
int ovl_parent_lock(struct dentry *parent, struct dentry *child)
{
inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
- if (!child || child->d_parent == parent)
+ if (!child ||
+ (!d_unhashed(child) && child->d_parent == parent))
return 0;
inode_unlock(parent->d_inode);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index edc35522d75c..c40c29c702e5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -296,12 +296,12 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ struct task_struct *task __free(put_task) = NULL;
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
- struct task_struct *task;
struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
* erronously mistook the file descriptor for a pidfd.
* This is not perfect but will catch most cases.
*/
- return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+ return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
}
return false;
@@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode)
}
static const struct super_operations pidfs_sops = {
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pidfs_evict_inode,
.statfs = simple_statfs,
};
diff --git a/fs/pipe.c b/fs/pipe.c
index 731622d0738d..42fead1efe52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&pipe->mutex);
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
@@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
diff --git a/fs/pnode.c b/fs/pnode.c
index 81f7599bdac4..6f7d02f3fa98 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -111,7 +111,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
return;
}
if (IS_MNT_SHARED(mnt)) {
- m = propagation_source(mnt);
+ if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
+ m = propagation_source(mnt);
if (list_empty(&mnt->mnt_share)) {
mnt_release_group_id(mnt);
} else {
@@ -637,10 +638,11 @@ void propagate_umount(struct list_head *set)
}
// now to_umount consists of all acceptable candidates
- // deal with reparenting of remaining overmounts on those
+ // deal with reparenting of surviving overmounts on those
list_for_each_entry(m, &to_umount, mnt_list) {
- if (m->overmount)
- reparent(m->overmount);
+ struct mount *over = m->overmount;
+ if (over && !will_be_unmounted(over))
+ reparent(over);
}
// and fold them into the set
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d6a0369caa93..69269745d73b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -157,13 +157,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
unsigned int max_fds = 0;
rcu_read_lock();
- ppid = pid_alive(p) ?
- task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-
tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
+ ppid = task_ppid_nr_ns(p, ns);
tgid = task_tgid_nr_ns(p, ns);
ngid = task_numa_group_id(p);
cred = get_task_cred(p);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76e800e38c8f..176281112273 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = {
.setattr = proc_notify_change,
};
+static void pde_set_flags(struct proc_dir_entry *pde)
+{
+ const struct proc_ops *proc_ops = pde->proc_ops;
+
+ if (!proc_ops)
+ return;
+
+ if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+ if (proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
+}
+
/* returns the registered entry, or frees dp and returns NULL on failure */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
struct proc_dir_entry *dp)
@@ -374,6 +393,9 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
if (proc_alloc_inum(&dp->low_ino))
goto out_free_entry;
+ if (!S_ISDIR(dp->mode))
+ pde_set_flags(dp);
+
write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
@@ -561,20 +583,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static void pde_set_flags(struct proc_dir_entry *pde)
-{
- if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
- pde->flags |= PROC_ENTRY_PERMANENT;
- if (pde->proc_ops->proc_read_iter)
- pde->flags |= PROC_ENTRY_proc_read_iter;
-#ifdef CONFIG_COMPAT
- if (pde->proc_ops->proc_compat_ioctl)
- pde->flags |= PROC_ENTRY_proc_compat_ioctl;
-#endif
- if (pde->proc_ops->proc_lseek)
- pde->flags |= PROC_ENTRY_proc_lseek;
-}
-
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct proc_ops *proc_ops, void *data)
@@ -585,7 +593,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
if (!p)
return NULL;
p->proc_ops = proc_ops;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_data);
@@ -636,7 +643,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -667,7 +673,6 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
- pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 129490151be1..d9b7ef122343 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.free_inode = proc_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
.show_options = proc_show_options,
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 4403a2e20c16..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -12,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
struct dentry *res = ERR_PTR(-ENOENT);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ed86ac710384..1e24e085c7d5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ns_type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3d6d8a9f13fc..b26ae556b446 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -340,8 +340,8 @@ static int proc_maps_open(struct inode *inode, struct file *file,
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR_OR_NULL(priv->mm)) {
- int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
seq_release_private(inode, file);
return err;
@@ -1148,10 +1148,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
struct folio *folio = NULL;
bool present = false;
+ spinlock_t *ptl;
+ pte_t ptent;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ ptent = huge_ptep_get(walk->mm, addr, pte);
if (pte_present(ptent)) {
folio = page_folio(pte_page(ptent));
present = true;
@@ -1170,6 +1173,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
}
+ spin_unlock(ptl);
return 0;
}
#else
@@ -2017,12 +2021,14 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
u64 flags = 0, frame = 0;
+ spinlock_t *ptl;
int err = 0;
pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
struct folio *folio = page_folio(pte_page(pte));
@@ -2050,11 +2056,12 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
err = add_to_pagemap(&pme, pm);
if (err)
- return err;
+ break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
}
+ spin_unlock(ptl);
cond_resched();
return err;
@@ -2410,6 +2417,9 @@ static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
{
struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+ if (!p->vec_buf)
+ return;
+
if (cur_buf->start != addr)
cur_buf->end = addr;
else
@@ -3128,17 +3138,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
+ pte_t huge_pte;
struct numa_maps *md;
struct page *page;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ huge_pte = huge_ptep_get(walk->mm, addr, pte);
if (!pte_present(huge_pte))
- return 0;
+ goto out;
page = pte_page(huge_pte);
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
+out:
+ spin_unlock(ptl);
return 0;
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 1a2e1185426c..b4e55c90f8dc 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc)
static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pstore_evict_inode,
.show_options = pstore_show_options,
};
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..afa15a214538 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -881,7 +881,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
+ queue_delayed_work(system_dfl_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f8874c3b8c1e..41f9995da7ca 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = ramfs_show_options,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index c5b6265d984b..833bae068770 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (len == 0)
return 0;
+ /*
+ * Make sure return value doesn't overflow in 32bit compat mode. Also
+ * limit the size for all cases except when calling ->copy_file_range().
+ */
+ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall())
+ len = min_t(size_t, MAX_RW_COUNT, len);
+
file_start_write(file_out);
/*
@@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
len, flags);
} else if (!splice && file_in->f_op->remap_file_range && samesb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out,
- min_t(loff_t, MAX_RW_COUNT, len),
- REMAP_FILE_CAN_SHORTEN);
+ file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN);
/* fallback to splice */
if (ret <= 0)
splice = true;
@@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* to splicing from input file, while file_start_write() is held on
* the output file on a different sb.
*/
- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- min_t(size_t, len, MAX_RW_COUNT), 0);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
done:
if (ret > 0) {
fsnotify_access(file_in);
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d98e0d2de09f..3c39cfacb251 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
*/
list_for_each_entry(d, &r->mon_domains, hdr.list) {
if (d->ci_id == domid) {
- rr.ci_id = d->ci_id;
cpu = cpumask_any(&d->hdr.cpu_mask);
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci)
continue;
+ rr.ci = ci;
mon_event_read(&rr, r, NULL, rdtgrp,
&ci->shared_cpu_map, evtid, false);
goto checkresult;
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 0a1eedba2b03..9a8cf6f11151 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,7 +98,7 @@ struct mon_data {
* domains in @r sharing L3 @ci.id
* @evtid: Which monitor event to read.
* @first: Initialize MBM counter when true.
- * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
* @err: Error encountered when reading counter.
* @val: Returned value of event counter. If @rgrp is a parent resource group,
* @val includes the sum of event counts from its child resource groups.
@@ -112,7 +112,7 @@ struct rmid_read {
struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
- unsigned int ci_id;
+ struct cacheinfo *ci;
int err;
u64 val;
void *arch_mon_ctx;
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index f5637855c3ac..7326c28a7908 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
int cpu = smp_processor_id();
struct rdt_mon_domain *d;
- struct cacheinfo *ci;
struct mbm_state *m;
int err, ret;
u64 tval = 0;
@@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
}
/* Summing domains that share a cache, must be on a CPU for that cache. */
- ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
- if (!ci || ci->id != rr->ci_id)
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
return -EINVAL;
/*
@@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
ret = -EINVAL;
list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
- if (d->ci_id != rr->ci_id)
+ if (d->ci_id != rr->ci->id)
continue;
err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
rr->evtid, &tval, rr->arch_mon_ctx);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index beb4f18f05ef..2337cf795db3 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -304,6 +304,8 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
cfids = tcon->cfids;
+ if (!cfids)
+ continue;
spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
seq_printf(m, "Num entries: %d\n", cfids->num_entries);
list_for_each_entry(cfid, &cfids->entries, entry) {
@@ -319,8 +321,6 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n");
}
spin_unlock(&cfids->cfid_list_lock);
-
-
}
}
}
@@ -347,6 +347,22 @@ static __always_inline const char *compression_alg_str(__le16 alg)
}
}
+static __always_inline const char *cipher_alg_str(__le16 cipher)
+{
+ switch (cipher) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ return "AES128-CCM";
+ case SMB2_ENCRYPTION_AES128_GCM:
+ return "AES128-GCM";
+ case SMB2_ENCRYPTION_AES256_CCM:
+ return "AES256-CCM";
+ case SMB2_ENCRYPTION_AES256_GCM:
+ return "AES256-GCM";
+ default:
+ return "UNKNOWN";
+ }
+}
+
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct mid_q_entry *mid_entry;
@@ -539,6 +555,11 @@ skip_rdma:
else
seq_puts(m, "disabled (not supported by this server)");
+ /* Show negotiated encryption cipher, even if not required */
+ seq_puts(m, "\nEncryption: ");
+ if (server->cipher_type)
+ seq_printf(m, "Negotiated cipher (%s)", cipher_alg_str(server->cipher_type));
+
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -576,12 +597,8 @@ skip_rdma:
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
- if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) {
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
seq_puts(m, " encrypted");
- /* can help in debugging to show encryption type */
- if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
- seq_puts(m, "(gcm256)");
- }
if (ses->sign)
seq_puts(m, " signed");
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index bc1c1e9b288a..43b86fa4d695 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -124,55 +124,44 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp = description;
/* start with version and hostname portion of UNC string */
spnego_key = ERR_PTR(-EINVAL);
- sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
- hostname);
- dp = description + strlen(description);
+ dp += sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+ hostname);
/* add the server address */
if (server->dstaddr.ss_family == AF_INET)
- sprintf(dp, "ip4=%pI4", &sa->sin_addr);
+ dp += sprintf(dp, "ip4=%pI4", &sa->sin_addr);
else if (server->dstaddr.ss_family == AF_INET6)
- sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
+ dp += sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
else
goto out;
- dp = description + strlen(description);
-
/* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */
if (server->sec_kerberos)
- sprintf(dp, ";sec=krb5");
+ dp += sprintf(dp, ";sec=krb5");
else if (server->sec_mskerberos)
- sprintf(dp, ";sec=mskrb5");
+ dp += sprintf(dp, ";sec=mskrb5");
else if (server->sec_iakerb)
- sprintf(dp, ";sec=iakerb");
+ dp += sprintf(dp, ";sec=iakerb");
else {
cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n");
- sprintf(dp, ";sec=krb5");
+ dp += sprintf(dp, ";sec=krb5");
}
- dp = description + strlen(description);
- sprintf(dp, ";uid=0x%x",
- from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
+ dp += sprintf(dp, ";uid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
- dp = description + strlen(description);
- sprintf(dp, ";creduid=0x%x",
+ dp += sprintf(dp, ";creduid=0x%x",
from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
- if (sesInfo->user_name) {
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->user_name);
- }
+ if (sesInfo->user_name)
+ dp += sprintf(dp, ";user=%s", sesInfo->user_name);
- dp = description + strlen(description);
- sprintf(dp, ";pid=0x%x", current->pid);
+ dp += sprintf(dp, ";pid=0x%x", current->pid);
- if (sesInfo->upcall_target == UPTARGET_MOUNT) {
- dp = description + strlen(description);
- sprintf(dp, ";upcall_target=mount");
- } else {
- dp = description + strlen(description);
- sprintf(dp, ";upcall_target=app");
- }
+ if (sesInfo->upcall_target == UPTARGET_MOUNT)
+ dp += sprintf(dp, ";upcall_target=mount");
+ else
+ dp += sprintf(dp, ";upcall_target=app");
cifs_dbg(FYI, "key description = %s\n", description);
saved_cred = override_creds(spnego_cred);
diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c
index 4cc6e0896fad..f8659d36793f 100644
--- a/fs/smb/client/cifs_unicode.c
+++ b/fs/smb/client/cifs_unicode.c
@@ -629,6 +629,9 @@ cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
int len;
__le16 *dst;
+ if (!src)
+ return NULL;
+
len = cifs_local_to_utf16_bytes(src, maxlen, cp);
len += 2; /* NULL */
dst = kmalloc(len, GFP_KERNEL);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 3bd85ab2deb1..dcb39d1b5958 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode)
/* no serverino => unconditional eviction */
return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
- generic_drop_inode(inode);
+ inode_generic_drop(inode);
}
static const struct super_operations cifs_super_ops = {
@@ -1358,6 +1358,20 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
truncate_setsize(target_inode, new_size);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
+ } else if (rc == -EOPNOTSUPP) {
+ /*
+ * copy_file_range syscall man page indicates EINVAL
+ * is returned e.g when "fd_in and fd_out refer to the
+ * same file and the source and target ranges overlap."
+ * Test generic/157 was what showed these cases where
+ * we need to remap EOPNOTSUPP to EINVAL
+ */
+ if (off >= src_inode->i_size) {
+ rc = -EINVAL;
+ } else if (src_inode == target_inode) {
+ if (off + len > destoff)
+ rc = -EINVAL;
+ }
}
if (rc == 0 && new_size > target_cifsi->netfs.zero_point)
target_cifsi->netfs.zero_point = new_size;
@@ -1881,7 +1895,9 @@ init_cifs(void)
cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n");
}
- cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ cifsiod_wq = alloc_workqueue("cifsiod",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cifsiod_wq) {
rc = -ENOMEM;
goto out_clean_proc;
@@ -1909,28 +1925,32 @@ init_cifs(void)
}
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
goto out_destroy_fileinfo_put_wq;
}
deferredclose_wq = alloc_workqueue("deferredclose",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!deferredclose_wq) {
rc = -ENOMEM;
goto out_destroy_cifsoplockd_wq;
}
serverclose_wq = alloc_workqueue("serverclose",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!serverclose_wq) {
rc = -ENOMEM;
goto out_destroy_deferredclose_wq;
}
cfid_put_wq = alloc_workqueue("cfid_put_wq",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!cfid_put_wq) {
rc = -ENOMEM;
goto out_destroy_serverclose_wq;
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 487f39cff77e..3ce7c614ccc0 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 55
-#define CIFS_VERSION "2.55"
+#define SMB3_PRODUCT_BUILD 56
+#define CIFS_VERSION "2.56"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index e6830ab3a546..0fae95cf81c4 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -87,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 7
+#define MAX_COMPOUND 10
/*
* Default number of credits to keep available for SMB3.
@@ -1732,6 +1732,7 @@ struct mid_q_entry {
int mid_rc; /* rc for MID_RC */
__le16 command; /* smb command code */
unsigned int optype; /* operation type */
+ spinlock_t mid_lock;
bool wait_cancelled:1; /* Cancelled while waiting for response */
bool deleted_from_q:1; /* Whether Mid has been dequeued frem pending_mid_q */
bool large_buf:1; /* if valid response, is pointer to large buf */
@@ -1881,9 +1882,12 @@ static inline bool is_replayable_error(int error)
/* cifs_get_writable_file() flags */
-#define FIND_WR_ANY 0
-#define FIND_WR_FSUID_ONLY 1
-#define FIND_WR_WITH_DELETE 2
+enum cifs_writable_file_flags {
+ FIND_WR_ANY = 0U,
+ FIND_WR_FSUID_ONLY = (1U << 0),
+ FIND_WR_WITH_DELETE = (1U << 1),
+ FIND_WR_NO_PENDING_DELETE = (1U << 2),
+};
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
@@ -2036,6 +2040,9 @@ require use of the stronger protocol */
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
* ->oplock_break_cancelled
+ * mid_q_entry->mid_lock mid_q_entry->callback alloc_mid
+ * smb2_mid_entry_alloc
+ * (Any fields of mid_q_entry that will need protection)
****************************************************************************/
#ifdef DECLARE_GLOBALS_HERE
@@ -2339,6 +2346,8 @@ struct smb2_compound_vars {
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
struct smb2_file_rename_info_hdr rename_info;
struct smb2_file_link_info_hdr link_info;
@@ -2375,6 +2384,23 @@ static inline bool cifs_netbios_name(const char *name, size_t namelen)
return ret;
}
+/*
+ * Execute mid callback atomically - ensures callback runs exactly once
+ * and prevents sleeping in atomic context.
+ */
+static inline void mid_execute_callback(struct mid_q_entry *mid)
+{
+ void (*callback)(struct mid_q_entry *mid);
+
+ spin_lock(&mid->mid_lock);
+ callback = mid->callback;
+ mid->callback = NULL; /* Mark as executed, */
+ spin_unlock(&mid->mid_lock);
+
+ if (callback)
+ callback(mid);
+}
+
#define CIFS_REPARSE_SUPPORT(tcon) \
((tcon)->posix_extensions || \
(le32_to_cpu((tcon)->fsAttrInfo.Attributes) & \
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index c34c533b2efa..e8fba98690ce 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode);
extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
-extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
- const char *path);
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
+ struct dentry *dentry);
extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
const char *path);
diff --git a/fs/smb/client/cifstransport.c b/fs/smb/client/cifstransport.c
index 352dafb888dd..e98b95eff8c9 100644
--- a/fs/smb/client/cifstransport.c
+++ b/fs/smb/client/cifstransport.c
@@ -46,6 +46,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
+ spin_lock_init(&temp->mid_lock);
temp->mid = get_mid(smb_buffer);
temp->pid = current->pid;
temp->command = cpu_to_le16(smb_buffer->Command);
@@ -345,16 +346,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = wait_for_response(server, midQ);
if (rc != 0) {
send_cancel(server, &rqst, midQ);
- spin_lock(&server->mid_queue_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) {
+ spin_lock(&midQ->mid_lock);
+ if (midQ->callback) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
add_credits(server, &credits, 0);
return rc;
}
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
}
rc = cifs_sync_mid_result(midQ, server);
@@ -527,15 +527,14 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = wait_for_response(server, midQ);
if (rc) {
send_cancel(server, &rqst, midQ);
- spin_lock(&server->mid_queue_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) {
+ spin_lock(&midQ->mid_lock);
+ if (midQ->callback) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
return rc;
}
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ->mid_lock);
}
/* We got the response - restart system call. */
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index 766b4de13da7..db709f5cd2e1 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -155,58 +155,29 @@ static int cmp_bkt(const void *_a, const void *_b)
}
/*
- * TODO:
- * Support other iter types, if required.
- * Only ITER_XARRAY is supported for now.
+ * Collect some 2K samples with 2K gaps between.
*/
-static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
+static int collect_sample(const struct iov_iter *source, ssize_t max, u8 *sample)
{
- struct folio *folios[16], *folio;
- unsigned int nr, i, j, npages;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t last, index = start / PAGE_SIZE;
- size_t len, off, foff;
- void *p;
- int s = 0;
-
- last = (start + max - 1) / PAGE_SIZE;
- do {
- nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios),
- XA_PRESENT);
- if (nr == 0)
- return -EIO;
-
- for (i = 0; i < nr; i++) {
- folio = folios[i];
- npages = folio_nr_pages(folio);
- foff = start - folio_pos(folio);
- off = foff % PAGE_SIZE;
-
- for (j = foff / PAGE_SIZE; j < npages; j++) {
- size_t len2;
-
- len = min_t(size_t, max, PAGE_SIZE - off);
- len2 = min_t(size_t, len, SZ_2K);
-
- p = kmap_local_page(folio_page(folio, j));
- memcpy(&sample[s], p, len2);
- kunmap_local(p);
-
- s += len2;
-
- if (len2 < SZ_2K || s >= max - SZ_2K)
- return s;
-
- max -= len;
- if (max <= 0)
- return s;
-
- start += len;
- off = 0;
- index++;
- }
- }
- } while (nr == ARRAY_SIZE(folios));
+ struct iov_iter iter = *source;
+ size_t s = 0;
+
+ while (iov_iter_count(&iter) >= SZ_2K) {
+ size_t part = umin(umin(iov_iter_count(&iter), SZ_2K), max);
+ size_t n;
+
+ n = copy_from_iter(sample + s, part, &iter);
+ if (n != part)
+ return -EFAULT;
+
+ s += n;
+ max -= n;
+
+ if (iov_iter_count(&iter) < PAGE_SIZE - SZ_2K)
+ break;
+
+ iov_iter_advance(&iter, SZ_2K);
+ }
return s;
}
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 587845a2452d..dd12f3eb61dc 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -335,7 +335,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
list_del_init(&mid->qhead);
- mid->callback(mid);
+ mid_execute_callback(mid);
release_mid(mid);
}
@@ -919,7 +919,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
list_del_init(&mid->qhead);
mid->mid_rc = mid_rc;
mid->mid_state = MID_RC;
- mid->callback(mid);
+ mid_execute_callback(mid);
release_mid(mid);
}
@@ -1117,7 +1117,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
- mid_entry->callback(mid_entry);
+ mid_execute_callback(mid_entry);
release_mid(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
@@ -1394,7 +1394,7 @@ next_pdu:
}
if (!mids[i]->multiRsp || mids[i]->multiEnd)
- mids[i]->callback(mids[i]);
+ mid_execute_callback(mids[i]);
release_mid(mids[i]);
} else if (server->ops->is_oplock_break &&
@@ -4205,7 +4205,6 @@ retry:
return 0;
}
- server->lstrp = jiffies;
server->tcpStatus = CifsInNegotiate;
server->neg_start = jiffies;
spin_unlock(&server->srv_lock);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 186e061068be..cb907e18cc35 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -998,7 +998,10 @@ int cifs_open(struct inode *inode, struct file *file)
/* Get the cached handle as SMB2 close is deferred */
if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) {
- rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile);
+ rc = cifs_get_writable_path(tcon, full_path,
+ FIND_WR_FSUID_ONLY |
+ FIND_WR_NO_PENDING_DELETE,
+ &cfile);
} else {
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
@@ -2530,6 +2533,9 @@ refind_writable:
continue;
if (with_delete && !(open_file->fid.access & DELETE))
continue;
+ if ((flags & FIND_WR_NO_PENDING_DELETE) &&
+ open_file->status_file_deleted)
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2647,6 +2653,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
spin_unlock(&tcon->open_file_lock);
free_dentry_path(page);
*ret_file = find_readable_file(cinode, 0);
+ if (*ret_file) {
+ spin_lock(&cinode->open_file_lock);
+ if ((*ret_file)->status_file_deleted) {
+ spin_unlock(&cinode->open_file_lock);
+ cifsFileInfo_put(*ret_file);
+ *ret_file = NULL;
+ } else {
+ spin_unlock(&cinode->open_file_lock);
+ }
+ }
return *ret_file ? 0 : -ENOENT;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 75be4b46bc6f..7e9784080501 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode)
* but will return the EACCES to the caller. Note that the VFS does not call
* unlink on negative dentries currently.
*/
-int cifs_unlink(struct inode *dir, struct dentry *dentry)
+static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename)
{
int rc = 0;
unsigned int xid;
@@ -1943,15 +1943,24 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ __u32 dosattr = 0, origattr = 0;
struct TCP_Server_Info *server;
struct iattr *attrs = NULL;
- __u32 dosattr = 0, origattr = 0;
+ bool rehash = false;
cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
+ /* Unhash dentry in advance to prevent any concurrent opens */
+ spin_lock(&dentry->d_lock);
+ if (!d_unhashed(dentry)) {
+ __d_drop(dentry);
+ rehash = true;
+ }
+ spin_unlock(&dentry->d_lock);
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -1975,7 +1984,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
}
netfs_wait_for_outstanding_io(inode);
- cifs_close_deferred_file_under_dentry(tcon, full_path);
+ cifs_close_deferred_file_under_dentry(tcon, dentry);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -1994,7 +2003,24 @@ retry_std_delete:
goto psx_del_no_retry;
}
- rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
+ /* For SMB2+, if the file is open, we always perform a silly rename.
+ *
+ * We check for d_count() right after calling
+ * cifs_close_deferred_file_under_dentry() to make sure that the
+ * dentry's refcount gets dropped in case the file had any deferred
+ * close.
+ */
+ if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) {
+ spin_lock(&dentry->d_lock);
+ if (d_count(dentry) > 1)
+ sillyrename = true;
+ spin_unlock(&dentry->d_lock);
+ }
+
+ if (sillyrename)
+ rc = -EBUSY;
+ else
+ rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
psx_del_no_retry:
if (!rc) {
@@ -2003,7 +2029,8 @@ psx_del_no_retry:
cifs_drop_nlink(inode);
}
} else if (rc == -ENOENT) {
- d_drop(dentry);
+ if (simple_positive(dentry))
+ d_delete(dentry);
} else if (rc == -EBUSY) {
if (server->ops->rename_pending_delete) {
rc = server->ops->rename_pending_delete(full_path,
@@ -2056,9 +2083,16 @@ unlink_out:
kfree(attrs);
free_xid(xid);
cifs_put_tlink(tlink);
+ if (rehash)
+ d_rehash(dentry);
return rc;
}
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return __cifs_unlink(dir, dentry, false);
+}
+
static int
cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -2346,14 +2380,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
cifs_put_tlink(tlink);
+ cifsInode = CIFS_I(d_inode(direntry));
+
if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
spin_lock(&d_inode(direntry)->i_lock);
i_size_write(d_inode(direntry), 0);
clear_nlink(d_inode(direntry));
spin_unlock(&d_inode(direntry)->i_lock);
}
- cifsInode = CIFS_I(d_inode(direntry));
/* force revalidate to go get info when needed */
cifsInode->time = 0;
@@ -2446,8 +2482,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
- if (rc == 0)
+ if (rc == 0) {
d_move(from_dentry, to_dentry);
+ /* Force a new lookup */
+ d_drop(from_dentry);
+ }
cifs_put_tlink(tlink);
return rc;
}
@@ -2458,10 +2497,12 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
struct dentry *target_dentry, unsigned int flags)
{
const char *from_name, *to_name;
+ struct TCP_Server_Info *server;
void *page1, *page2;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ bool rehash = false;
unsigned int xid;
int rc, tmprc;
int retry_count = 0;
@@ -2477,10 +2518,22 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
+ /*
+ * Prevent any concurrent opens on the target by unhashing the dentry.
+ * VFS already unhashes the target when renaming directories.
+ */
+ if (d_is_positive(target_dentry) && !d_is_dir(target_dentry)) {
+ if (!d_unhashed(target_dentry)) {
+ d_drop(target_dentry);
+ rehash = true;
+ }
+ }
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
page1 = alloc_dentry_path();
page2 = alloc_dentry_path();
@@ -2498,10 +2551,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
goto cifs_rename_exit;
}
- cifs_close_deferred_file_under_dentry(tcon, from_name);
+ cifs_close_deferred_file_under_dentry(tcon, source_dentry);
if (d_inode(target_dentry) != NULL) {
netfs_wait_for_outstanding_io(d_inode(target_dentry));
- cifs_close_deferred_file_under_dentry(tcon, to_name);
+ cifs_close_deferred_file_under_dentry(tcon, target_dentry);
}
rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
@@ -2518,6 +2571,8 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
}
}
+ if (!rc)
+ rehash = false;
/*
* No-replace is the natural behavior for CIFS, so skip unlink hacks.
*/
@@ -2565,23 +2620,61 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
unlink_target:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-
- /* Try unlinking the target dentry if it's not negative */
- if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
- if (d_is_dir(target_dentry))
- tmprc = cifs_rmdir(target_dir, target_dentry);
- else
- tmprc = cifs_unlink(target_dir, target_dentry);
- if (tmprc)
- goto cifs_rename_exit;
- rc = cifs_do_rename(xid, source_dentry, from_name,
- target_dentry, to_name);
+ if (d_really_is_positive(target_dentry)) {
+ if (!rc) {
+ struct inode *inode = d_inode(target_dentry);
+ /*
+ * Samba and ksmbd servers allow renaming a target
+ * directory that is open, so make sure to update
+ * ->i_nlink and then mark it as delete pending.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb);
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, 0);
+ clear_nlink(inode);
+ spin_unlock(&inode->i_lock);
+ set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags);
+ CIFS_I(inode)->time = 0; /* force reval */
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ }
+ } else if (rc == -EACCES || rc == -EEXIST) {
+ /*
+ * Rename failed, possibly due to a busy target.
+ * Retry it by unliking the target first.
+ */
+ if (d_is_dir(target_dentry)) {
+ tmprc = cifs_rmdir(target_dir, target_dentry);
+ } else {
+ tmprc = __cifs_unlink(target_dir, target_dentry,
+ server->vals->protocol_id > SMB10_PROT_ID);
+ }
+ if (tmprc) {
+ /*
+ * Some servers will return STATUS_ACCESS_DENIED
+ * or STATUS_DIRECTORY_NOT_EMPTY when failing to
+ * rename a non-empty directory. Make sure to
+ * propagate the appropriate error back to
+ * userspace.
+ */
+ if (tmprc == -EEXIST || tmprc == -ENOTEMPTY)
+ rc = tmprc;
+ goto cifs_rename_exit;
+ }
+ rc = cifs_do_rename(xid, source_dentry, from_name,
+ target_dentry, to_name);
+ if (!rc)
+ rehash = false;
+ }
}
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
cifs_rename_exit:
+ if (rehash)
+ d_rehash(target_dentry);
kfree(info_buf_source);
free_dentry_path(page2);
free_dentry_path(page1);
@@ -2599,6 +2692,8 @@ cifs_dentry_needs_reval(struct dentry *dentry)
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct cached_fid *cfid = NULL;
+ if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags))
+ return false;
if (cifs_i->time == 0)
return true;
@@ -2749,7 +2844,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
}
cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
- full_path, inode, inode->i_count.counter,
+ full_path, inode, icount_read(inode),
dentry, cifs_get_time(dentry), jiffies);
again:
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index da23cc12a52c..dda6dece802a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
kfree(tmp_list);
}
}
-void
-cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
+
+void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry)
{
- struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
- void *page;
- const char *full_path;
+ struct cifsFileInfo *cfile;
LIST_HEAD(file_head);
- page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
- full_path = build_path_from_dentry(cfile->dentry, page);
- if (strstr(full_path, path)) {
- if (delayed_work_pending(&cfile->deferred)) {
- if (cancel_delayed_work(&cfile->deferred)) {
- spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- cifs_del_deferred_close(cfile);
- spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
-
- tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
- if (tmp_list == NULL)
- break;
- tmp_list->cfile = cfile;
- list_add_tail(&tmp_list->list, &file_head);
- }
- }
+ if ((cfile->dentry == dentry) &&
+ delayed_work_pending(&cfile->deferred) &&
+ cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+ cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
+
+ tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
+ if (tmp_list == NULL)
+ break;
+ tmp_list->cfile = cfile;
+ list_add_tail(&tmp_list->list, &file_head);
}
}
spin_unlock(&tcon->open_file_lock);
@@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
list_del(&tmp_list->list);
kfree(tmp_list);
}
- free_dentry_path(page);
}
/*
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 7869cec58f52..10c84c095fe7 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -278,7 +278,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
}
/*
- * For absolute symlinks it is not possible to determinate
+ * For absolute symlinks it is not possible to determine
* if it should point to directory or file.
*/
if (symname[0] == '/') {
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 893a1ea8c000..a02d41d1ce4a 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -1005,7 +1005,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
rc = -EOPNOTSUPP;
}
- /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */
+ /* Fallback to SMB_COM_SETATTR command when absolutely needed. */
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n");
rc = SMBSetInformation(xid, tcon, full_path,
@@ -1039,7 +1039,7 @@ set_via_filehandle:
cifsFileInfo_put(open_file);
/*
- * Setting the read-only bit is not honered on non-NT servers when done
+ * Setting the read-only bit is not honored on non-NT servers when done
* via open-semantics. So for setting it, use SMB_COM_SETATTR command.
* This command works only after the file is closed, so use it only when
* operation was called without the filehandle.
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 224495322a05..e56e4d402f13 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -30,10 +30,9 @@ enum smb2_compound_ops {
SMB2_OP_QUERY_DIR,
SMB2_OP_MKDIR,
SMB2_OP_RENAME,
- SMB2_OP_DELETE,
SMB2_OP_HARDLINK,
SMB2_OP_SET_EOF,
- SMB2_OP_RMDIR,
+ SMB2_OP_UNLINK,
SMB2_OP_POSIX_QUERY_INFO,
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 2a0316c514e4..0985db9f86e5 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -207,8 +207,10 @@ replay_again:
server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
- if (vars == NULL)
- return -ENOMEM;
+ if (vars == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
rqst = &vars->rqst[0];
rsp_iov = &vars->rsp_iov[0];
@@ -344,9 +346,6 @@ replay_again:
trace_smb3_posix_query_info_compound_enter(xid, tcon->tid,
ses->Suid, full_path);
break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path);
- break;
case SMB2_OP_MKDIR:
/*
* Directories are created through parameters in the
@@ -354,23 +353,40 @@ replay_again:
*/
trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path);
break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ case SMB2_OP_UNLINK:
+ rqst[num_rqst].rq_iov = vars->unlink_iov;
rqst[num_rqst].rq_nvec = 1;
size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
data[0] = &delete_pending[0];
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ }
+ if (!rc && (!cfile || num_rqst > 1)) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ } else if (rc) {
goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path);
+ }
+ num_rqst++;
+ trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path);
break;
case SMB2_OP_SET_EOF:
rqst[num_rqst].rq_iov = &vars->si_iov[0];
@@ -440,7 +456,7 @@ replay_again:
ses->Suid, full_path);
break;
case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_iov = vars->rename_iov;
rqst[num_rqst].rq_nvec = 2;
len = in_iov[i].iov_len;
@@ -671,7 +687,7 @@ finished:
}
for (i = 0; i < num_cmds; i++) {
- char *buf = rsp_iov[i + i].iov_base;
+ char *buf = rsp_iov[i + 1].iov_base;
if (buf && resp_buftype[i + 1] != CIFS_NO_BUFFER)
rc = server->ops->map_error(buf, false);
@@ -730,19 +746,6 @@ finished:
trace_smb3_posix_query_info_compound_done(xid, tcon->tid,
ses->Suid);
break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc);
- else {
- /*
- * If dentry (hence, inode) is NULL, lease break is going to
- * take care of degrading leases on handles for deleted files.
- */
- if (inode)
- cifs_mark_open_handles_for_deleted_file(inode, full_path);
- trace_smb3_delete_done(xid, tcon->tid, ses->Suid);
- }
- break;
case SMB2_OP_MKDIR:
if (rc)
trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc);
@@ -763,11 +766,11 @@ finished:
trace_smb3_rename_done(xid, tcon->tid, ses->Suid);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc);
+ case SMB2_OP_UNLINK:
+ if (!rc)
+ trace_smb3_unlink_done(xid, tcon->tid, ses->Suid);
else
- trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid);
+ trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc);
SMB2_set_info_free(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
@@ -864,6 +867,7 @@ finished:
smb2_should_replay(tcon, &retries, &cur_sleep))
goto replay_again;
+out:
if (cfile)
cifsFileInfo_put(cfile);
@@ -1163,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
return smb2_compound_op(xid, tcon, cifs_sb,
name, &oparms, NULL,
- &(int){SMB2_OP_RMDIR}, 1,
+ &(int){SMB2_OP_UNLINK}, 1,
NULL, NULL, NULL, NULL);
}
@@ -1171,21 +1175,107 @@ int
smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb, struct dentry *dentry)
{
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ __le16 *utf16_path __free(kfree) = NULL;
+ int retries = 0, cur_sleep = 1;
+ struct TCP_Server_Info *server;
struct cifs_open_parms oparms;
+ struct smb2_create_req *creq;
+ struct inode *inode = NULL;
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec close_iov;
+ int resp_buftype[2];
+ struct cifs_fid fid;
+ int flags = 0;
+ __u8 oplock;
+ int rc;
- oparms = CIFS_OPARMS(cifs_sb, tcon, name,
- DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE);
- int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, dentry);
- if (rc == -EINVAL) {
- cifs_dbg(FYI, "invalid lease key, resending request without lease");
- rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
- NULL, &(int){SMB2_OP_DELETE}, 1,
- NULL, NULL, NULL, NULL);
+ utf16_path = cifs_convert_path_to_utf16(name, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+again:
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(tcon->ses);
+
+ memset(rqst, 0, sizeof(rqst));
+ memset(resp_buftype, 0, sizeof(resp_buftype));
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = ARRAY_SIZE(open_iov);
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE | FILE_READ_ATTRIBUTES,
+ FILE_OPEN, CREATE_DELETE_ON_CLOSE |
+ OPEN_REPARSE_POINT, ACL_NO_MODE);
+ oparms.fid = &fid;
+
+ if (dentry) {
+ inode = d_inode(dentry);
+ if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) {
+ oplock = SMB2_OPLOCK_LEVEL_LEASE;
+ server->ops->get_lease_key(inode, &fid);
+ }
+ }
+
+ rc = SMB2_open_init(tcon, server,
+ &rqst[0], &oplock, &oparms, utf16_path);
+ if (rc)
+ goto err_free;
+ smb2_set_next_command(tcon, &rqst[0]);
+ creq = rqst[0].rq_iov[0].iov_base;
+ creq->ShareAccess = FILE_SHARE_DELETE_LE;
+
+ rqst[1].rq_iov = &close_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_close_init(tcon, server, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID, false);
+ smb2_set_related(&rqst[1]);
+ if (rc)
+ goto err_free;
+
+ if (retries) {
+ for (int i = 0; i < ARRAY_SIZE(rqst); i++)
+ smb2_set_replay(server, &rqst[i]);
}
+
+ rc = compound_send_recv(xid, tcon->ses, server, flags,
+ ARRAY_SIZE(rqst), rqst,
+ resp_buftype, rsp_iov);
+ SMB2_open_free(&rqst[0]);
+ SMB2_close_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto again;
+
+ /* Retry compound request without lease */
+ if (rc == -EINVAL && dentry) {
+ dentry = NULL;
+ retries = 0;
+ cur_sleep = 1;
+ goto again;
+ }
+ /*
+ * If dentry (hence, inode) is NULL, lease break is going to
+ * take care of degrading leases on handles for deleted files.
+ */
+ if (!rc && inode)
+ cifs_mark_open_handles_for_deleted_file(inode, name);
+
+ return rc;
+
+err_free:
+ SMB2_open_free(&rqst[0]);
+ SMB2_close_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -1438,3 +1528,113 @@ out:
cifs_free_open_info(&data);
return rc;
}
+
+static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb,
+ const char *name, size_t namelen)
+{
+ int len;
+
+ if (*name == '\\' ||
+ (cifs_sb_master_tlink(cifs_sb) &&
+ cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/'))
+ name++;
+ return cifs_strndup_to_utf16(name, namelen, &len,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+}
+
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry));
+ __le16 *utf16_path __free(kfree) = NULL;
+ __u32 co = file_create_options(dentry);
+ int cmds[] = {
+ SMB2_OP_SET_INFO,
+ SMB2_OP_RENAME,
+ SMB2_OP_UNLINK,
+ };
+ const int num_cmds = ARRAY_SIZE(cmds);
+ char *to_name __free(kfree) = NULL;
+ __u32 attrs = cinode->cifsAttrs;
+ struct cifs_open_parms oparms;
+ static atomic_t sillycounter;
+ struct cifsFileInfo *cfile;
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
+ struct kvec iov[2];
+ const char *ppath;
+ void *page;
+ size_t len;
+ int rc;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ page = alloc_dentry_path();
+
+ ppath = build_path_from_dentry(dentry->d_parent, page);
+ if (IS_ERR(ppath)) {
+ rc = PTR_ERR(ppath);
+ goto out;
+ }
+
+ len = strlen(ppath) + strlen("/.__smb1234") + 1;
+ to_name = kmalloc(len, GFP_KERNEL);
+ if (!to_name) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb),
+ atomic_inc_return(&sillycounter) & 0xffff);
+
+ utf16_path = utf16_smb2_path(cifs_sb, to_name, len);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ DELETE | FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, co, ACL_NO_MODE);
+
+ attrs &= ~ATTR_READONLY;
+ if (!attrs)
+ attrs = ATTR_NORMAL;
+ if (d_inode(dentry)->i_nlink <= 1)
+ attrs |= ATTR_HIDDEN;
+ iov[0].iov_base = &(FILE_BASIC_INFO) {
+ .Attributes = cpu_to_le32(attrs),
+ };
+ iov[0].iov_len = sizeof(FILE_BASIC_INFO);
+ iov[1].iov_base = utf16_path;
+ iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path);
+
+ cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, dentry);
+ if (rc == -EINVAL) {
+ cifs_dbg(FYI, "invalid lease key, resending request without lease\n");
+ cifs_get_writable_path(tcon, full_path,
+ FIND_WR_WITH_DELETE, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov,
+ cmds, num_cmds, cfile, NULL, NULL, NULL);
+ }
+ if (!rc) {
+ set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags);
+ } else {
+ cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n",
+ __func__, full_path, to_name, rc);
+ rc = -EIO;
+ }
+out:
+ cifs_put_tlink(tlink);
+ free_dentry_path(page);
+ return rc;
+}
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index cddf273c14ae..89d933b4a8bc 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -614,6 +614,15 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
struct cifs_tcon *tcon;
struct cifs_pending_open *open;
+ /* Trace receipt of lease break request from server */
+ trace_smb3_lease_break_enter(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
cifs_dbg(FYI, "Checking for lease break\n");
/* If server is a channel, select the primary channel */
@@ -660,10 +669,12 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
- le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
- le64_to_cpu(rsp->hdr.SessionId),
- *((u64 *)rsp->LeaseKey),
- *((u64 *)&rsp->LeaseKey[8]));
+ le32_to_cpu(rsp->Flags),
+ le16_to_cpu(rsp->Epoch),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
return false;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index ad8947434b71..e586f3f4b5c9 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -772,6 +772,13 @@ next_iface:
bytes_left -= sizeof(*p);
break;
}
+ /* Validate that Next doesn't point beyond the buffer */
+ if (next > bytes_left) {
+ cifs_dbg(VFS, "%s: invalid Next pointer %zu > %zd\n",
+ __func__, next, bytes_left);
+ rc = -EINVAL;
+ goto out;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
@@ -783,7 +790,9 @@ next_iface:
}
/* Azure rounds the buffer size up 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
+ if ((bytes_left > 8) ||
+ (bytes_left >= offsetof(struct network_interface_info_ioctl_rsp, Next)
+ + sizeof(p->Next) && p->Next))
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
ses->iface_last_update = jiffies;
@@ -2631,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/* SMB headers in a compound are 8 byte aligned. */
- if (!IS_ALIGNED(len, 8)) {
- num_padding = 8 - (len & 7);
+ if (IS_ALIGNED(len, 8))
+ goto out;
+
+ num_padding = 8 - (len & 7);
+ if (smb3_encryption_required(tcon)) {
+ int i;
+
+ /*
+ * Flatten request into a single buffer with required padding as
+ * the encryption layer can't handle the padding iovs.
+ */
+ for (i = 1; i < rqst->rq_nvec; i++) {
+ memcpy(rqst->rq_iov[0].iov_base +
+ rqst->rq_iov[0].iov_len,
+ rqst->rq_iov[i].iov_base,
+ rqst->rq_iov[i].iov_len);
+ rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len;
+ }
+ memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len,
+ 0, num_padding);
+ rqst->rq_iov[0].iov_len += num_padding;
+ rqst->rq_nvec = 1;
+ } else {
rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding;
rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding;
rqst->rq_nvec++;
- len += num_padding;
}
+ len += num_padding;
+out:
shdr->NextCommand = cpu_to_le32(len);
}
@@ -4487,7 +4518,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
for (int i = 1; i < num_rqst; i++) {
struct smb_rqst *old = &old_rq[i - 1];
struct smb_rqst *new = &new_rq[i];
- struct folio_queue *buffer;
+ struct folio_queue *buffer = NULL;
size_t size = iov_iter_count(&old->rq_iter);
orig_len += smb_rqst_len(server, old);
@@ -4805,7 +4836,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
dw->server->ops->is_network_name_deleted(dw->buf,
dw->server);
- mid->callback(mid);
+ mid_execute_callback(mid);
} else {
spin_lock(&dw->server->srv_lock);
if (dw->server->tcpStatus == CifsNeedReconnect) {
@@ -4813,7 +4844,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->mid_state = MID_RETRY_NEEDED;
spin_unlock(&dw->server->mid_queue_lock);
spin_unlock(&dw->server->srv_lock);
- mid->callback(mid);
+ mid_execute_callback(mid);
} else {
spin_lock(&dw->server->mid_queue_lock);
mid->mid_state = MID_REQUEST_SUBMITTED;
@@ -5367,6 +5398,7 @@ struct smb_version_operations smb20_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -5472,6 +5504,7 @@ struct smb_version_operations smb21_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb30_operations = {
@@ -5588,6 +5621,7 @@ struct smb_version_operations smb30_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
struct smb_version_operations smb311_operations = {
@@ -5704,6 +5738,7 @@ struct smb_version_operations smb311_operations = {
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
+ .rename_pending_delete = smb2_rename_pending_delete,
};
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2df93a75e3b8..c3b9d3f6210f 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -6192,11 +6192,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
please_key_high = (__u64 *)(lease_key+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
- trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_err(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
} else
- trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ trace_smb3_lease_ack_done(le32_to_cpu(lease_state), tcon->tid,
ses->Suid, *please_key_low, *please_key_high);
return rc;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 6e805ece6a7b..b3f1398c9f79 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
+int smb2_rename_pending_delete(const char *full_path,
+ struct dentry *dentry,
+ const unsigned int xid);
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index ff9ef7fcd010..bc0e92eb2b64 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -771,6 +771,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
+ spin_lock_init(&temp->mid_lock);
temp->mid = le64_to_cpu(shdr->MessageId);
temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index c628e91c328b..e0fce5033004 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -453,9 +453,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
struct smbdirect_recv_io *response =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = response->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbd_connection *info =
container_of(sc, struct smbd_connection, socket);
- int data_length = 0;
+ u32 data_offset = 0;
+ u32 data_length = 0;
+ u32 remaining_data_length = 0;
log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
response, sc->recv_io.expected, wc->status, wc->opcode,
@@ -487,7 +490,22 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
/* SMBD data transfer packet */
case SMBDIRECT_EXPECT_DATA_TRANSFER:
data_transfer = smbdirect_recv_io_payload(response);
+
+ if (wc->byte_len <
+ offsetof(struct smbdirect_data_transfer, padding))
+ goto error;
+
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
+ data_offset = le32_to_cpu(data_transfer->data_offset);
data_length = le32_to_cpu(data_transfer->data_length);
+ if (wc->byte_len < data_offset ||
+ (u64)wc->byte_len < (u64)data_offset + data_length)
+ goto error;
+
+ if (remaining_data_length > sp->max_fragmented_recv_size ||
+ data_length > sp->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)
+ goto error;
if (data_length) {
if (sc->recv_io.reassembly.full_packet_received)
@@ -1090,8 +1108,10 @@ static int smbd_negotiate(struct smbd_connection *info)
log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
response->sge.length, response->sge.lkey);
- if (rc)
+ if (rc) {
+ put_receive_buffer(info, response);
return rc;
+ }
init_completion(&info->negotiate_completion);
info->negotiate_done = false;
@@ -1329,17 +1349,16 @@ void smbd_destroy(struct TCP_Server_Info *server)
sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
}
+ log_rdma_event(INFO, "cancelling post_send_credits_work\n");
+ disable_work_sync(&info->post_send_credits_work);
+
log_rdma_event(INFO, "destroying qp\n");
ib_drain_qp(sc->ib.qp);
rdma_destroy_qp(sc->rdma.cm_id);
sc->ib.qp = NULL;
log_rdma_event(INFO, "cancelling idle timer\n");
- cancel_delayed_work_sync(&info->idle_timer_work);
-
- log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
- wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0);
+ disable_delayed_work_sync(&info->idle_timer_work);
/* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
@@ -1712,7 +1731,7 @@ allocate_mr_failed:
return NULL;
negotiation_failed:
- cancel_delayed_work_sync(&info->idle_timer_work);
+ disable_delayed_work_sync(&info->idle_timer_work);
destroy_caches_and_workqueue(info);
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
rdma_disconnect(sc->rdma.cm_id);
@@ -1986,7 +2005,11 @@ int smbd_send(struct TCP_Server_Info *server,
*/
wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0);
+ atomic_read(&info->send_pending) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
+ rc = -EAGAIN;
return rc;
}
@@ -2067,7 +2090,7 @@ static void destroy_mr_list(struct smbd_connection *info)
struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *mr, *tmp;
- cancel_work_sync(&info->mr_recovery_work);
+ disable_work_sync(&info->mr_recovery_work);
list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
if (mr->state == MR_INVALIDATED)
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 93e5b2bb9f28..fd650e2afc76 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter);
-DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
@@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
-DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
@@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
-DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
@@ -1171,8 +1168,54 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
__u64 lease_key_high), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
-DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_ack_done);
+/* Tracepoint when a lease break request is received/entered (includes epoch and flags) */
+DECLARE_EVENT_CLASS(smb3_lease_enter_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 flags,
+ __u16 epoch,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, flags)
+ __field(__u16, epoch)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->flags = flags;
+ __entry->epoch = epoch;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x flags=0x%x epoch=%u",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->flags, __entry->epoch)
+)
+
+#define DEFINE_SMB3_LEASE_ENTER_EVENT(name) \
+DEFINE_EVENT(smb3_lease_enter_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 flags, \
+ __u16 epoch, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, flags, epoch, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_break_enter);
+/* Lease not found: reuse lease_enter payload (includes epoch and flags) */
+DEFINE_SMB3_LEASE_ENTER_EVENT(lease_not_found);
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
@@ -1213,7 +1256,7 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
int rc), \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
-DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_ack_err);
DECLARE_EVENT_CLASS(smb3_connect_class,
TP_PROTO(char *hostname,
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 32d528b4dd83..a61ba7f3fb86 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -1005,15 +1005,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
- spin_lock(&server->mid_queue_lock);
+ spin_lock(&midQ[i]->mid_lock);
midQ[i]->wait_cancelled = true;
- if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED ||
- midQ[i]->mid_state == MID_RESPONSE_RECEIVED) {
+ if (midQ[i]->callback) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
}
- spin_unlock(&server->mid_queue_lock);
+ spin_unlock(&midQ[i]->mid_lock);
}
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 3f04a2977ba8..67c4f73398df 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -504,7 +504,8 @@ void ksmbd_conn_transport_destroy(void)
{
mutex_lock(&init_lock);
ksmbd_tcp_destroy();
- ksmbd_rdma_destroy();
+ ksmbd_rdma_stop_listening();
stop_sessions();
+ ksmbd_rdma_destroy();
mutex_unlock(&init_lock);
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 31dd1caac1e8..2aa8084bb593 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -46,7 +46,12 @@ struct ksmbd_conn {
struct mutex srv_mutex;
int status;
unsigned int cli_cap;
- __be32 inet_addr;
+ union {
+ __be32 inet_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ u8 inet6_addr[16];
+#endif
+ };
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 72b00ca6e455..4a71f46d7020 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -78,7 +78,7 @@ int ksmbd_work_pool_init(void)
int ksmbd_workqueue_init(void)
{
- ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0);
+ ksmbd_wq = alloc_workqueue("ksmbd-io", WQ_PERCPU, 0);
if (!ksmbd_wq)
return -ENOMEM;
return 0;
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index d7a8a580d013..a04d5702820d 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -1102,8 +1102,10 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- if (ksmbd_conn_releasing(opinfo->conn))
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ opinfo_put(opinfo);
continue;
+ }
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL);
opinfo_put(opinfo);
@@ -1139,8 +1141,11 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- if (ksmbd_conn_releasing(opinfo->conn))
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ opinfo_put(opinfo);
continue;
+ }
+
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL);
opinfo_put(opinfo);
}
@@ -1343,8 +1348,10 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&brk_op->refcount))
continue;
- if (ksmbd_conn_releasing(brk_op->conn))
+ if (ksmbd_conn_releasing(brk_op->conn)) {
+ opinfo_put(brk_op);
continue;
+ }
if (brk_op->is_lease && (brk_op->o_lease->state &
(~(SMB2_LEASE_READ_CACHING_LE |
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0d92ce49aed7..a565fc36cee6 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -2951,18 +2951,19 @@ int smb2_open(struct ksmbd_work *work)
}
ksmbd_debug(SMB, "converted name = %s\n", name);
- if (strchr(name, ':')) {
- if (!test_share_config_flag(work->tcon->share_conf,
- KSMBD_SHARE_FLAG_STREAMS)) {
- rc = -EBADF;
- goto err_out2;
- }
- rc = parse_stream_name(name, &stream_name, &s_type);
- if (rc < 0)
- goto err_out2;
- }
if (posix_ctxt == false) {
+ if (strchr(name, ':')) {
+ if (!test_share_config_flag(work->tcon->share_conf,
+ KSMBD_SHARE_FLAG_STREAMS)) {
+ rc = -EBADF;
+ goto err_out2;
+ }
+ rc = parse_stream_name(name, &stream_name, &s_type);
+ if (rc < 0)
+ goto err_out2;
+ }
+
rc = ksmbd_validate_filename(name);
if (rc < 0)
goto err_out2;
@@ -3443,6 +3444,8 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
+ fp->is_posix_ctxt = posix_ctxt;
+
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
* initialized.
@@ -5988,7 +5991,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (IS_ERR(new_name))
return PTR_ERR(new_name);
- if (strchr(new_name, ':')) {
+ if (fp->is_posix_ctxt == false && strchr(new_name, ':')) {
int s_type;
char *xattr_stream_name, *stream_name = NULL;
size_t xattr_stream_size;
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 8d366db5f605..5df138fc6004 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -148,7 +148,7 @@ struct smb_direct_transport {
wait_queue_head_t wait_send_pending;
atomic_t send_pending;
- struct delayed_work post_recv_credits_work;
+ struct work_struct post_recv_credits_work;
struct work_struct send_immediate_work;
struct work_struct disconnect_work;
@@ -367,8 +367,8 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
spin_lock_init(&t->lock_new_recv_credits);
- INIT_DELAYED_WORK(&t->post_recv_credits_work,
- smb_direct_post_recv_credits);
+ INIT_WORK(&t->post_recv_credits_work,
+ smb_direct_post_recv_credits);
INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work);
INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work);
@@ -399,9 +399,9 @@ static void free_transport(struct smb_direct_transport *t)
wait_event(t->wait_send_pending,
atomic_read(&t->send_pending) == 0);
- cancel_work_sync(&t->disconnect_work);
- cancel_delayed_work_sync(&t->post_recv_credits_work);
- cancel_work_sync(&t->send_immediate_work);
+ disable_work_sync(&t->disconnect_work);
+ disable_work_sync(&t->post_recv_credits_work);
+ disable_work_sync(&t->send_immediate_work);
if (t->qp) {
ib_drain_qp(t->qp);
@@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
- unsigned int data_length;
+ u32 remaining_data_length, data_offset, data_length;
int avail_recvmsg_count, receive_credits;
if (wc->byte_len <
@@ -564,15 +564,25 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
+ remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
data_length = le32_to_cpu(data_transfer->data_length);
- if (data_length) {
- if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
- (u64)data_length) {
- put_recvmsg(t, recvmsg);
- smb_direct_disconnect_rdma_connection(t);
- return;
- }
+ data_offset = le32_to_cpu(data_transfer->data_offset);
+ if (wc->byte_len < data_offset ||
+ wc->byte_len < (u64)data_offset + data_length) {
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
+ return;
+ }
+ if (remaining_data_length > t->max_fragmented_recv_size ||
+ data_length > t->max_fragmented_recv_size ||
+ (u64)remaining_data_length + (u64)data_length >
+ (u64)t->max_fragmented_recv_size) {
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
+ return;
+ }
+ if (data_length) {
if (t->full_packet_received)
recvmsg->first_segment = true;
@@ -605,8 +615,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
wake_up_interruptible(&t->wait_send_credits);
if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count))
- mod_delayed_work(smb_direct_wq,
- &t->post_recv_credits_work, 0);
+ queue_work(smb_direct_wq, &t->post_recv_credits_work);
if (data_length) {
enqueue_reassembly(t, recvmsg, (int)data_length);
@@ -763,8 +772,7 @@ again:
st->count_avail_recvmsg += queue_removed;
if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) {
spin_unlock(&st->receive_credit_lock);
- mod_delayed_work(smb_direct_wq,
- &st->post_recv_credits_work, 0);
+ queue_work(smb_direct_wq, &st->post_recv_credits_work);
} else {
spin_unlock(&st->receive_credit_lock);
}
@@ -791,7 +799,7 @@ read_rfc1002_done:
static void smb_direct_post_recv_credits(struct work_struct *work)
{
struct smb_direct_transport *t = container_of(work,
- struct smb_direct_transport, post_recv_credits_work.work);
+ struct smb_direct_transport, post_recv_credits_work);
struct smb_direct_recvmsg *recvmsg;
int receive_credits, credits = 0;
int ret;
@@ -1209,78 +1217,130 @@ static int smb_direct_writev(struct ksmbd_transport *t,
bool need_invalidate, unsigned int remote_key)
{
struct smb_direct_transport *st = smb_trans_direct_transfort(t);
- int remaining_data_length;
- int start, i, j;
- int max_iov_size = st->max_send_size -
+ size_t remaining_data_length;
+ size_t iov_idx;
+ size_t iov_ofs;
+ size_t max_iov_size = st->max_send_size -
sizeof(struct smb_direct_data_transfer);
int ret;
- struct kvec vec;
struct smb_direct_send_ctx send_ctx;
+ int error = 0;
if (st->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
//FIXME: skip RFC1002 header..
+ if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
+ return -EINVAL;
buflen -= 4;
+ iov_idx = 1;
+ iov_ofs = 0;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 1;
- buflen = 0;
- while (true) {
- buflen += iov[i].iov_len;
- if (buflen > max_iov_size) {
- if (i > start) {
- remaining_data_length -=
- (buflen - iov[i].iov_len);
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
+ while (remaining_data_length) {
+ struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */
+ size_t possible_bytes = max_iov_size;
+ size_t possible_vecs;
+ size_t bytes = 0;
+ size_t nvecs = 0;
+
+ /*
+ * For the last message remaining_data_length should be
+ * have been 0 already!
+ */
+ if (WARN_ON_ONCE(iov_idx >= niovs)) {
+ error = -EINVAL;
+ goto done;
+ }
+
+ /*
+ * We have 2 factors which limit the arguments we pass
+ * to smb_direct_post_send_data():
+ *
+ * 1. The number of supported sges for the send,
+ * while one is reserved for the smbdirect header.
+ * And we currently need one SGE per page.
+ * 2. The number of negotiated payload bytes per send.
+ */
+ possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
+
+ while (iov_idx < niovs && possible_vecs && possible_bytes) {
+ struct kvec *v = &vecs[nvecs];
+ int page_count;
+
+ v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
+ v->iov_len = min_t(size_t,
+ iov[iov_idx].iov_len - iov_ofs,
+ possible_bytes);
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (page_count > possible_vecs) {
+ /*
+ * If the number of pages in the buffer
+ * is to much (because we currently require
+ * one SGE per page), we need to limit the
+ * length.
+ *
+ * We know possible_vecs is at least 1,
+ * so we always keep the first page.
+ *
+ * We need to calculate the number extra
+ * pages (epages) we can also keep.
+ *
+ * We calculate the number of bytes in the
+ * first page (fplen), this should never be
+ * larger than v->iov_len because page_count is
+ * at least 2, but adding a limitation feels
+ * better.
+ *
+ * Then we calculate the number of bytes (elen)
+ * we can keep for the extra pages.
+ */
+ size_t epages = possible_vecs - 1;
+ size_t fpofs = offset_in_page(v->iov_base);
+ size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
+ size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
+
+ v->iov_len = fplen + elen;
+ page_count = get_buf_page_count(v->iov_base, v->iov_len);
+ if (WARN_ON_ONCE(page_count > possible_vecs)) {
+ /*
+ * Something went wrong in the above
+ * logic...
+ */
+ error = -EINVAL;
goto done;
- } else {
- /* iov[start] is too big, break it */
- int nvec = (buflen + max_iov_size - 1) /
- max_iov_size;
-
- for (j = 0; j < nvec; j++) {
- vec.iov_base =
- (char *)iov[start].iov_base +
- j * max_iov_size;
- vec.iov_len =
- min_t(int, max_iov_size,
- buflen - max_iov_size * j);
- remaining_data_length -= vec.iov_len;
- ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1,
- remaining_data_length);
- if (ret)
- goto done;
}
- i++;
- if (i == niovs)
- break;
}
- start = i;
- buflen = 0;
- } else {
- i++;
- if (i == niovs) {
- /* send out all remaining vecs */
- remaining_data_length -= buflen;
- ret = smb_direct_post_send_data(st, &send_ctx,
- &iov[start], i - start,
- remaining_data_length);
- if (ret)
- goto done;
- break;
+ possible_vecs -= page_count;
+ nvecs += 1;
+ possible_bytes -= v->iov_len;
+ bytes += v->iov_len;
+
+ iov_ofs += v->iov_len;
+ if (iov_ofs >= iov[iov_idx].iov_len) {
+ iov_idx += 1;
+ iov_ofs = 0;
}
}
+
+ remaining_data_length -= bytes;
+
+ ret = smb_direct_post_send_data(st, &send_ctx,
+ vecs, nvecs,
+ remaining_data_length);
+ if (unlikely(ret)) {
+ error = ret;
+ goto done;
+ }
}
done:
ret = smb_direct_flush_send_list(st, &send_ctx, true);
+ if (unlikely(!ret && error))
+ ret = error;
/*
* As an optimization, we don't wait for individual I/O to finish
@@ -1672,7 +1732,7 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t)
goto out_err;
}
- smb_direct_post_recv_credits(&t->post_recv_credits_work.work);
+ smb_direct_post_recv_credits(&t->post_recv_credits_work);
return 0;
out_err:
put_recvmsg(t, recvmsg);
@@ -1744,6 +1804,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
return -EINVAL;
}
+ if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
+ pr_err("warning: device max_send_sge = %d too small\n",
+ device->attrs.max_send_sge);
+ return -EINVAL;
+ }
if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
@@ -1767,7 +1832,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_wr = max_send_wrs;
cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = max_sge_per_wr;
+ cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
cap->max_rdma_ctxs = t->max_rw_credits;
@@ -2177,7 +2242,8 @@ int ksmbd_rdma_init(void)
* for lack of credits
*/
smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
- WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
if (!smb_direct_wq)
return -ENOMEM;
@@ -2194,7 +2260,7 @@ int ksmbd_rdma_init(void)
return 0;
}
-void ksmbd_rdma_destroy(void)
+void ksmbd_rdma_stop_listening(void)
{
if (!smb_direct_listener.cm_id)
return;
@@ -2203,7 +2269,10 @@ void ksmbd_rdma_destroy(void)
rdma_destroy_id(smb_direct_listener.cm_id);
smb_direct_listener.cm_id = NULL;
+}
+void ksmbd_rdma_destroy(void)
+{
if (smb_direct_wq) {
destroy_workqueue(smb_direct_wq);
smb_direct_wq = NULL;
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index 77aee4e5c9dc..a2291b77488a 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -54,13 +54,15 @@ struct smb_direct_data_transfer {
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
+void ksmbd_rdma_stop_listening(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
void init_smbd_max_io_size(unsigned int sz);
unsigned int get_smbd_max_read_write_size(void);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
-static inline int ksmbd_rdma_destroy(void) { return 0; }
+static inline void ksmbd_rdma_stop_listening(void) { }
+static inline void ksmbd_rdma_destroy(void) { }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
static inline void init_smbd_max_io_size(unsigned int sz) { }
static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index b1df02e321b0..4337df97987d 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -85,7 +85,14 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
return NULL;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16);
+ else
+ conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+#else
conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+#endif
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
@@ -229,7 +236,6 @@ static int ksmbd_kthread_fn(void *p)
{
struct socket *client_sk = NULL;
struct interface *iface = (struct interface *)p;
- struct inet_sock *csk_inet;
struct ksmbd_conn *conn;
int ret;
@@ -252,13 +258,27 @@ static int ksmbd_kthread_fn(void *p)
/*
* Limits repeated connections from clients with the same IP.
*/
- csk_inet = inet_sk(client_sk->sk);
down_read(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list)
- if (csk_inet->inet_daddr == conn->inet_addr) {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6) {
+ if (memcmp(&client_sk->sk->sk_v6_daddr,
+ &conn->inet6_addr, 16) == 0) {
+ ret = -EAGAIN;
+ break;
+ }
+ } else if (inet_sk(client_sk->sk)->inet_daddr ==
+ conn->inet_addr) {
+ ret = -EAGAIN;
+ break;
+ }
+#else
+ if (inet_sk(client_sk->sk)->inet_daddr ==
+ conn->inet_addr) {
ret = -EAGAIN;
break;
}
+#endif
up_read(&conn_list_lock);
if (ret == -EAGAIN)
continue;
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 0708155b5caf..78b506c5ef03 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -112,6 +112,8 @@ struct ksmbd_file {
bool is_durable;
bool is_persistent;
bool is_resilient;
+
+ bool is_posix_ctxt;
};
static inline void set_ctx_actor(struct dir_context *ctx,
diff --git a/fs/splice.c b/fs/splice.c
index 4d6df083e0c0..f5094b6d00a0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -739,6 +739,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
+ WARN_ONCE(ret > sd.total_len - left,
+ "Splice Exceeded! ret=%zd tot=%zu left=%zu\n",
+ ret, sd.total_len, left);
sd.num_spliced += ret;
sd.total_len -= ret;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 992ea0e37257..4465cf05603a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -187,10 +187,15 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
unsigned short flags;
unsigned int fragments;
u64 lookup_table_start, xattr_id_table_start, next_table;
- int err;
+ int err, devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
TRACE("Entered squashfs_fill_superblock\n");
+ if (!devblksize) {
+ errorf(fc, "squashfs: unable to set blocksize\n");
+ return -EINVAL;
+ }
+
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
@@ -201,12 +206,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
- msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
- if (!msblk->devblksize) {
- errorf(fc, "squashfs: unable to set blocksize\n");
- return -EINVAL;
- }
-
+ msblk->devblksize = devblksize;
msblk->devblksize_log2 = ffz(~msblk->devblksize);
mutex_init(&msblk->meta_index_mutex);
diff --git a/fs/super.c b/fs/super.c
index 7f876f32343a..f4fa0e93c463 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc,
}
EXPORT_SYMBOL(get_tree_bdev);
-static int test_bdev_super(struct super_block *s, void *data)
-{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
-}
-
-struct dentry *mount_bdev(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- struct super_block *s;
- int error;
- dev_t dev;
-
- error = lookup_bdev(dev_name, &dev);
- if (error)
- return ERR_PTR(error);
-
- flags |= SB_NOSEC;
- s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- if (s->s_root) {
- if ((flags ^ s->s_flags) & SB_RDONLY) {
- deactivate_locked_super(s);
- return ERR_PTR(-EBUSY);
- }
- } else {
- error = setup_bdev_super(s, flags, NULL);
- if (!error)
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- s->s_flags |= SB_ACTIVE;
- }
-
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_bdev);
-
void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;
@@ -1773,26 +1730,6 @@ void kill_block_super(struct super_block *sb)
EXPORT_SYMBOL(kill_block_super);
#endif
-struct dentry *mount_nodev(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- int error;
- struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
-
- if (IS_ERR(s))
- return ERR_CAST(s);
-
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- s->s_flags |= SB_ACTIVE;
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_nodev);
-
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
@@ -2314,17 +2251,20 @@ int sb_init_dio_done_wq(struct super_block *sb)
{
struct workqueue_struct *old;
struct workqueue_struct *wq = alloc_workqueue("dio/%s",
- WQ_MEM_RECLAIM, 0,
+ WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0,
sb->s_id);
if (!wq)
return -ENOMEM;
+
+ old = NULL;
/*
* This has to be atomic as more DIOs can race to create the workqueue
*/
- old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
- /* Someone created workqueue before us? Free ours... */
- if (old)
+ if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) {
+ /* Someone created workqueue before us? Free ours... */
destroy_workqueue(wq);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index fb5ac358077b..0b14d004a095 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
}
const struct fscrypt_operations ubifs_crypt_operations = {
+ .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) -
+ (int)offsetof(struct ubifs_inode, vfs_inode),
.legacy_key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f3e3b2068608..46952a33c4e6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
static int ubifs_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
@@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode)
goto out;
dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
- ubifs_assert(c, !atomic_read(&inode->i_count));
+ ubifs_assert(c, !icount_read(inode));
truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5db45c9e26ee..49e50431741c 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb {
* @read_in_a_row: number of consecutive pages read in a row (for bulk read)
* @data_len: length of the data attached to the inode
* @data: inode's data
+ * @i_crypt_info: inode's fscrypt information
*
* @ui_mutex exists for two main reasons. At first it prevents inodes from
* being written back while UBIFS changing them, being in the middle of an VFS
@@ -416,6 +417,9 @@ struct ubifs_inode {
pgoff_t read_in_a_row;
int data_len;
void *data;
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+#endif
};
/**
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 503268cf4296..89eccc4becf9 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -284,9 +284,9 @@ static int enable_verity(struct file *filp,
/* Successfully enabled verity */
/*
- * Readers can start using ->i_verity_info immediately, so it
- * can't be rolled back once set. So don't set it until just
- * after the filesystem has successfully enabled verity.
+ * Readers can start using the inode's verity info immediately,
+ * so it can't be rolled back once set. So don't set it until
+ * just after the filesystem has successfully enabled verity.
*/
fsverity_set_info(inode, vi);
}
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 5fe854a5b9ad..bc1d887c532e 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -63,10 +63,11 @@ struct merkle_tree_params {
* fsverity_info - cached verity metadata for an inode
*
* When a verity file is first opened, an instance of this struct is allocated
- * and stored in ->i_verity_info; it remains until the inode is evicted. It
- * caches information about the Merkle tree that's needed to efficiently verify
- * data read from the file. It also caches the file digest. The Merkle tree
- * pages themselves are not cached here, but the filesystem may cache them.
+ * and a pointer to it is stored in the file's in-memory inode. It remains
+ * until the inode is evicted. It caches information about the Merkle tree
+ * that's needed to efficiently verify data read from the file. It also caches
+ * the file digest. The Merkle tree pages themselves are not cached here, but
+ * the filesystem may cache them.
*/
struct fsverity_info {
struct merkle_tree_params tree_params;
diff --git a/fs/verity/open.c b/fs/verity/open.c
index c561e130cd0c..77b1c977af02 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -244,17 +244,17 @@ fail:
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
{
/*
- * Multiple tasks may race to set ->i_verity_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fsverity_get_info(). I.e., here we publish ->i_verity_info with a
- * RELEASE barrier so that other tasks can ACQUIRE it.
+ * Multiple tasks may race to set the inode's verity info pointer, so
+ * use cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * fsverity_get_info(). I.e., publish the pointer with a RELEASE
+ * barrier so that other tasks can ACQUIRE it.
*/
- if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
- /* Lost the race, so free the fsverity_info we allocated. */
+ if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) {
+ /* Lost the race, so free the verity info we allocated. */
fsverity_free_info(vi);
/*
- * Afterwards, the caller may access ->i_verity_info directly,
- * so make sure to ACQUIRE the winning fsverity_info.
+ * Afterwards, the caller may access the inode's verity info
+ * directly, so make sure to ACQUIRE the winning verity info.
*/
(void)fsverity_get_info(inode);
}
@@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode,
return 0;
}
-/* Ensure the inode has an ->i_verity_info */
static int ensure_verity_info(struct inode *inode)
{
struct fsverity_info *vi = fsverity_get_info(inode);
@@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr);
void __fsverity_cleanup_inode(struct inode *inode)
{
- fsverity_free_info(inode->i_verity_info);
- inode->i_verity_info = NULL;
+ struct fsverity_info **vi_addr = fsverity_info_addr(inode);
+
+ fsverity_free_info(*vi_addr);
+ *vi_addr = NULL;
}
EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index a1f00c3fd3b2..f0c47b9afb8c 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -245,7 +245,7 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
unsigned long max_ra_pages)
{
struct inode *inode = data_folio->mapping->host;
- struct fsverity_info *vi = inode->i_verity_info;
+ struct fsverity_info *vi = *fsverity_info_addr(inode);
const unsigned int block_size = vi->tree_params.block_size;
u64 pos = (u64)data_folio->index << PAGE_SHIFT;
@@ -355,7 +355,7 @@ void __init fsverity_init_workqueue(void)
* latency on ARM64.
*/
fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
- WQ_HIGHPRI,
+ WQ_HIGHPRI | WQ_PERCPU,
num_online_cpus());
if (!fsverity_read_workqueue)
panic("failed to allocate fsverity_read_queue");
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ae0ca6858496..065953475cf5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -105,6 +105,7 @@ config XFS_POSIX_ACL
config XFS_RT
bool "XFS Realtime subvolume support"
depends on XFS_FS
+ default BLK_DEV_ZONED
help
If you say Y here you will be able to mount and use XFS filesystems
which contain a realtime subvolume. The realtime subvolume is a
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4c44ce1c8a64..bff3dc226f81 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -435,6 +435,13 @@ xfs_attr_rmtval_get(
0, &bp, &xfs_attr3_rmt_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
+ /*
+ * ENODATA from disk implies a disk medium failure;
+ * ENODATA for xattrs means attribute not found, so
+ * disambiguate that here.
+ */
+ if (error == -ENODATA)
+ error = -EIO;
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 17d9e6154f19..723a0643b838 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2833,6 +2833,12 @@ xfs_da_read_buf(
&bp, ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, whichfork);
+ /*
+ * ENODATA from disk implies a disk medium failure; ENODATA for
+ * xattrs means attribute not found, so disambiguate that here.
+ */
+ if (error == -ENODATA && whichfork == XFS_ATTR_FORK)
+ error = -EIO;
if (error)
goto out_free;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 1e6e9c10cea2..a8187281eb96 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class,
__field(xfs_exntst_t, state)
),
TP_fast_assign(
- __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dev = cursor->sc->mp->m_super->s_dev;
__entry->dqtype = cursor->dqtype;
__entry->ino = cursor->quota_ip->i_ino;
__entry->cur_id = cursor->id;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1ee4f835ac3c..a26f79815533 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -760,6 +760,9 @@ xfs_vm_swap_activate(
{
struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+ if (xfs_is_zoned_inode(ip))
+ return -EINVAL;
+
/*
* Swap file activation can race against concurrent shared extent
* removal in files that have been cloned. If this happens,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 55a304cb3aef..f96fbf5c54c9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1101,9 +1101,6 @@ xfs_file_write_iter(
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
- if (IS_DAX(inode))
- return xfs_file_dax_write(iocb, from);
-
if (iocb->ki_flags & IOCB_ATOMIC) {
if (ocount < xfs_get_atomic_write_min(ip))
return -EINVAL;
@@ -1116,6 +1113,9 @@ xfs_file_write_iter(
return ret;
}
+ if (IS_DAX(inode))
+ return xfs_file_dax_write(iocb, from);
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9c39251961a3..df8eab11dc48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1035,7 +1035,7 @@ xfs_itruncate_extents_flags(
int error = 0;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- if (atomic_read(&VFS_I(ip)->i_count))
+ if (icount_read(VFS_I(ip)))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 07fbdcc4cbf5..bd6d33557194 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -358,9 +358,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
+ if (IS_DAX(VFS_IC(ip)))
+ return false;
+
return xfs_inode_buftarg(ip)->bt_awu_max > 0;
}
+static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip)
+{
+ if (IS_DAX(VFS_IC(ip)))
+ return false;
+
+ return xfs_can_sw_atomic_write(ip->i_mount);
+}
+
/*
* In-core inode flags.
*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index fe1f74a3b6a3..e1051a530a50 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -219,7 +219,7 @@ xfs_bulk_ireq_setup(
else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
return -EINVAL;
- breq->flags |= XFS_IBULK_SAME_AG;
+ breq->iwalk_flags |= XFS_IWALK_SAME_AG;
/* Asking for an inode past the end of the AG? We're done! */
if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 149b5460fbfd..603effabe1ee 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -616,7 +616,8 @@ xfs_get_atomic_write_min(
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
- if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp))
+ if (xfs_inode_can_hw_atomic_write(ip) ||
+ xfs_inode_can_sw_atomic_write(ip))
return mp->m_sb.sb_blocksize;
return 0;
@@ -633,7 +634,7 @@ xfs_get_atomic_write_max(
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
- if (!xfs_can_sw_atomic_write(mp)) {
+ if (!xfs_inode_can_sw_atomic_write(ip)) {
if (xfs_inode_can_hw_atomic_write(ip))
return mp->m_sb.sb_blocksize;
return 0;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c8c9b8d8309f..2aa37a4d2706 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -307,7 +307,6 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
- unsigned int iwalk_flags = 0;
int error;
if (breq->idmap != &nop_mnt_idmap) {
@@ -328,10 +327,7 @@ xfs_bulkstat(
* locking abilities to detect cycles in the inobt without deadlocking.
*/
tp = xfs_trans_alloc_empty(breq->mp);
- if (breq->flags & XFS_IBULK_SAME_AG)
- iwalk_flags |= XFS_IWALK_SAME_AG;
-
- error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
+ error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
kfree(bc.buf);
@@ -457,7 +453,7 @@ xfs_inumbers(
* locking abilities to detect cycles in the inobt without deadlocking.
*/
tp = xfs_trans_alloc_empty(breq->mp);
- error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_inumbers_walk, breq->icount, &ic);
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index f10e8f8f2335..2d0612f14d6e 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -13,17 +13,15 @@ struct xfs_ibulk {
xfs_ino_t startino; /* start with this inode */
unsigned int icount; /* number of elements in ubuffer */
unsigned int ocount; /* number of records returned */
- unsigned int flags; /* see XFS_IBULK_FLAG_* */
+ unsigned int flags; /* XFS_IBULK_FLAG_* */
+ unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */
};
-/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (1U << 0)
-
/* Fill out the bs_extents64 field if set. */
-#define XFS_IBULK_NREXT64 (1U << 1)
+#define XFS_IBULK_NREXT64 (1U << 0)
/* Signal that we can return metadata directories. */
-#define XFS_IBULK_METADIR (1U << 2)
+#define XFS_IBULK_METADIR (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c8a57e21a1d3..0da19472d603 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1489,8 +1489,7 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
- WQ_HIGHPRI),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU),
0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2133fbaf1766..dc32c5e34d81 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -779,6 +779,25 @@ xfs_set_max_atomic_write_opt(
return -EINVAL;
}
+ if (xfs_has_reflink(mp))
+ goto set_limit;
+
+ if (new_max_fsbs == 1) {
+ if (mp->m_ddev_targp->bt_awu_max ||
+ (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) {
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink or HW support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+
set_limit:
error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
if (error) {
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 866c71d9fbae..73b7e72944e4 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -293,7 +293,8 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bb0a82635a77..70ee2d55e4f1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -578,19 +578,19 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_unwritten;
@@ -602,13 +602,14 @@ xfs_init_mount_workqueues(
goto out_destroy_reclaim;
mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_inodegc_wq)
goto out_destroy_blockgc;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
- XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0,
+ mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_inodegc;
@@ -778,7 +779,7 @@ xfs_fs_drop_inode(
return 0;
}
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
STATIC void
@@ -2596,8 +2597,8 @@ xfs_init_workqueues(void)
* AGs in all the filesystems mounted. Hence use the default large
* max_active value for this workqueue.
*/
- xfs_alloc_wq = alloc_workqueue("xfsalloc",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
+ xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 0);
if (!xfs_alloc_wq)
return -ENOMEM;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e1794e3e3156..79b8641880ab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -455,6 +455,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \
xfs_extlen_t len), \
TP_ARGS(oz, rgbno, len))
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
TRACE_EVENT(xfs_zone_gc_select_victim,
@@ -1151,7 +1152,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->count = icount_read(VFS_I(ip));
__entry->pincount = atomic_read(&ip->i_pincount);
__entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ece374d622b3..575e7028f423 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -253,8 +253,8 @@ xfs_trans_alloc(
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
tp = __xfs_trans_alloc(mp, flags);
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
if (error == -ENOSPC && want_retry) {
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 33f7eee521a8..f28214c28ab5 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -166,10 +166,9 @@ xfs_open_zone_mark_full(
static void
xfs_zone_record_blocks(
struct xfs_trans *tp,
- xfs_fsblock_t fsbno,
- xfs_filblks_t len,
struct xfs_open_zone *oz,
- bool used)
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = oz->oz_rtg;
@@ -179,18 +178,37 @@ xfs_zone_record_blocks(
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
- if (used) {
- rmapip->i_used_blocks += len;
- ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
- } else {
- xfs_add_frextents(mp, len);
- }
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
oz->oz_written += len;
if (oz->oz_written == rtg_blocks(rtg))
xfs_open_zone_mark_full(oz);
xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}
+/*
+ * Called for blocks that have been written to disk, but not actually linked to
+ * an inode, which can happen when garbage collection races with user data
+ * writes to a file.
+ */
+static void
+xfs_zone_skip_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t len)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+
+ trace_xfs_zone_skip_blocks(oz, 0, len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_add_frextents(rtg_mount(rtg), len);
+}
+
static int
xfs_zoned_map_extent(
struct xfs_trans *tp,
@@ -250,8 +268,7 @@ xfs_zoned_map_extent(
}
}
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- true);
+ xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
@@ -259,8 +276,7 @@ xfs_zoned_map_extent(
skip:
trace_xfs_reflink_cow_remap_skip(ip, new);
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- false);
+ xfs_zone_skip_blocks(oz, new->br_blockcount);
return 0;
}
@@ -358,44 +374,6 @@ xfs_zone_free_blocks(
return 0;
}
-/*
- * Check if the zone containing the data just before the offset we are
- * writing to is still open and has space.
- */
-static struct xfs_open_zone *
-xfs_last_used_zone(
- struct iomap_ioend *ioend)
-{
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
- struct xfs_rtgroup *rtg = NULL;
- struct xfs_open_zone *oz = NULL;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
- &icur, &got)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
- if (!rtg)
- return NULL;
-
- xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
- oz = READ_ONCE(rtg->rtg_open_zone);
- if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
- oz = NULL;
- xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
-
- xfs_rtgroup_rele(rtg);
- return oz;
-}
-
static struct xfs_group *
xfs_find_free_zone(
struct xfs_mount *mp,
@@ -902,12 +880,9 @@ xfs_zone_alloc_and_submit(
goto out_error;
/*
- * If we don't have a cached zone in this write context, see if the
- * last extent before the one we are writing to points to an active
- * zone. If so, just continue writing to it.
+ * If we don't have a locally cached zone in this write context, see if
+ * the inode is still associated with a zone and use that if so.
*/
- if (!*oz && ioend->io_offset)
- *oz = xfs_last_used_zone(ioend);
if (!*oz)
*oz = xfs_cached_zone(mp, ip);
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
index 1313c55b8cbe..9cd38716fd25 100644
--- a/fs/xfs/xfs_zone_space_resv.c
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -10,6 +10,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
@@ -230,6 +231,11 @@ xfs_zoned_space_reserve(
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) {
+ xfs_inodegc_flush(mp);
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
if (error)