summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c295
-rw-r--r--fs/9p/acl.h8
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dir.c2
-rw-r--r--fs/9p/vfs_inode_dotl.c4
-rw-r--r--fs/9p/xattr.c11
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/afs/cmservice.c2
-rw-r--r--fs/afs/dir.c2
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/rxrpc.c10
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/aio.c4
-rw-r--r--fs/attr.c74
-rw-r--r--fs/bad_inode.c4
-rw-r--r--fs/binfmt_elf.c299
-rw-r--r--fs/binfmt_elf_fdpic.c7
-rw-r--r--fs/binfmt_misc.c8
-rw-r--r--fs/btrfs/acl.c3
-rw-r--r--fs/btrfs/acl.h2
-rw-r--r--fs/btrfs/inode.c8
-rw-r--r--fs/btrfs/ioctl.c4
-rw-r--r--fs/btrfs/ordered-data.c6
-rw-r--r--fs/cachefiles/io.c77
-rw-r--r--fs/ceph/acl.c3
-rw-r--r--fs/ceph/addr.c4
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c6
-rw-r--r--fs/ceph/locks.c4
-rw-r--r--fs/ceph/mdsmap.c2
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/cifs/cifsacl.c139
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsproto.h20
-rw-r--r--fs/cifs/cifssmb.c206
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/cifs/file.c6
-rw-r--r--fs/cifs/fscache.c4
-rw-r--r--fs/cifs/smb2ops.c4
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/cifs/xattr.c68
-rw-r--r--fs/coredump.c12
-rw-r--r--fs/crypto/fscrypt_private.h13
-rw-r--r--fs/crypto/keyring.c14
-rw-r--r--fs/crypto/keysetup.c17
-rw-r--r--fs/crypto/policy.c12
-rw-r--r--fs/debugfs/file.c28
-rw-r--r--fs/dlm/ast.c322
-rw-r--r--fs/dlm/ast.h17
-rw-r--r--fs/dlm/config.c4
-rw-r--r--fs/dlm/debug_fs.c2
-rw-r--r--fs/dlm/dlm_internal.h25
-rw-r--r--fs/dlm/lock.c190
-rw-r--r--fs/dlm/lockspace.c14
-rw-r--r--fs/dlm/lowcomms.c1538
-rw-r--r--fs/dlm/lowcomms.h6
-rw-r--r--fs/dlm/main.c7
-rw-r--r--fs/dlm/member.c5
-rw-r--r--fs/dlm/memory.c30
-rw-r--r--fs/dlm/memory.h4
-rw-r--r--fs/dlm/midcomms.c141
-rw-r--r--fs/dlm/midcomms.h7
-rw-r--r--fs/dlm/rcom.c4
-rw-r--r--fs/dlm/requestqueue.c3
-rw-r--r--fs/dlm/user.c74
-rw-r--r--fs/dlm/user.h2
-rw-r--r--fs/ecryptfs/inode.c32
-rw-r--r--fs/erofs/data.c10
-rw-r--r--fs/erofs/fscache.c412
-rw-r--r--fs/erofs/inode.c8
-rw-r--r--fs/erofs/internal.h13
-rw-r--r--fs/erofs/namei.c2
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/xattr.c8
-rw-r--r--fs/erofs/zdata.c80
-rw-r--r--fs/erofs/zmap.c15
-rw-r--r--fs/exec.c50
-rw-r--r--fs/ext2/acl.c3
-rw-r--r--fs/ext2/acl.h2
-rw-r--r--fs/ext2/balloc.c12
-rw-r--r--fs/ext2/dir.c41
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext2/namei.c4
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/acl.c3
-rw-r--r--fs/ext4/acl.h2
-rw-r--r--fs/ext4/ext4.h13
-rw-r--r--fs/ext4/ext4_jbd2.c14
-rw-r--r--fs/ext4/ext4_jbd2.h10
-rw-r--r--fs/ext4/extents.c16
-rw-r--r--fs/ext4/extents_status.c11
-rw-r--r--fs/ext4/fast_commit.c205
-rw-r--r--fs/ext4/fast_commit.h3
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/ialloc.c8
-rw-r--r--fs/ext4/indirect.c9
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c254
-rw-r--r--fs/ext4/ioctl.c24
-rw-r--r--fs/ext4/mballoc.c10
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/namei.c51
-rw-r--r--fs/ext4/orphan.c2
-rw-r--r--fs/ext4/page-io.c44
-rw-r--r--fs/ext4/readpage.c13
-rw-r--r--fs/ext4/resize.c36
-rw-r--r--fs/ext4/super.c65
-rw-r--r--fs/ext4/verity.c2
-rw-r--r--fs/ext4/xattr.c22
-rw-r--r--fs/f2fs/acl.c4
-rw-r--r--fs/f2fs/acl.h2
-rw-r--r--fs/f2fs/compress.c64
-rw-r--r--fs/f2fs/data.c53
-rw-r--r--fs/f2fs/file.c4
-rw-r--r--fs/f2fs/gc.c2
-rw-r--r--fs/f2fs/namei.c4
-rw-r--r--fs/f2fs/segment.c8
-rw-r--r--fs/fat/nfs.c4
-rw-r--r--fs/fs_parser.c3
-rw-r--r--fs/fscache/cookie.c8
-rw-r--r--fs/fscache/io.c2
-rw-r--r--fs/fuse/acl.c5
-rw-r--r--fs/fuse/cuse.c5
-rw-r--r--fs/fuse/dev.c7
-rw-r--r--fs/fuse/dir.c47
-rw-r--r--fs/fuse/file.c45
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/ioctl.c4
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/gfs2/acl.c3
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/inode.c6
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c4
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/inode.c75
-rw-r--r--fs/internal.h35
-rw-r--r--fs/jbd2/commit.c5
-rw-r--r--fs/jffs2/acl.c3
-rw-r--r--fs/jffs2/acl.h2
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jfs/acl.c3
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/jfs_dmap.c27
-rw-r--r--fs/jfs/jfs_extent.h2
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_mount.c4
-rw-r--r--fs/jfs/jfs_umount.c4
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/jfs_xtree.h4
-rw-r--r--fs/jfs/namei.c4
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/ksmbd/smb2pdu.c8
-rw-r--r--fs/ksmbd/smbacl.c6
-rw-r--r--fs/ksmbd/vfs.c23
-rw-r--r--fs/ksmbd/vfs.h4
-rw-r--r--fs/libfs.c22
-rw-r--r--fs/lockd/svcsubs.c4
-rw-r--r--fs/locks.c50
-rw-r--r--fs/mbcache.c14
-rw-r--r--fs/namei.c44
-rw-r--r--fs/namespace.c179
-rw-r--r--fs/netfs/io.c6
-rw-r--r--fs/nfs/delegation.c2
-rw-r--r--fs/nfs/fscache.c4
-rw-r--r--fs/nfs/nfs3_fs.h2
-rw-r--r--fs/nfs/nfs3acl.c9
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nfsd/nfs2acl.c8
-rw-r--r--fs/nfsd/nfs3acl.c8
-rw-r--r--fs/nfsd/nfs4acl.c4
-rw-r--r--fs/nfsd/nfs4state.c6
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/nilfs2/the_nilfs.c73
-rw-r--r--fs/ntfs3/file.c4
-rw-r--r--fs/ntfs3/namei.c4
-rw-r--r--fs/ntfs3/ntfs_fs.h4
-rw-r--r--fs/ntfs3/xattr.c9
-rw-r--r--fs/ocfs2/acl.c3
-rw-r--r--fs/ocfs2/acl.h2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c38
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/netdebug.c2
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c8
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c19
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c30
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/file.c8
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/journal.h1
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h3
-rw-r--r--fs/ocfs2/stack_o2cb.c6
-rw-r--r--fs/ocfs2/stackglue.c8
-rw-r--r--fs/ocfs2/super.c5
-rw-r--r--fs/open.c8
-rw-r--r--fs/orangefs/acl.c47
-rw-r--r--fs/orangefs/inode.c62
-rw-r--r--fs/orangefs/namei.c2
-rw-r--r--fs/orangefs/orangefs-kernel.h7
-rw-r--r--fs/overlayfs/Kconfig2
-rw-r--r--fs/overlayfs/copy_up.c38
-rw-r--r--fs/overlayfs/dir.c68
-rw-r--r--fs/overlayfs/export.c8
-rw-r--r--fs/overlayfs/file.c31
-rw-r--r--fs/overlayfs/inode.c187
-rw-r--r--fs/overlayfs/namei.c12
-rw-r--r--fs/overlayfs/overlayfs.h53
-rw-r--r--fs/overlayfs/readdir.c58
-rw-r--r--fs/overlayfs/super.c114
-rw-r--r--fs/overlayfs/util.c15
-rw-r--r--fs/posix_acl.c727
-rw-r--r--fs/proc/cmdline.c6
-rw-r--r--fs/proc/consoles.c21
-rw-r--r--fs/proc/fd.c45
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/pstore/platform.c25
-rw-r--r--fs/pstore/ram.c44
-rw-r--r--fs/pstore/ram_core.c20
-rw-r--r--fs/pstore/ram_internal.h98
-rw-r--r--fs/pstore/zone.c2
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/read_write.c12
-rw-r--r--fs/reiserfs/acl.h6
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/namei.c4
-rw-r--r--fs/reiserfs/xattr_acl.c11
-rw-r--r--fs/remap_range.c2
-rw-r--r--fs/seq_file.c2
-rw-r--r--fs/splice.c10
-rw-r--r--fs/squashfs/Kconfig51
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/squashfs/decompressor.c2
-rw-r--r--fs/squashfs/decompressor_multi.c20
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c23
-rw-r--r--fs/squashfs/decompressor_single.c15
-rw-r--r--fs/squashfs/squashfs.h23
-rw-r--r--fs/squashfs/squashfs_fs_sb.h4
-rw-r--r--fs/squashfs/super.c102
-rw-r--r--fs/stat.c7
-rw-r--r--fs/super.c60
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/lpt_commit.c14
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/udf/inode.c83
-rw-r--r--fs/udf/namei.c8
-rw-r--r--fs/udf/super.c4
-rw-r--r--fs/udf/truncate.c48
-rw-r--r--fs/udf/udf_sb.h6
-rw-r--r--fs/verity/fsverity_private.h5
-rw-r--r--fs/verity/hash_algs.c6
-rw-r--r--fs/verity/measure.c19
-rw-r--r--fs/verity/verify.c12
-rw-r--r--fs/xattr.c118
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c2
-rw-r--r--fs/xfs/xfs_acl.c3
-rw-r--r--fs/xfs/xfs_acl.h2
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_iops.c16
276 files changed, 4970 insertions, 3986 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 4dac4a0dc5f4..c397c51f80d9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -17,34 +17,64 @@
#include "v9fs_vfs.h"
#include "fid.h"
-static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name)
{
ssize_t size;
void *value = NULL;
struct posix_acl *acl = NULL;
size = v9fs_fid_xattr_get(fid, name, NULL, 0);
- if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
- if (!value)
- return ERR_PTR(-ENOMEM);
- size = v9fs_fid_xattr_get(fid, name, value, size);
- if (size > 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- goto err_out;
- }
- } else if (size == -ENODATA || size == 0 ||
- size == -ENOSYS || size == -EOPNOTSUPP) {
- acl = NULL;
- } else
- acl = ERR_PTR(-EIO);
-
-err_out:
+ if (size < 0)
+ return ERR_PTR(size);
+ if (size == 0)
+ return ERR_PTR(-ENODATA);
+
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+
+ size = v9fs_fid_xattr_get(fid, name, value, size);
+ if (size < 0)
+ acl = ERR_PTR(size);
+ else if (size == 0)
+ acl = ERR_PTR(-ENODATA);
+ else
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
kfree(value);
return acl;
}
+static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name)
+{
+ struct p9_fid *fid;
+ struct posix_acl *acl = NULL;
+
+ fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return ERR_CAST(fid);
+
+ acl = v9fs_fid_get_acl(fid, name);
+ p9_fid_put(fid);
+ return acl;
+}
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name)
+{
+ int retval;
+ struct posix_acl *acl = NULL;
+
+ acl = v9fs_fid_get_acl(fid, name);
+ if (!IS_ERR(acl))
+ return acl;
+
+ retval = PTR_ERR(acl);
+ if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP)
+ return NULL;
+
+ /* map everything else to -EIO */
+ return ERR_PTR(-EIO);
+}
+
int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
int retval = 0;
@@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
return acl;
}
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu)
{
struct v9fs_session_info *v9ses;
@@ -109,6 +139,112 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
}
+struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ struct v9fs_session_info *v9ses;
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ /* We allow set/get/list of acl when access=client is not specified. */
+ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+ return v9fs_acl_get(dentry, posix_acl_xattr_name(type));
+ return v9fs_get_cached_acl(d_inode(dentry), type);
+}
+
+int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int retval;
+ size_t size = 0;
+ void *value = NULL;
+ const char *acl_name;
+ struct v9fs_session_info *v9ses;
+ struct inode *inode = d_inode(dentry);
+
+ if (acl) {
+ retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
+ if (retval)
+ goto err_out;
+
+ size = posix_acl_xattr_size(acl->a_count);
+
+ value = kzalloc(size, GFP_NOFS);
+ if (!value) {
+ retval = -ENOMEM;
+ goto err_out;
+ }
+
+ retval = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (retval < 0)
+ goto err_out;
+ }
+
+ /*
+ * set the attribute on the remote. Without even looking at the
+ * xattr value. We leave it to the server to validate
+ */
+ acl_name = posix_acl_xattr_name(type);
+ v9ses = v9fs_dentry2v9ses(dentry);
+ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+ retval = v9fs_xattr_set(dentry, acl_name, value, size, 0);
+ goto err_out;
+ }
+
+ if (S_ISLNK(inode->i_mode)) {
+ retval = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ if (!inode_owner_or_capable(&init_user_ns, inode)) {
+ retval = -EPERM;
+ goto err_out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (acl) {
+ struct iattr iattr = {};
+ struct posix_acl *acl_mode = acl;
+
+ retval = posix_acl_update_mode(&init_user_ns, inode,
+ &iattr.ia_mode,
+ &acl_mode);
+ if (retval)
+ goto err_out;
+ if (!acl_mode) {
+ /*
+ * ACL can be represented by the mode bits.
+ * So don't update ACL below.
+ */
+ kfree(value);
+ value = NULL;
+ size = 0;
+ }
+ iattr.ia_valid = ATTR_MODE;
+ /*
+ * FIXME should we update ctime ?
+ * What is the following setxattr update the mode ?
+ */
+ v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ retval = acl ? -EINVAL : 0;
+ goto err_out;
+ }
+ break;
+ }
+
+ retval = v9fs_xattr_set(dentry, acl_name, value, size, 0);
+ if (!retval)
+ set_cached_acl(inode, type, acl);
+
+err_out:
+ kfree(value);
+ return retval;
+}
+
static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
{
int retval;
@@ -207,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
*modep = mode;
return 0;
}
-
-static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- struct v9fs_session_info *v9ses;
- struct posix_acl *acl;
- int error;
-
- v9ses = v9fs_dentry2v9ses(dentry);
- /*
- * We allow set/get/list of acl when access=client is not specified
- */
- if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_get(dentry, handler->name, buffer, size);
-
- acl = v9fs_get_cached_acl(inode, handler->flags);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- int retval;
- struct posix_acl *acl;
- struct v9fs_session_info *v9ses;
-
- v9ses = v9fs_dentry2v9ses(dentry);
- /*
- * set the attribute on the remote. Without even looking at the
- * xattr value. We leave it to the server to validate
- */
- if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_set(dentry, handler->name, value, size,
- flags);
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
- if (value) {
- /* update the cached acl value */
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
- if (retval)
- goto err_out;
- }
- } else
- acl = NULL;
-
- switch (handler->flags) {
- case ACL_TYPE_ACCESS:
- if (acl) {
- struct iattr iattr = { 0 };
- struct posix_acl *old_acl = acl;
-
- retval = posix_acl_update_mode(&init_user_ns, inode,
- &iattr.ia_mode, &acl);
- if (retval)
- goto err_out;
- if (!acl) {
- /*
- * ACL can be represented
- * by the mode bits. So don't
- * update ACL.
- */
- posix_acl_release(old_acl);
- value = NULL;
- size = 0;
- }
- iattr.ia_valid = ATTR_MODE;
- /* FIXME should we update ctime ?
- * What is the following setxattr update the
- * mode ?
- */
- v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
- }
- break;
- case ACL_TYPE_DEFAULT:
- if (!S_ISDIR(inode->i_mode)) {
- retval = acl ? -EINVAL : 0;
- goto err_out;
- }
- break;
- default:
- BUG();
- }
- retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
- if (!retval)
- set_cached_acl(inode, handler->flags, acl);
-err_out:
- posix_acl_release(acl);
- return retval;
-}
-
-const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = v9fs_xattr_get_acl,
- .set = v9fs_xattr_set_acl,
-};
-
-const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = v9fs_xattr_get_acl,
- .set = v9fs_xattr_set_acl,
-};
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ce5175d463dd..4c60a2bce5de 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -8,8 +8,12 @@
#ifdef CONFIG_9P_FS_POSIX_ACL
int v9fs_get_acl(struct inode *inode, struct p9_fid *fid);
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type,
+struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type,
bool rcu);
+struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
+int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid);
int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
struct posix_acl *dacl, struct posix_acl *acl);
@@ -17,7 +21,9 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl);
void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
+#define v9fs_iop_get_inode_acl NULL
#define v9fs_iop_get_acl NULL
+#define v9fs_iop_set_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
return 0;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 47b9a1122f34..a19891015f19 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -40,7 +40,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
+ iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
total = p9_client_read(fid, pos, &to, &err);
@@ -172,7 +172,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio)
len = min_t(loff_t, i_size - start, len);
- iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len);
+ iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 000fbaae9b18..3bb95adc9619 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -109,7 +109,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
struct iov_iter to;
int n;
- iov_iter_kvec(&to, READ, &kvec, 1, buflen);
+ iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
if (err)
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5cfa4b4f070f..03c1743c4aff 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -983,14 +983,18 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.listxattr = v9fs_listxattr,
+ .get_inode_acl = v9fs_iop_get_inode_acl,
.get_acl = v9fs_iop_get_acl,
+ .set_acl = v9fs_iop_set_acl,
};
const struct inode_operations v9fs_file_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.listxattr = v9fs_listxattr,
+ .get_inode_acl = v9fs_iop_get_inode_acl,
.get_acl = v9fs_iop_get_acl,
+ .set_acl = v9fs_iop_set_acl,
};
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 1f9298a4bd42..b6984311e00a 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/uio.h>
+#include <linux/posix_acl_xattr.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -24,7 +25,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
struct iov_iter to;
int err;
- iov_iter_kvec(&to, READ, &kvec, 1, buffer_size);
+ iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size);
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
@@ -109,7 +110,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
struct iov_iter from;
int retval, err;
- iov_iter_kvec(&from, WRITE, &kvec, 1, value_len);
+ iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len);
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
@@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = {
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
-#ifdef CONFIG_9P_FS_POSIX_ACL
- &v9fs_xattr_acl_access_handler,
- &v9fs_xattr_acl_default_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_9P_FS_SECURITY
&v9fs_xattr_security_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 3e11fc3331eb..b5636e544c8a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -11,8 +11,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern const struct xattr_handler v9fs_xattr_acl_access_handler;
-extern const struct xattr_handler v9fs_xattr_acl_default_handler;
ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 0a090d614e76..7dcd59693a0c 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -298,7 +298,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (call->count2 != call->count && call->count2 != 0)
return afs_protocol_error(call, afs_eproto_cb_count);
call->iter = &call->def_iter;
- iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
+ iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4);
call->unmarshall++;
fallthrough;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 230c2d19116d..104df2964225 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -305,7 +305,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d1cfb235c4b9..2eeab57df133 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -324,7 +324,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ,
+ iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
&fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
@@ -346,7 +346,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages,
+ iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 723d162078a3..9ba7b68375c9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1301,7 +1301,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
call->iov_len = size;
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
- iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
+ iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size);
}
static inline void afs_extract_to_tmp(struct afs_call *call)
@@ -1319,7 +1319,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call)
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
call->iov_len = size;
- iov_iter_discard(&call->def_iter, READ, size);
+ iov_iter_discard(&call->def_iter, ITER_DEST, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index eccc3cd0cb70..c62939e5ea1f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -359,7 +359,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0);
@@ -400,7 +400,7 @@ error_do_abort:
RX_USER_ABORT, ret, "KSD");
} else {
len = 0;
- iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
@@ -485,7 +485,7 @@ static void afs_deliver_to_call(struct afs_call *call)
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
len = 0;
- iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
call->rxcall, &call->def_iter,
&len, false, &remote_abort,
@@ -822,7 +822,7 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -862,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9ebdd36eaf2f..08fd456dde67 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -609,7 +609,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
*/
afs_write_to_cache(vnode, start, len, i_size, caching);
- iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
ret = afs_store_data(vnode, &iter, start, false);
} else {
_debug("write discard %x @%llx [%llx]", len, start, i_size);
@@ -1000,7 +1000,7 @@ int afs_launder_folio(struct folio *folio)
bv[0].bv_page = &folio->page;
bv[0].bv_offset = f;
bv[0].bv_len = t - f;
- iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len);
trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
diff --git a/fs/aio.c b/fs/aio.c
index 5b2ff20ad322..562916d85cba 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1552,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
- ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
+ ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter);
if (ret < 0)
return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
@@ -1580,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
if (unlikely(!file->f_op->write_iter))
return -EINVAL;
- ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
+ ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter);
if (ret < 0)
return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
diff --git a/fs/attr.c b/fs/attr.c
index 1552a5f23d6b..b45f30e516fa 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,70 @@
#include <linux/evm.h>
#include <linux/ima.h>
+#include "internal.h"
+
+/**
+ * setattr_should_drop_sgid - determine whether the setgid bit needs to be
+ * removed
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the setgid bit needs to be removed.
+ * We retain backwards compatibility and require setgid bit to be removed
+ * unconditionally if S_IXGRP is set. Otherwise we have the exact same
+ * requirements as setattr_prepare() and setattr_copy().
+ *
+ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+
+ if (!(mode & S_ISGID))
+ return 0;
+ if (mode & S_IXGRP)
+ return ATTR_KILL_SGID;
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
+ return ATTR_KILL_SGID;
+ return 0;
+}
+
+/**
+ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
+ * be dropped
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the set{g,u}id bits need to be removed.
+ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
+ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
+ * set{g,u}id bits need to be removed the corresponding mask of both flags is
+ * returned.
+ *
+ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
+ * to remove, 0 otherwise.
+ */
+int setattr_should_drop_suidgid(struct user_namespace *mnt_userns,
+ struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ kill |= setattr_should_drop_sgid(mnt_userns, inode);
+
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+EXPORT_SYMBOL(setattr_should_drop_suidgid);
+
/**
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
@@ -140,8 +204,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode, vfsgid))
attr->ia_mode &= ~S_ISGID;
}
@@ -251,9 +314,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
@@ -375,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
}
}
if (ia_valid & ATTR_KILL_SGID) {
- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ if (mode & S_ISGID) {
if (!(ia_valid & ATTR_MODE)) {
ia_valid = attr->ia_valid |= ATTR_MODE;
attr->ia_mode = inode->i_mode;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 9d1cde8066cf..92737166203f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -154,7 +154,7 @@ static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
}
static int bad_inode_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl,
+ struct dentry *dentry, struct posix_acl *acl,
int type)
{
return -EIO;
@@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops =
.setattr = bad_inode_setattr,
.listxattr = bad_inode_listxattr,
.get_link = bad_inode_get_link,
- .get_acl = bad_inode_get_acl,
+ .get_inode_acl = bad_inode_get_acl,
.fiemap = bad_inode_fiemap,
.update_time = bad_inode_update_time,
.atomic_open = bad_inode_atomic_open,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6a11025e5850..de63572a9404 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
} while (0)
#ifdef ARCH_DLINFO
- /*
+ /*
* ARCH_DLINFO must come first so PPC can do its special alignment of
* AUXV.
* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
@@ -456,13 +456,13 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr)
*
* Loads ELF program headers from the binary file elf_file, which has the ELF
* header pointed to by elf_ex, into a newly allocated array. The caller is
- * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
+ * responsible for freeing the allocated data. Returns NULL upon failure.
*/
static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, err = -1;
+ int retval = -1;
unsigned int size;
/*
@@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
/* Read in the program headers */
retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff);
- if (retval < 0) {
- err = retval;
- goto out;
- }
- /* Success! */
- err = 0;
out:
- if (err) {
+ if (retval) {
kfree(elf_phdata);
elf_phdata = NULL;
}
@@ -1020,7 +1014,7 @@ out_free_interp:
executable_stack);
if (retval < 0)
goto out_free_dentry;
-
+
elf_bss = 0;
elf_brk = 0;
@@ -1043,7 +1037,7 @@ out_free_interp:
if (unlikely (elf_brk > elf_bss)) {
unsigned long nbyte;
-
+
/* There was a PT_LOAD segment with p_memsz > p_filesz
before this one. Map anonymous pages, if needed,
and clear the area. */
@@ -1166,7 +1160,7 @@ out_free_interp:
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
- retval = IS_ERR((void *)error) ?
+ retval = IS_ERR_VALUE(error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
}
@@ -1251,7 +1245,7 @@ out_free_interp:
interpreter,
load_bias, interp_elf_phdata,
&arch_state);
- if (!IS_ERR((void *)elf_entry)) {
+ if (!IS_ERR_VALUE(elf_entry)) {
/*
* load_elf_interp() returns relocation
* adjustment
@@ -1260,7 +1254,7 @@ out_free_interp:
elf_entry += interp_elf_ex->e_entry;
}
if (BAD_ADDR(elf_entry)) {
- retval = IS_ERR((void *)elf_entry) ?
+ retval = IS_ERR_VALUE(elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
}
@@ -1521,7 +1515,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 0;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
+static void fill_note(struct memelfnote *note, const char *name, int type,
unsigned int sz, void *data)
{
note->name = name;
@@ -1724,7 +1718,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
return 0;
}
-#ifdef CORE_DUMP_USE_REGSET
#include <linux/regset.h>
struct elf_thread_core_info {
@@ -1745,6 +1738,7 @@ struct elf_note_info {
int thread_notes;
};
+#ifdef CORE_DUMP_USE_REGSET
/*
* When a regset has a writeback hook, we call it on each thread before
* dumping user memory. On register window machines, this makes sure the
@@ -1824,34 +1818,58 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
return 1;
}
+#else
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+ const struct user_regset_view *view,
+ long signr, struct elf_note_info *info)
+{
+ struct task_struct *p = t->task;
+ elf_fpregset_t *fpu;
+
+ fill_prstatus(&t->prstatus.common, p, signr);
+ elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+
+ fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ &(t->prstatus));
+ info->size += notesize(&t->notes[0]);
+
+ fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL);
+ if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) {
+ kfree(fpu);
+ return 1;
+ }
+
+ t->prstatus.pr_fpvalid = 1;
+ fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+ info->size += notesize(&t->notes[1]);
+
+ return 1;
+}
+#endif
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
- const struct user_regset_view *view = task_user_regset_view(dump_task);
+ const struct user_regset_view *view;
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
struct core_thread *ct;
- unsigned int i;
-
- info->size = 0;
- info->thread = NULL;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
- if (psinfo == NULL) {
- info->psinfo.data = NULL; /* So we don't free this wrongly */
+ if (!psinfo)
return 0;
- }
-
fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+#ifdef CORE_DUMP_USE_REGSET
+ view = task_user_regset_view(dump_task);
+
/*
* Figure out how many notes we're going to need for each thread.
*/
info->thread_notes = 0;
- for (i = 0; i < view->n; ++i)
+ for (int i = 0; i < view->n; ++i)
if (view->regsets[i].core_note_type != 0)
++info->thread_notes;
@@ -1870,11 +1888,23 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
*/
fill_elf_header(elf, phdrs,
view->e_machine, view->e_flags);
+#else
+ view = NULL;
+ info->thread_notes = 2;
+ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+#endif
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
+ info->thread = kzalloc(offsetof(struct elf_thread_core_info,
+ notes[info->thread_notes]),
+ GFP_KERNEL);
+ if (unlikely(!info->thread))
+ return 0;
+
+ info->thread->task = dump_task;
+ for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -1882,17 +1912,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 0;
t->task = ct->task;
- if (ct->task == dump_task || !info->thread) {
- t->next = info->thread;
- info->thread = t;
- } else {
- /*
- * Make sure to keep the original task at
- * the head of the list.
- */
- t->next = info->thread->next;
- info->thread->next = t;
- }
+ t->next = info->thread->next;
+ info->thread->next = t;
}
/*
@@ -1920,11 +1941,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 1;
}
-static size_t get_note_info_size(struct elf_note_info *info)
-{
- return info->size;
-}
-
/*
* Write all the notes for each thread. When writing the first thread, the
* process-wide notes are interleaved after the first thread-specific note.
@@ -1979,197 +1995,6 @@ static void free_note_info(struct elf_note_info *info)
kvfree(info->files.data);
}
-#else
-
-/* Here is the structure in which status of each thread is captured. */
-struct elf_thread_status
-{
- struct list_head list;
- struct elf_prstatus prstatus; /* NT_PRSTATUS */
- elf_fpregset_t fpu; /* NT_PRFPREG */
- struct task_struct *thread;
- struct memelfnote notes[3];
- int num_notes;
-};
-
-/*
- * In order to add the specific thread information for the elf file format,
- * we need to keep a linked list of every threads pr_status and then create
- * a single section for them in the final core file.
- */
-static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
-{
- int sz = 0;
- struct task_struct *p = t->thread;
- t->num_notes = 0;
-
- fill_prstatus(&t->prstatus.common, p, signr);
- elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
-
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
- &(t->prstatus));
- t->num_notes++;
- sz += notesize(&t->notes[0]);
-
- if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
- &t->fpu))) {
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
- &(t->fpu));
- t->num_notes++;
- sz += notesize(&t->notes[1]);
- }
- return sz;
-}
-
-struct elf_note_info {
- struct memelfnote *notes;
- struct memelfnote *notes_files;
- struct elf_prstatus *prstatus; /* NT_PRSTATUS */
- struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
- struct list_head thread_list;
- elf_fpregset_t *fpu;
- user_siginfo_t csigdata;
- int thread_status_size;
- int numnote;
-};
-
-static int elf_note_info_init(struct elf_note_info *info)
-{
- memset(info, 0, sizeof(*info));
- INIT_LIST_HEAD(&info->thread_list);
-
- /* Allocate space for ELF notes */
- info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL);
- if (!info->notes)
- return 0;
- info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
- if (!info->psinfo)
- return 0;
- info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
- if (!info->prstatus)
- return 0;
- info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
- if (!info->fpu)
- return 0;
- return 1;
-}
-
-static int fill_note_info(struct elfhdr *elf, int phdrs,
- struct elf_note_info *info,
- struct coredump_params *cprm)
-{
- struct core_thread *ct;
- struct elf_thread_status *ets;
-
- if (!elf_note_info_init(info))
- return 0;
-
- for (ct = current->signal->core_state->dumper.next;
- ct; ct = ct->next) {
- ets = kzalloc(sizeof(*ets), GFP_KERNEL);
- if (!ets)
- return 0;
-
- ets->thread = ct->task;
- list_add(&ets->list, &info->thread_list);
- }
-
- list_for_each_entry(ets, &info->thread_list, list) {
- int sz;
-
- sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
- info->thread_status_size += sz;
- }
- /* now collect the dump for the current */
- memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
-
- /* Set up header */
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
-
- /*
- * Set up the notes in similar form to SVR4 core dumps made
- * with info from their /proc.
- */
-
- fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
- sizeof(*info->prstatus), info->prstatus);
- fill_psinfo(info->psinfo, current->group_leader, current->mm);
- fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
- sizeof(*info->psinfo), info->psinfo);
-
- fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
- fill_auxv_note(info->notes + 3, current->mm);
- info->numnote = 4;
-
- if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
- info->notes_files = info->notes + info->numnote;
- info->numnote++;
- }
-
- /* Try to dump the FPU. */
- info->prstatus->pr_fpvalid =
- elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
- if (info->prstatus->pr_fpvalid)
- fill_note(info->notes + info->numnote++,
- "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
- return 1;
-}
-
-static size_t get_note_info_size(struct elf_note_info *info)
-{
- int sz = 0;
- int i;
-
- for (i = 0; i < info->numnote; i++)
- sz += notesize(info->notes + i);
-
- sz += info->thread_status_size;
-
- return sz;
-}
-
-static int write_note_info(struct elf_note_info *info,
- struct coredump_params *cprm)
-{
- struct elf_thread_status *ets;
- int i;
-
- for (i = 0; i < info->numnote; i++)
- if (!writenote(info->notes + i, cprm))
- return 0;
-
- /* write out the thread status notes section */
- list_for_each_entry(ets, &info->thread_list, list) {
- for (i = 0; i < ets->num_notes; i++)
- if (!writenote(&ets->notes[i], cprm))
- return 0;
- }
-
- return 1;
-}
-
-static void free_note_info(struct elf_note_info *info)
-{
- while (!list_empty(&info->thread_list)) {
- struct list_head *tmp = info->thread_list.next;
- list_del(tmp);
- kfree(list_entry(tmp, struct elf_thread_status, list));
- }
-
- /* Free data possibly allocated by fill_files_note(): */
- if (info->notes_files)
- kvfree(info->notes_files->data);
-
- kfree(info->prstatus);
- kfree(info->psinfo);
- kfree(info->notes);
- kfree(info->fpu);
-}
-
-#endif
-
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
elf_addr_t e_shoff, int segs)
{
@@ -2233,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Write notes phdr entry */
{
- size_t sz = get_note_info_size(&info);
+ size_t sz = info.size;
/* For cell spufs */
sz += elf_coredump_extra_notes_size();
@@ -2295,7 +2120,7 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!write_note_info(&info, cprm))
goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 08d0c8797828..096e3520a0b1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
- if (create_elf_fdpic_tables(bprm, current->mm,
- &exec_params, &interp_params) < 0)
+ retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params,
+ &interp_params);
+ if (retval < 0)
goto error;
kdebug("- start_code %lx", current->mm->start_code);
@@ -1603,7 +1604,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!writenote(thread_list->notes, cprm))
goto end_coredump;
if (!writenote(&psinfo_note, cprm))
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e1eae7ea823a..bb202ad369d5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -44,10 +44,10 @@ static LIST_HEAD(entries);
static int enabled = 1;
enum {Enabled, Magic};
-#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
-#define MISC_FMT_OPEN_BINARY (1 << 30)
-#define MISC_FMT_CREDENTIALS (1 << 29)
-#define MISC_FMT_OPEN_FILE (1 << 28)
+#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
+#define MISC_FMT_OPEN_BINARY (1UL << 30)
+#define MISC_FMT_CREDENTIALS (1UL << 29)
+#define MISC_FMT_OPEN_FILE (1UL << 28)
typedef struct {
struct list_head list;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 100bae33c677..3da1779e8b79 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -110,10 +110,11 @@ out:
return ret;
}
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret;
+ struct inode *inode = d_inode(dentry);
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index 45197b4f73bf..39bd36e6eeb7 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -6,7 +6,7 @@
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 905ea19df125..8bcad9940154 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5307,7 +5307,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
err = btrfs_dirty_inode(BTRFS_I(inode));
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
return err;
@@ -11360,7 +11360,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.mknod = btrfs_mknod,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
@@ -11413,7 +11413,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.fileattr_get = btrfs_fileattr_get,
@@ -11424,7 +11424,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4fd6b61b06a4..7e348bd2ccde 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4393,7 +4393,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
- ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
@@ -4492,7 +4492,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (args.len > args.unencoded_len - args.unencoded_offset)
goto out_acct;
- ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 4bed0839b640..57d8c72737e1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -764,11 +764,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *ordered;
if (start + len < start) {
- orig_end = INT_LIMIT(loff_t);
+ orig_end = OFFSET_MAX;
} else {
orig_end = start + len - 1;
- if (orig_end > INT_LIMIT(loff_t))
- orig_end = INT_LIMIT(loff_t);
+ if (orig_end > OFFSET_MAX)
+ orig_end = OFFSET_MAX;
}
/* start IO across the range first to instantiate any delalloc
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 000a28f46e59..175a25fcade8 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
term_func, term_func_priv);
}
-/*
- * Prepare a read operation, shortening it to a cached/uncached
- * boundary as appropriate.
- */
-static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
+static inline enum netfs_io_source
+cachefiles_do_prepare_read(struct netfs_cache_resources *cres,
+ loff_t start, size_t *_len, loff_t i_size,
+ unsigned long *_flags, ino_t netfs_ino)
{
enum cachefiles_prepare_read_trace why;
- struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct cachefiles_object *object;
+ struct cachefiles_object *object = NULL;
struct cachefiles_cache *cache;
struct fscache_cookie *cookie = fscache_cres_cookie(cres);
const struct cred *saved_cred;
struct file *file = cachefiles_cres_file(cres);
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
+ size_t len = *_len;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
int rc;
- _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
+ _enter("%zx @%llx/%llx", len, start, i_size);
- if (subreq->start >= i_size) {
+ if (start >= i_size) {
ret = NETFS_FILL_WITH_ZEROES;
why = cachefiles_trace_read_after_eof;
goto out_no_object;
}
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
- __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
why = cachefiles_trace_read_no_data;
- if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
+ if (!test_bit(NETFS_SREQ_ONDEMAND, _flags))
goto out_no_object;
}
@@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
retry:
off = cachefiles_inject_read_error();
if (off == 0)
- off = vfs_llseek(file, subreq->start, SEEK_DATA);
+ off = vfs_llseek(file, start, SEEK_DATA);
if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
if (off == (loff_t)-ENXIO) {
why = cachefiles_trace_read_seek_nxio;
@@ -449,21 +446,22 @@ retry:
goto out;
}
- if (off >= subreq->start + subreq->len) {
+ if (off >= start + len) {
why = cachefiles_trace_read_found_hole;
goto download_and_store;
}
- if (off > subreq->start) {
+ if (off > start) {
off = round_up(off, cache->bsize);
- subreq->len = off - subreq->start;
+ len = off - start;
+ *_len = len;
why = cachefiles_trace_read_found_part;
goto download_and_store;
}
to = cachefiles_inject_read_error();
if (to == 0)
- to = vfs_llseek(file, subreq->start, SEEK_HOLE);
+ to = vfs_llseek(file, start, SEEK_HOLE);
if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), to,
cachefiles_trace_seek_error);
@@ -471,12 +469,13 @@ retry:
goto out;
}
- if (to < subreq->start + subreq->len) {
- if (subreq->start + subreq->len >= i_size)
+ if (to < start + len) {
+ if (start + len >= i_size)
to = round_up(to, cache->bsize);
else
to = round_down(to, cache->bsize);
- subreq->len = to - subreq->start;
+ len = to - start;
+ *_len = len;
}
why = cachefiles_trace_read_have_data;
@@ -484,12 +483,11 @@ retry:
goto out;
download_and_store:
- __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
- rc = cachefiles_ondemand_read(object, subreq->start,
- subreq->len);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
+ if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) {
+ rc = cachefiles_ondemand_read(object, start, len);
if (!rc) {
- __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
+ __clear_bit(NETFS_SREQ_ONDEMAND, _flags);
goto retry;
}
ret = NETFS_INVALID_READ;
@@ -497,11 +495,35 @@ download_and_store:
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
- trace_cachefiles_prep_read(subreq, ret, why, ino);
+ trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino);
return ret;
}
/*
+ * Prepare a read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
+ loff_t i_size)
+{
+ return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
+ subreq->start, &subreq->len, i_size,
+ &subreq->flags, subreq->rreq->inode->i_ino);
+}
+
+/*
+ * Prepare an on-demand read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_io_source
+cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
+ loff_t start, size_t *_len, loff_t i_size,
+ unsigned long *_flags, ino_t ino)
+{
+ return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino);
+}
+
+/*
* Prepare for a write to occur.
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
@@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.write = cachefiles_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .prepare_ondemand_read = cachefiles_prepare_ondemand_read,
.query_occupancy = cachefiles_query_occupancy,
};
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index f4fc8e0b847c..c7e8dd5b58d4 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -85,13 +85,14 @@ retry:
return acl;
}
-int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret = 0, size = 0;
const char *name = NULL;
char *value = NULL;
struct iattr newattrs;
+ struct inode *inode = d_inode(dentry);
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index dcf701b05cc1..61f47debec5a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
}
len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
if (err == 0)
err = -EFAULT;
@@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e7e2ebac330d..6c7026cc8988 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = {
.getattr = ceph_getattr,
.setattr = ceph_setattr,
.listxattr = ceph_listxattr,
- .get_acl = ceph_get_acl,
+ .get_inode_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 04fd34557de8..6f9580defb2b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1161,7 +1161,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen;
}
- iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
+ iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs,
osd_data->num_bvecs, len);
iov_iter_advance(&i, rc);
iov_iter_zero(zlen, &i);
@@ -1400,7 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int zlen = min_t(size_t, len - ret,
size - pos - ret);
- iov_iter_bvec(&i, READ, bvecs, num_pages, len);
+ iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len);
iov_iter_advance(&i, ret);
iov_iter_zero(zlen, &i);
ret += zlen;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bad9eeb6a1a5..f23c5a6edc6f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = {
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.listxattr = ceph_listxattr,
- .get_acl = ceph_get_acl,
+ .get_inode_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
};
@@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32_max(nsplits);
+ i = get_random_u32_below(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
@@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
+ err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode);
return err;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 3e2843e86e27..f3b461c708a8 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -364,7 +364,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
*fcntl_count = 0;
*flock_count = 0;
- ctx = inode->i_flctx;
+ ctx = locks_inode_context(inode);
if (ctx) {
spin_lock(&ctx->flc_lock);
list_for_each_entry(lock, &ctx->flc_posix, fl_list)
@@ -418,7 +418,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 3fbabc98e1f7..7dac21ee6ce7 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
return -1;
/* pick */
- n = prandom_u32_max(n);
+ n = get_random_u32_below(n);
for (j = 0, i = 0; i < m->possible_max_rank; i++) {
if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 40630e6f691c..50e57a1fa32f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1117,7 +1117,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
struct posix_acl *ceph_get_acl(struct inode *, int, bool);
int ceph_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type);
+ struct dentry *dentry, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acl_sec_ctx *as_ctx);
void ceph_init_inode_acls(struct inode *inode,
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fa480d62f313..c647f0d56518 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -13,6 +13,9 @@
#include <linux/string.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
+#include <uapi/linux/posix_acl.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <keys/user-type.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -20,6 +23,8 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "fs_context.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
/* security id for everyone/world system group */
static const struct cifs_sid sid_everyone = {
@@ -1668,3 +1673,137 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
kfree(pntsd);
return rc;
}
+
+struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX)
+ struct posix_acl *acl = NULL;
+ ssize_t rc = -EOPNOTSUPP;
+ unsigned int xid;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *pTcon;
+ const char *full_path;
+ void *page;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+ pTcon = tlink_tcon(tlink);
+
+ xid = get_xid();
+ page = alloc_dentry_path();
+
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ acl = ERR_CAST(full_path);
+ goto out;
+ }
+
+ /* return alt name if available as pseudo attr */
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_get_acl(xid, pTcon, full_path, &acl,
+ ACL_TYPE_ACCESS,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_get_acl(xid, pTcon, full_path, &acl,
+ ACL_TYPE_DEFAULT,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+ }
+
+ if (rc < 0) {
+ if (rc == -EINVAL)
+ acl = ERR_PTR(-EOPNOTSUPP);
+ else
+ acl = ERR_PTR(rc);
+ }
+
+out:
+ free_dentry_path(page);
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return acl;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX)
+ int rc = -EOPNOTSUPP;
+ unsigned int xid;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *pTcon;
+ const char *full_path;
+ void *page;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ pTcon = tlink_tcon(tlink);
+
+ xid = get_xid();
+ page = alloc_dentry_path();
+
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ goto out;
+ }
+
+ if (!acl)
+ goto out;
+
+ /* return dos attributes as pseudo xattr */
+ /* return alt name if available as pseudo attr */
+
+ /* if proc/fs/cifs/streamstoxattr is set then
+ search server for EAs or streams to
+ returns as xattrs */
+ if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) {
+ cifs_dbg(FYI, "size of EA value too large\n");
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_set_acl(xid, pTcon, full_path, acl,
+ ACL_TYPE_ACCESS,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_set_acl(xid, pTcon, full_path, acl,
+ ACL_TYPE_DEFAULT,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+ }
+
+out:
+ free_dentry_path(page);
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return rc;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 712a43161448..040267ed8a64 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1133,6 +1133,8 @@ const struct inode_operations cifs_dir_inode_ops = {
.symlink = cifs_symlink,
.mknod = cifs_mknod,
.listxattr = cifs_listxattr,
+ .get_acl = cifs_get_acl,
+ .set_acl = cifs_set_acl,
};
const struct inode_operations cifs_file_inode_ops = {
@@ -1141,6 +1143,8 @@ const struct inode_operations cifs_file_inode_ops = {
.permission = cifs_permission,
.listxattr = cifs_listxattr,
.fiemap = cifs_fiemap,
+ .get_acl = cifs_get_acl,
+ .set_acl = cifs_set_acl,
};
const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 83e83d8beabb..f50f96e4ec30 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -224,6 +224,10 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *, u32);
extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
const struct cifs_fid *, u32 *, u32);
+extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
+extern int cifs_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct posix_acl *acl, int type);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
const char *, int);
extern unsigned int setup_authusers_ACE(struct cifs_ace *pace);
@@ -537,14 +541,14 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
struct cifs_ntsd *, __u32, int);
-extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName,
- char *acl_inf, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *fileName,
- const char *local_acl, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap_special_chars);
+extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName,
+ struct posix_acl **acl, const int acl_type,
+ const struct nls_table *nls_codepage, int remap);
+extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName,
+ const struct posix_acl *acl, const int acl_type,
+ const struct nls_table *nls_codepage, int remap);
extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1724066c1536..23f10e0d6e7e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2914,32 +2914,57 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
#ifdef CONFIG_CIFS_POSIX
-/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
-static void cifs_convert_ace(struct posix_acl_xattr_entry *ace,
- struct cifs_posix_ace *cifs_ace)
+#ifdef CONFIG_FS_POSIX_ACL
+/**
+ * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format
+ * @ace: POSIX ACL entry to store converted ACL into
+ * @cifs_ace: ACL in cifs format
+ *
+ * Convert an Access Control Entry from wire format to local POSIX xattr
+ * format.
+ *
+ * Note that the @cifs_uid member is used to store both {g,u}id_t.
+ */
+static void cifs_init_posix_acl(struct posix_acl_entry *ace,
+ struct cifs_posix_ace *cifs_ace)
{
/* u8 cifs fields do not need le conversion */
- ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
- ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
- ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-/*
- cifs_dbg(FYI, "perm %d tag %d id %d\n",
- ace->e_perm, ace->e_tag, ace->e_id);
-*/
+ ace->e_perm = cifs_ace->cifs_e_perm;
+ ace->e_tag = cifs_ace->cifs_e_tag;
+ switch (ace->e_tag) {
+ case ACL_USER:
+ ace->e_uid = make_kuid(&init_user_ns,
+ le64_to_cpu(cifs_ace->cifs_uid));
+ break;
+ case ACL_GROUP:
+ ace->e_gid = make_kgid(&init_user_ns,
+ le64_to_cpu(cifs_ace->cifs_uid));
+ break;
+ }
return;
}
-/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */
-static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
- const int acl_type, const int size_of_data_area)
+/**
+ * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format
+ * @acl: ACLs returned in POSIX ACL format
+ * @src: ACLs in cifs format
+ * @acl_type: type of POSIX ACL requested
+ * @size_of_data_area: size of SMB we got
+ *
+ * This function converts ACLs from cifs format to POSIX ACL format.
+ * If @acl is NULL then the size of the buffer required to store POSIX ACLs in
+ * their uapi format is returned.
+ */
+static int cifs_to_posix_acl(struct posix_acl **acl, char *src,
+ const int acl_type, const int size_of_data_area)
{
int size = 0;
- int i;
__u16 count;
struct cifs_posix_ace *pACE;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
- struct posix_acl_xattr_header *local_acl = (void *)trgt;
+ struct posix_acl *kacl = NULL;
+ struct posix_acl_entry *pa, *pe;
if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
return -EOPNOTSUPP;
@@ -2959,7 +2984,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
count = le16_to_cpu(cifs_acl->access_entry_count);
size = sizeof(struct cifs_posix_acl);
size += sizeof(struct cifs_posix_ace) * count;
-/* skip past access ACEs to get to default ACEs */
+ /* skip past access ACEs to get to default ACEs */
pACE = &cifs_acl->ace_array[count];
count = le16_to_cpu(cifs_acl->default_entry_count);
size += sizeof(struct cifs_posix_ace) * count;
@@ -2971,62 +2996,75 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
return -EINVAL;
}
- size = posix_acl_xattr_size(count);
- if ((buflen == 0) || (local_acl == NULL)) {
- /* used to query ACL EA size */
- } else if (size > buflen) {
- return -ERANGE;
- } else /* buffer big enough */ {
- struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
-
- local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- for (i = 0; i < count ; i++) {
- cifs_convert_ace(&ace[i], pACE);
- pACE++;
- }
+ /* Allocate number of POSIX ACLs to store in VFS format. */
+ kacl = posix_acl_alloc(count, GFP_NOFS);
+ if (!kacl)
+ return -ENOMEM;
+
+ FOREACH_ACL_ENTRY(pa, kacl, pe) {
+ cifs_init_posix_acl(pa, pACE);
+ pACE++;
}
- return size;
+
+ *acl = kacl;
+ return 0;
}
-static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
- const struct posix_acl_xattr_entry *local_ace)
+/**
+ * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format
+ * @cifs_ace: the cifs ACL entry to store into
+ * @local_ace: the POSIX ACL entry to convert
+ */
+static void cifs_init_ace(struct cifs_posix_ace *cifs_ace,
+ const struct posix_acl_entry *local_ace)
{
- cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
- cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
- /* BB is there a better way to handle the large uid? */
- if (local_ace->e_id == cpu_to_le32(-1)) {
- /* Probably no need to le convert -1 on any arch but can not hurt */
+ cifs_ace->cifs_e_perm = local_ace->e_perm;
+ cifs_ace->cifs_e_tag = local_ace->e_tag;
+
+ switch (local_ace->e_tag) {
+ case ACL_USER:
+ cifs_ace->cifs_uid =
+ cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid));
+ break;
+ case ACL_GROUP:
+ cifs_ace->cifs_uid =
+ cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid));
+ break;
+ default:
cifs_ace->cifs_uid = cpu_to_le64(-1);
- } else
- cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-/*
- cifs_dbg(FYI, "perm %d tag %d id %d\n",
- ace->e_perm, ace->e_tag, ace->e_id);
-*/
+ }
}
-/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
-static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
- const int buflen, const int acl_type)
+/**
+ * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format
+ * @parm_data: ACLs in cifs format to conver to
+ * @acl: ACLs in POSIX ACL format to convert from
+ * @acl_type: the type of POSIX ACLs stored in @acl
+ *
+ * Return: the number cifs ACL entries after conversion
+ */
+static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl,
+ const int acl_type)
{
__u16 rc = 0;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
- struct posix_acl_xattr_header *local_acl = (void *)pACL;
- struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
+ const struct posix_acl_entry *pa, *pe;
int count;
- int i;
+ int i = 0;
- if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL))
+ if ((acl == NULL) || (cifs_acl == NULL))
return 0;
- count = posix_acl_xattr_count((size_t)buflen);
- cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
- count, buflen, le32_to_cpu(local_acl->a_version));
- if (le32_to_cpu(local_acl->a_version) != 2) {
- cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
- le32_to_cpu(local_acl->a_version));
- return 0;
- }
+ count = acl->a_count;
+ cifs_dbg(FYI, "setting acl with %d entries\n", count);
+
+ /*
+ * Note that the uapi POSIX ACL version is verified by the VFS and is
+ * independent of the cifs ACL version. Changing the POSIX ACL version
+ * is a uapi change and if it's changed we will pass down the POSIX ACL
+ * version in struct posix_acl from the VFS. For now there's really
+ * only one that all filesystems know how to deal with.
+ */
cifs_acl->version = cpu_to_le16(1);
if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
@@ -3038,8 +3076,9 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
- for (i = 0; i < count; i++)
- convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
+ FOREACH_ACL_ENTRY(pa, acl, pe) {
+ cifs_init_ace(&cifs_acl->ace_array[i++], pa);
+ }
if (rc == 0) {
rc = (__u16)(count * sizeof(struct cifs_posix_ace));
rc += sizeof(struct cifs_posix_acl);
@@ -3048,11 +3087,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return rc;
}
-int
-CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName,
- char *acl_inf, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap)
+int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, struct posix_acl **acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
{
/* SMB_QUERY_POSIX_ACL */
TRANSACTION2_QPI_REQ *pSMB = NULL;
@@ -3124,23 +3162,26 @@ queryAclRetry:
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
- rc = cifs_copy_posix_acl(acl_inf,
+ rc = cifs_to_posix_acl(acl,
(char *)&pSMBr->hdr.Protocol+data_offset,
- buflen, acl_type, count);
+ acl_type, count);
}
}
cifs_buf_release(pSMB);
+ /*
+ * The else branch after SendReceive() doesn't return EAGAIN so if we
+ * allocated @acl in cifs_to_posix_acl() we are guaranteed to return
+ * here and don't leak POSIX ACLs.
+ */
if (rc == -EAGAIN)
goto queryAclRetry;
return rc;
}
-int
-CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *fileName,
- const char *local_acl, const int buflen,
- const int acl_type,
- const struct nls_table *nls_codepage, int remap)
+int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName, const struct posix_acl *acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
{
struct smb_com_transaction2_spi_req *pSMB = NULL;
struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -3181,7 +3222,7 @@ setAclRetry:
pSMB->ParameterOffset = cpu_to_le16(param_offset);
/* convert to on the wire format for POSIX ACL */
- data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type);
+ data_count = posix_acl_to_cifs(parm_data, acl, acl_type);
if (data_count == 0) {
rc = -EOPNOTSUPP;
@@ -3211,6 +3252,23 @@ setACLerrorExit:
goto setAclRetry;
return rc;
}
+#else
+int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, struct posix_acl **acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
+{
+ return -EOPNOTSUPP;
+}
+
+int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName, const struct posix_acl *acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_FS_POSIX_ACL */
int
CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9db9527c61cf..e80252a83225 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -759,7 +759,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
{
struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
- iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -774,7 +774,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
* and cifs_readv_from_socket sets msg_control and msg_controllen
* so little to initialize in struct msghdr
*/
- iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
+ iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -786,7 +786,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
- iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
+ iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cd9698209930..87b56b1ae117 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1413,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
struct inode *inode = d_inode(cfile->dentry);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
unsigned int count = 0, i;
int rc = 0, xid, type;
struct list_head locks_to_send, *el;
@@ -3532,7 +3532,7 @@ static ssize_t __cifs_writev(
ctx->iter = *from;
ctx->len = len;
} else {
- rc = setup_aio_ctx_iter(ctx, from, WRITE);
+ rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE);
if (rc) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
@@ -4276,7 +4276,7 @@ static ssize_t __cifs_readv(
ctx->iter = *to;
ctx->len = len;
} else {
- rc = setup_aio_ctx_iter(ctx, to, READ);
+ rc = setup_aio_ctx_iter(ctx, to, ITER_DEST);
if (rc) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a1751b956318..f6f3a6b75601 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_read_operation(&cres, cookie);
if (ret < 0)
@@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index bfaafd02fb1f..32b3877b538a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -4723,13 +4723,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- iov_iter_bvec(&iter, WRITE, bvec, npages, data_len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len);
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
iov.iov_base = buf + data_offset;
iov.iov_len = data_len;
- iov_iter_kvec(&iter, WRITE, &iov, 1, data_len);
+ iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len);
} else {
/* read response payload cannot be in both buf and pages */
WARN_ONCE(1, "buf can not contain only a part of read data");
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 575fa8f58342..3851d0aaa288 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
.iov_base = &rfc1002_marker,
.iov_len = 4
};
- iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
goto unmask;
@@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
size += iov[i].iov_len;
}
- iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
@@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rqst_page_get_length(&rqst[j], i, &bvec.bv_len,
&bvec.bv_offset);
- iov_iter_bvec(&smb_msg.msg_iter, WRITE,
+ iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE,
&bvec, 1, bvec.bv_len);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 998fa51f9b68..5f2fb2fd2e37 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
}
break;
}
-
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case XATTR_ACL_ACCESS:
-#ifdef CONFIG_CIFS_POSIX
- if (!value)
- goto out;
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- value, (const int)size,
- ACL_TYPE_ACCESS, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-
- case XATTR_ACL_DEFAULT:
-#ifdef CONFIG_CIFS_POSIX
- if (!value)
- goto out;
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- value, (const int)size,
- ACL_TYPE_DEFAULT, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
out:
@@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
}
break;
}
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case XATTR_ACL_ACCESS:
-#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- value, size, ACL_TYPE_ACCESS,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-
- case XATTR_ACL_DEFAULT:
-#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- value, size, ACL_TYPE_DEFAULT,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
/* We could add an additional check for streams ie
@@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
.set = cifs_xattr_set,
};
-
-static const struct xattr_handler cifs_posix_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = XATTR_ACL_ACCESS,
- .get = cifs_xattr_get,
- .set = cifs_xattr_set,
-};
-
-static const struct xattr_handler cifs_posix_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = XATTR_ACL_DEFAULT,
- .get = cifs_xattr_get,
- .set = cifs_xattr_set,
-};
-
const struct xattr_handler *cifs_xattr_handlers[] = {
&cifs_user_xattr_handler,
&cifs_os2_xattr_handler,
@@ -549,7 +487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = {
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
&smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */
- &cifs_posix_acl_access_xattr_handler,
- &cifs_posix_acl_default_xattr_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
NULL
};
diff --git a/fs/coredump.c b/fs/coredump.c
index 7bad7785e8e6..a4c30bb900fe 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
err = cn_printf(cn, "%lu",
rlimit(RLIMIT_CORE));
break;
+ /* CPU the task ran on */
+ case 'C':
+ err = cn_printf(cn, "%d", cprm->cpu);
+ break;
default:
break;
}
@@ -525,7 +529,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
- .regs = signal_pt_regs(),
.limit = rlimit(RLIMIT_CORE),
/*
* We must use the same mm->flags while dumping core to avoid
@@ -534,6 +537,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
*/
.mm_flags = mm->flags,
.vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
};
audit_core_dumps(siginfo->si_signo);
@@ -716,8 +720,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* filesystem.
*/
mnt_userns = file_mnt_user_ns(cprm.file);
- if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
- current_fsuid())) {
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
+ current_fsuid())) {
pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
cn.corename);
goto close_fail;
@@ -853,7 +857,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
if (dump_interrupted())
return 0;
pos = file->f_pos;
- iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index d5f68a0c5d15..316a778cec0f 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -439,13 +439,7 @@ struct fscrypt_master_key_secret {
struct fscrypt_master_key {
/*
- * Back-pointer to the super_block of the filesystem to which this
- * master key has been added. Only valid if ->mk_active_refs > 0.
- */
- struct super_block *mk_sb;
-
- /*
- * Link in ->mk_sb->s_master_keys->key_hashtable.
+ * Link in ->s_master_keys->key_hashtable.
* Only valid if ->mk_active_refs > 0.
*/
struct hlist_node mk_node;
@@ -456,7 +450,7 @@ struct fscrypt_master_key {
/*
* Active and structural reference counts. An active ref guarantees
* that the struct continues to exist, continues to be in the keyring
- * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g.
+ * ->s_master_keys, and that any embedded subkeys (e.g.
* ->mk_direct_keys) that have been prepared continue to exist.
* A structural ref only guarantees that the struct continues to exist.
*
@@ -569,7 +563,8 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
void fscrypt_put_master_key(struct fscrypt_master_key *mk);
-void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk);
+void fscrypt_put_master_key_activeref(struct super_block *sb,
+ struct fscrypt_master_key *mk);
struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 2a24b1f0ae68..78dd2ff306bd 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk)
call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}
-void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
+void fscrypt_put_master_key_activeref(struct super_block *sb,
+ struct fscrypt_master_key *mk)
{
- struct super_block *sb = mk->mk_sb;
- struct fscrypt_keyring *keyring = sb->s_master_keys;
size_t i;
if (!refcount_dec_and_test(&mk->mk_active_refs))
@@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
* destroying any subkeys embedded in it.
*/
- spin_lock(&keyring->lock);
+ spin_lock(&sb->s_master_keys->lock);
hlist_del_rcu(&mk->mk_node);
- spin_unlock(&keyring->lock);
+ spin_unlock(&sb->s_master_keys->lock);
/*
* ->mk_active_refs == 0 implies that ->mk_secret is not present and
@@ -243,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb)
WARN_ON(refcount_read(&mk->mk_struct_refs) != 1);
WARN_ON(!is_master_key_secret_present(&mk->mk_secret));
wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(mk);
+ fscrypt_put_master_key_activeref(sb, mk);
}
}
kfree_sensitive(keyring);
@@ -424,7 +423,6 @@ static int add_new_master_key(struct super_block *sb,
if (!mk)
return -ENOMEM;
- mk->mk_sb = sb;
init_rwsem(&mk->mk_sem);
refcount_set(&mk->mk_struct_refs, 1);
mk->mk_spec = *mk_spec;
@@ -1068,7 +1066,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
err = -ENOKEY;
if (is_master_key_secret_present(&mk->mk_secret)) {
wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(mk);
+ fscrypt_put_master_key_activeref(sb, mk);
err = 0;
}
inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index f7407071a952..94757ccd3056 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = {
.security_strength = 16,
.ivsize = 16,
},
+ [FSCRYPT_MODE_SM4_XTS] = {
+ .friendly_name = "SM4-XTS",
+ .cipher_str = "xts(sm4)",
+ .keysize = 32,
+ .security_strength = 16,
+ .ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
+ },
+ [FSCRYPT_MODE_SM4_CTS] = {
+ .friendly_name = "SM4-CTS-CBC",
+ .cipher_str = "cts(cbc(sm4))",
+ .keysize = 16,
+ .security_strength = 16,
+ .ivsize = 16,
+ },
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
@@ -509,7 +524,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
spin_unlock(&mk->mk_decrypted_inodes_lock);
- fscrypt_put_master_key_activeref(mk);
+ fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
}
memzero_explicit(ci, sizeof(*ci));
kmem_cache_free(fscrypt_info_cachep, ci);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 46757c3052ef..893661b52376 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb)
return sb->s_cop->get_dummy_policy(sb);
}
+/*
+ * Return %true if the given combination of encryption modes is supported for v1
+ * (and later) encryption policies.
+ *
+ * Do *not* add anything new here, since v1 encryption policies are deprecated.
+ * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only.
+ */
static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -83,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
return true;
+
+ if (contents_mode == FSCRYPT_MODE_SM4_XTS &&
+ filenames_mode == FSCRYPT_MODE_SM4_CTS)
+ return true;
+
return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index ddb3fc258df9..b54f470e0d03 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf,
}
EXPORT_SYMBOL_GPL(debugfs_attr_read);
-ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct dentry *dentry = F_DENTRY(file);
ssize_t ret;
@@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
ret = debugfs_file_get(dentry);
if (unlikely(ret))
return ret;
- ret = simple_attr_write(file, buf, len, ppos);
+ if (is_signed)
+ ret = simple_attr_write_signed(file, buf, len, ppos);
+ else
+ ret = simple_attr_write(file, buf, len, ppos);
debugfs_file_put(dentry);
return ret;
}
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(debugfs_attr_write);
+ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write_signed);
+
static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
struct dentry *parent, void *value,
const struct file_operations *fops,
@@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
"%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
"%lld\n");
/**
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index d60a8d8f109d..26fef9945cc9 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -12,55 +12,67 @@
#include <trace/events/dlm.h>
#include "dlm_internal.h"
+#include "memory.h"
#include "lock.h"
#include "user.h"
#include "ast.h"
-static uint64_t dlm_cb_seq;
-static DEFINE_SPINLOCK(dlm_cb_seq_spin);
+void dlm_release_callback(struct kref *ref)
+{
+ struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref);
+
+ dlm_free_cb(cb);
+}
+
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to)
+{
+ if (*from)
+ kref_put(&(*from)->ref, dlm_release_callback);
+
+ if (to)
+ kref_get(&to->ref);
+
+ *from = to;
+}
-static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb)
{
- int i;
-
- log_print("last_bast %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_last_bast.seq,
- lkb->lkb_last_bast.flags,
- lkb->lkb_last_bast.mode,
- lkb->lkb_last_bast.sb_status,
- lkb->lkb_last_bast.sb_flags);
-
- log_print("last_cast %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_last_cast.seq,
- lkb->lkb_last_cast.flags,
- lkb->lkb_last_cast.mode,
- lkb->lkb_last_cast.sb_status,
- lkb->lkb_last_cast.sb_flags);
-
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- log_print("cb %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_callbacks[i].seq,
- lkb->lkb_callbacks[i].flags,
- lkb->lkb_callbacks[i].mode,
- lkb->lkb_callbacks[i].sb_status,
- lkb->lkb_callbacks[i].sb_flags);
+ struct dlm_callback *cb, *safe;
+
+ list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) {
+ list_del(&cb->list);
+ kref_put(&cb->ref, dlm_release_callback);
}
+
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+
+ /* invalidate */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+ lkb->lkb_last_bast_mode = -1;
}
-int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq)
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- uint64_t prev_seq;
+ int rv = DLM_ENQUEUE_CALLBACK_SUCCESS;
+ struct dlm_callback *cb;
int prev_mode;
- int i, rv;
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- if (lkb->lkb_callbacks[i].seq)
- continue;
+ if (flags & DLM_CB_BAST) {
+ /* if cb is a bast, it should be skipped if the blocking mode is
+ * compatible with the last granted mode
+ */
+ if (lkb->lkb_last_cast) {
+ if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) {
+ log_debug(ls, "skip %x bast mode %d for cast mode %d",
+ lkb->lkb_id, mode,
+ lkb->lkb_last_cast->mode);
+ goto out;
+ }
+ }
/*
* Suppress some redundant basts here, do more on removal.
@@ -68,148 +80,95 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
* is a bast for the same mode or a more restrictive mode.
* (the addional > PR check is needed for PR/CW inversion)
*/
-
- if ((i > 0) && (flags & DLM_CB_BAST) &&
- (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
-
- prev_seq = lkb->lkb_callbacks[i-1].seq;
- prev_mode = lkb->lkb_callbacks[i-1].mode;
+ if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) {
+ prev_mode = lkb->lkb_last_cb->mode;
if ((prev_mode == mode) ||
(prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
-
- log_debug(ls, "skip %x add bast %llu mode %d "
- "for bast %llu mode %d",
- lkb->lkb_id,
- (unsigned long long)seq,
- mode,
- (unsigned long long)prev_seq,
- prev_mode);
- rv = 0;
+ log_debug(ls, "skip %x add bast mode %d for bast mode %d",
+ lkb->lkb_id, mode, prev_mode);
goto out;
}
}
-
- lkb->lkb_callbacks[i].seq = seq;
- lkb->lkb_callbacks[i].flags = flags;
- lkb->lkb_callbacks[i].mode = mode;
- lkb->lkb_callbacks[i].sb_status = status;
- lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
- rv = 0;
- break;
}
- if (i == DLM_CALLBACKS_SIZE) {
- log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id, (unsigned long long)seq,
- flags, mode, status, sbflags);
- dlm_dump_lkb_callbacks(lkb);
- rv = -1;
+ cb = dlm_allocate_cb();
+ if (!cb) {
+ rv = DLM_ENQUEUE_CALLBACK_FAILURE;
goto out;
}
- out:
- return rv;
-}
-
-int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_callback *cb, int *resid)
-{
- int i, rv;
-
- *resid = 0;
-
- if (!lkb->lkb_callbacks[0].seq) {
- rv = -ENOENT;
- goto out;
- }
-
- /* oldest undelivered cb is callbacks[0] */
-
- memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
- memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
- /* shift others down */
-
- for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
- if (!lkb->lkb_callbacks[i].seq)
- break;
- memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
- sizeof(struct dlm_callback));
- memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
- (*resid)++;
+ cb->flags = flags;
+ cb->mode = mode;
+ cb->sb_status = status;
+ cb->sb_flags = (sbflags & 0x000000FF);
+ kref_init(&cb->ref);
+ if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) {
+ lkb->lkb_flags |= DLM_IFL_CB_PENDING;
+ rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;
}
+ list_add_tail(&cb->list, &lkb->lkb_callbacks);
- /* if cb is a bast, it should be skipped if the blocking mode is
- compatible with the last granted mode */
-
- if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
- if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
- cb->flags |= DLM_CB_SKIP;
-
- log_debug(ls, "skip %x bast %llu mode %d "
- "for cast %llu mode %d",
- lkb->lkb_id,
- (unsigned long long)cb->seq,
- cb->mode,
- (unsigned long long)lkb->lkb_last_cast.seq,
- lkb->lkb_last_cast.mode);
- rv = 0;
- goto out;
- }
- }
+ if (flags & DLM_CB_CAST)
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb);
- if (cb->flags & DLM_CB_CAST) {
- memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
- lkb->lkb_last_cast_time = ktime_get();
- }
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb);
- if (cb->flags & DLM_CB_BAST) {
- memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
- lkb->lkb_last_bast_time = ktime_get();
- }
- rv = 0;
out:
return rv;
}
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb)
+{
+ /* oldest undelivered cb is callbacks first entry */
+ *cb = list_first_entry_or_null(&lkb->lkb_callbacks,
+ struct dlm_callback, list);
+ if (!*cb)
+ return DLM_DEQUEUE_CALLBACK_EMPTY;
+
+ /* remove it from callbacks so shift others down */
+ list_del(&(*cb)->list);
+ if (list_empty(&lkb->lkb_callbacks))
+ return DLM_DEQUEUE_CALLBACK_LAST;
+
+ return DLM_DEQUEUE_CALLBACK_SUCCESS;
+}
+
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- uint64_t new_seq, prev_seq;
int rv;
- spin_lock(&dlm_cb_seq_spin);
- new_seq = ++dlm_cb_seq;
- if (!dlm_cb_seq)
- new_seq = ++dlm_cb_seq;
- spin_unlock(&dlm_cb_seq_spin);
-
if (lkb->lkb_flags & DLM_IFL_USER) {
- dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq);
+ dlm_user_add_ast(lkb, flags, mode, status, sbflags);
return;
}
- mutex_lock(&lkb->lkb_cb_mutex);
- prev_seq = lkb->lkb_callbacks[0].seq;
-
- rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq);
- if (rv < 0)
- goto out;
-
- if (!prev_seq) {
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
kref_get(&lkb->lkb_ref);
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
} else {
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
}
- mutex_unlock(&ls->ls_cb_mutex);
+ spin_unlock(&ls->ls_cb_lock);
+ break;
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
+ WARN_ON_ONCE(1);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- out:
- mutex_unlock(&lkb->lkb_cb_mutex);
+ spin_unlock(&lkb->lkb_cb_lock);
}
void dlm_callback_work(struct work_struct *work)
@@ -218,53 +177,46 @@ void dlm_callback_work(struct work_struct *work)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
void (*castfn) (void *astparam);
void (*bastfn) (void *astparam, int mode);
- struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
- int i, rv, resid;
-
- memset(&callbacks, 0, sizeof(callbacks));
+ struct dlm_callback *cb;
+ int rv;
- mutex_lock(&lkb->lkb_cb_mutex);
- if (!lkb->lkb_callbacks[0].seq) {
- /* no callback work exists, shouldn't happen */
- log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id);
- dlm_print_lkb(lkb);
- dlm_dump_lkb_callbacks(lkb);
- }
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ spin_unlock(&lkb->lkb_cb_lock);
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
- if (rv < 0)
- break;
- }
+ if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY))
+ goto out;
- if (resid) {
- /* cbs remain, loop should have removed all, shouldn't happen */
- log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id,
- resid);
- dlm_print_lkb(lkb);
- dlm_dump_lkb_callbacks(lkb);
- }
- mutex_unlock(&lkb->lkb_cb_mutex);
+ for (;;) {
+ castfn = lkb->lkb_astfn;
+ bastfn = lkb->lkb_bastfn;
+
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(ls, lkb, cb->mode);
+ lkb->lkb_last_bast_time = ktime_get();
+ lkb->lkb_last_bast_mode = cb->mode;
+ bastfn(lkb->lkb_astparam, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
+ trace_dlm_ast(ls, lkb);
+ lkb->lkb_last_cast_time = ktime_get();
+ castfn(lkb->lkb_astparam);
+ }
- castfn = lkb->lkb_astfn;
- bastfn = lkb->lkb_bastfn;
+ kref_put(&cb->ref, dlm_release_callback);
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- if (!callbacks[i].seq)
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) {
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+ spin_unlock(&lkb->lkb_cb_lock);
break;
- if (callbacks[i].flags & DLM_CB_SKIP) {
- continue;
- } else if (callbacks[i].flags & DLM_CB_BAST) {
- trace_dlm_bast(ls, lkb, callbacks[i].mode);
- bastfn(lkb->lkb_astparam, callbacks[i].mode);
- } else if (callbacks[i].flags & DLM_CB_CAST) {
- lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
- lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
- trace_dlm_ast(ls, lkb);
- castfn(lkb->lkb_astparam);
}
+ spin_unlock(&lkb->lkb_cb_lock);
}
+out:
/* undo kref_get from dlm_add_callback, may cause lkb to be freed */
dlm_put_lkb(lkb);
}
@@ -289,9 +241,9 @@ void dlm_callback_stop(struct dlm_ls *ls)
void dlm_callback_suspend(struct dlm_ls *ls)
{
if (ls->ls_callback_wq) {
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
set_bit(LSFL_CB_DELAY, &ls->ls_flags);
- mutex_unlock(&ls->ls_cb_mutex);
+ spin_unlock(&ls->ls_cb_lock);
flush_workqueue(ls->ls_callback_wq);
}
@@ -308,10 +260,8 @@ void dlm_callback_resume(struct dlm_ls *ls)
if (!ls->ls_callback_wq)
return;
- clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
-
more:
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
list_del_init(&lkb->lkb_cb_list);
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
@@ -320,7 +270,9 @@ more:
break;
}
empty = list_empty(&ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
+ if (empty)
+ clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ spin_unlock(&ls->ls_cb_lock);
sum += count;
if (!empty) {
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index e5e05fcc5813..880b11882495 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -11,13 +11,22 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq);
-int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_callback *cb, int *resid);
+#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1
+#define DLM_ENQUEUE_CALLBACK_SUCCESS 0
+#define DLM_ENQUEUE_CALLBACK_FAILURE -1
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags);
+#define DLM_DEQUEUE_CALLBACK_EMPTY 2
+#define DLM_DEQUEUE_CALLBACK_LAST 1
+#define DLM_DEQUEUE_CALLBACK_SUCCESS 0
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb);
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags);
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to);
+void dlm_release_callback(struct kref *ref);
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb);
void dlm_callback_work(struct work_struct *work);
int dlm_callback_start(struct dlm_ls *ls);
void dlm_callback_stop(struct dlm_ls *ls);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index ac8b62106ce0..20b60709eccf 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
return -EINVAL;
}
- if (dlm_allow_conn)
+ if (dlm_lowcomms_is_running())
return -EBUSY;
return 0;
@@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x)
if (!x)
return -EINVAL;
- if (dlm_allow_conn)
+ if (dlm_lowcomms_is_running())
return -EBUSY;
return 0;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fb04ebbafb5..8a0e1b1f74ad 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -246,7 +246,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
lkb->lkb_status,
lkb->lkb_grmode,
lkb->lkb_rqmode,
- lkb->lkb_last_bast.mode,
+ lkb->lkb_last_bast_mode,
rsb_lookup,
lkb->lkb_wait_type,
lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e34c3d2639a5..ab1a55337a6e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -211,6 +211,7 @@ struct dlm_args {
#endif
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
+#define DLM_IFL_CB_PENDING 0x04000000
/* least significant 2 bytes are message changed, they are full transmitted
* but at receive side only the 2 bytes LSB will be set.
*
@@ -222,18 +223,17 @@ struct dlm_args {
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
-#define DLM_CALLBACKS_SIZE 6
-
#define DLM_CB_CAST 0x00000001
#define DLM_CB_BAST 0x00000002
-#define DLM_CB_SKIP 0x00000004
struct dlm_callback {
- uint64_t seq;
uint32_t flags; /* DLM_CBF_ */
int sb_status; /* copy to lksb status */
uint8_t sb_flags; /* copy to lksb flags */
int8_t mode; /* rq mode of bast, gr mode of cast */
+
+ struct list_head list;
+ struct kref ref;
};
struct dlm_lkb {
@@ -268,12 +268,13 @@ struct dlm_lkb {
unsigned long lkb_timeout_cs;
#endif
- struct mutex lkb_cb_mutex;
+ spinlock_t lkb_cb_lock;
struct work_struct lkb_cb_work;
struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */
- struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
- struct dlm_callback lkb_last_cast;
- struct dlm_callback lkb_last_bast;
+ struct list_head lkb_callbacks;
+ struct dlm_callback *lkb_last_cast;
+ struct dlm_callback *lkb_last_cb;
+ int lkb_last_bast_mode;
ktime_t lkb_last_cast_time; /* for debugging */
ktime_t lkb_last_bast_time; /* for debugging */
@@ -591,11 +592,7 @@ struct dlm_ls {
int ls_new_rsb_count;
struct list_head ls_new_rsb; /* new rsb structs */
- spinlock_t ls_remove_spin;
- wait_queue_head_t ls_remove_wait;
- char ls_remove_name[DLM_RESNAME_MAXLEN+1];
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
- int ls_remove_len;
int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
struct list_head ls_nodes; /* current nodes in ls */
@@ -631,7 +628,7 @@ struct dlm_ls {
/* recovery related */
- struct mutex ls_cb_mutex;
+ spinlock_t ls_cb_lock;
struct list_head ls_cb_delay; /* save for queue_work later */
struct timer_list ls_timer;
struct task_struct *ls_recoverd_task;
@@ -670,7 +667,7 @@ struct dlm_ls {
void *ls_ops_arg;
int ls_namelen;
- char ls_name[1];
+ char ls_name[DLM_LOCKSPACE_LEN + 1];
};
/*
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 94a72ede5764..e1adfa5aed05 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1209,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
if (!lkb)
return -ENOMEM;
+ lkb->lkb_last_bast_mode = -1;
lkb->lkb_nodeid = -1;
lkb->lkb_grmode = DLM_LOCK_IV;
kref_init(&lkb->lkb_ref);
@@ -1218,7 +1219,8 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
INIT_LIST_HEAD(&lkb->lkb_time_list);
#endif
INIT_LIST_HEAD(&lkb->lkb_cb_list);
- mutex_init(&lkb->lkb_cb_mutex);
+ INIT_LIST_HEAD(&lkb->lkb_callbacks);
+ spin_lock_init(&lkb->lkb_cb_lock);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
idr_preload(GFP_NOFS);
@@ -1587,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
return error;
}
-/* If there's an rsb for the same resource being removed, ensure
- * that the remove message is sent before the new lookup message.
- */
-
-#define DLM_WAIT_PENDING_COND(ls, r) \
- (ls->ls_remove_len && \
- !rsb_cmp(r, ls->ls_remove_name, \
- ls->ls_remove_len))
-
-static void wait_pending_remove(struct dlm_rsb *r)
-{
- struct dlm_ls *ls = r->res_ls;
- restart:
- spin_lock(&ls->ls_remove_spin);
- if (DLM_WAIT_PENDING_COND(ls, r)) {
- log_debug(ls, "delay lookup for remove dir %d %s",
- r->res_dir_nodeid, r->res_name);
- spin_unlock(&ls->ls_remove_spin);
- wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
- goto restart;
- }
- spin_unlock(&ls->ls_remove_spin);
-}
-
-/*
- * ls_remove_spin protects ls_remove_name and ls_remove_len which are
- * read by other threads in wait_pending_remove. ls_remove_names
- * and ls_remove_lens are only used by the scan thread, so they do
- * not need protection.
- */
-
static void shrink_bucket(struct dlm_ls *ls, int b)
{
struct rb_node *n, *next;
@@ -1699,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
* list and sending the removal. Keeping this gap small is
* important to keep us (the master node) from being out of sync
* with the remote dir node for very long.
- *
- * From the time the rsb is removed from toss until just after
- * send_remove, the rsb name is saved in ls_remove_name. A new
- * lookup checks this to ensure that a new lookup message for the
- * same resource name is not sent just before the remove message.
*/
for (i = 0; i < remote_count; i++) {
@@ -1750,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
}
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
-
- /* block lookup of same name until we've sent remove */
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = len;
- memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- spin_unlock(&ls->ls_rsbtbl[b].lock);
-
send_remove(r);
-
- /* allow lookup of name again */
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = 0;
- memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- wake_up(&ls->ls_remove_wait);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
dlm_free_rsb(r);
}
@@ -2716,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
return 0;
}
- wait_pending_remove(r);
-
r->res_first_lkid = lkb->lkb_id;
send_lookup(r, lkb);
return 1;
@@ -3552,7 +3502,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
static int _create_message(struct dlm_ls *ls, int mb_len,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
- struct dlm_mhandle **mh_ret)
+ struct dlm_mhandle **mh_ret,
+ gfp_t allocation)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
@@ -3562,7 +3513,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
pass into midcomms_commit and a message buffer (mb) that we
write our data into */
- mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb);
if (!mh)
return -ENOBUFS;
@@ -3584,7 +3535,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
- struct dlm_mhandle **mh_ret)
+ struct dlm_mhandle **mh_ret,
+ gfp_t allocation)
{
int mb_len = sizeof(struct dlm_message);
@@ -3605,15 +3557,16 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
}
return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
- ms_ret, mh_ret);
+ ms_ret, mh_ret, allocation);
}
/* further lowcomms enhancements or alternate implementations may make
the return value from this function useful at some point */
-static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms,
+ const void *name, int namelen)
{
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, name, namelen);
return 0;
}
@@ -3673,13 +3626,13 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
if (error)
return error;
- error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
if (error)
goto fail;
send_args(r, lkb, ms);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
if (error)
goto fail;
return 0;
@@ -3734,7 +3687,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3742,7 +3696,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
ms->m_result = 0;
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3755,7 +3709,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3763,7 +3718,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
ms->m_bastmode = cpu_to_le32(mode);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3780,13 +3735,14 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (error)
return error;
- error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh,
+ GFP_NOFS);
if (error)
goto fail;
send_args(r, lkb, ms);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
if (error)
goto fail;
return 0;
@@ -3804,14 +3760,15 @@ static int send_remove(struct dlm_rsb *r)
to_nodeid = dlm_dir_nodeid(r);
- error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh,
+ GFP_ATOMIC);
if (error)
goto out;
memcpy(ms->m_extra, r->res_name, r->res_length);
ms->m_hash = cpu_to_le32(r->res_hash);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3825,7 +3782,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
if (error)
goto out;
@@ -3833,7 +3790,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
ms->m_result = cpu_to_le32(to_dlm_errno(rv));
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3866,7 +3823,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
struct dlm_mhandle *mh;
int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
- error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+ error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3874,7 +3832,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
ms->m_result = cpu_to_le32(to_dlm_errno(rv));
ms->m_nodeid = cpu_to_le32(ret_nodeid);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in));
out:
return error;
}
@@ -4044,66 +4002,6 @@ out:
return error;
}
-static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
-{
- char name[DLM_RESNAME_MAXLEN + 1];
- struct dlm_message *ms;
- struct dlm_mhandle *mh;
- struct dlm_rsb *r;
- uint32_t hash, b;
- int rv, dir_nodeid;
-
- memset(name, 0, sizeof(name));
- memcpy(name, ms_name, len);
-
- hash = jhash(name, len, 0);
- b = hash & (ls->ls_rsbtbl_size - 1);
-
- dir_nodeid = dlm_hash2nodeid(ls, hash);
-
- log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
-
- spin_lock(&ls->ls_rsbtbl[b].lock);
- rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
- if (!rv) {
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- log_error(ls, "repeat_remove on keep %s", name);
- return;
- }
-
- rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
- if (!rv) {
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- log_error(ls, "repeat_remove on toss %s", name);
- return;
- }
-
- /* use ls->remove_name2 to avoid conflict with shrink? */
-
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = len;
- memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- spin_unlock(&ls->ls_rsbtbl[b].lock);
-
- rv = _create_message(ls, sizeof(struct dlm_message) + len,
- dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
- if (rv)
- goto out;
-
- memcpy(ms->m_extra, name, len);
- ms->m_hash = cpu_to_le32(hash);
-
- send_message(mh, ms);
-
-out:
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = 0;
- memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- wake_up(&ls->ls_remove_wait);
-}
-
static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
@@ -4173,25 +4071,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
ENOTBLK request failures when the lookup reply designating us
as master is delayed. */
- /* We could repeatedly return -EBADR here if our send_remove() is
- delayed in being sent/arriving/being processed on the dir node.
- Another node would repeatedly lookup up the master, and the dir
- node would continue returning our nodeid until our send_remove
- took effect.
-
- We send another remove message in case our previous send_remove
- was lost/ignored/missed somehow. */
-
if (error != -ENOTBLK) {
log_limit(ls, "receive_request %x from %d %d",
le32_to_cpu(ms->m_lkid), from_nodeid, error);
}
- if (namelen && error == -EBADR) {
- send_repeat_remove(ls, ms->m_extra, namelen);
- msleep(1000);
- }
-
setup_stub_lkb(ls, ms);
send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
return error;
@@ -6294,8 +6178,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
}
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
- memset(&lkb->lkb_callbacks, 0,
- sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+ dlm_purge_lkb_callbacks(lkb);
list_del_init(&lkb->lkb_cb_list);
dlm_put_lkb(lkb);
}
@@ -6336,8 +6219,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
spin_lock(&proc->asts_spin);
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
- memset(&lkb->lkb_callbacks, 0,
- sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+ dlm_purge_lkb_callbacks(lkb);
list_del_init(&lkb->lkb_cb_list);
dlm_put_lkb(lkb);
}
@@ -6368,13 +6250,13 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
int error;
error = _create_message(ls, sizeof(struct dlm_message), nodeid,
- DLM_MSG_PURGE, &ms, &mh);
+ DLM_MSG_PURGE, &ms, &mh, GFP_NOFS);
if (error)
return error;
ms->m_nodeid = cpu_to_le32(nodeid);
ms->m_pid = cpu_to_le32(pid);
- return send_message(mh, ms);
+ return send_message(mh, ms, NULL, 0);
}
int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index bae050df7abf..d0b4e2181a5f 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -17,7 +17,6 @@
#include "recoverd.h"
#include "dir.h"
#include "midcomms.h"
-#include "lowcomms.h"
#include "config.h"
#include "memory.h"
#include "lock.h"
@@ -391,7 +390,7 @@ static int threads_start(void)
/* Thread for sending/receiving messages for all lockspace's */
error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm lowcomms %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto scand_fail;
}
@@ -473,7 +472,7 @@ static int new_lockspace(const char *name, const char *cluster,
error = -ENOMEM;
- ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
+ ls = kzalloc(sizeof(*ls), GFP_NOFS);
if (!ls)
goto out;
memcpy(ls->ls_name, name, namelen);
@@ -524,9 +523,6 @@ static int new_lockspace(const char *name, const char *cluster,
spin_lock_init(&ls->ls_rsbtbl[i].lock);
}
- spin_lock_init(&ls->ls_remove_spin);
- init_waitqueue_head(&ls->ls_remove_wait);
-
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
GFP_KERNEL);
@@ -567,7 +563,7 @@ static int new_lockspace(const char *name, const char *cluster,
init_completion(&ls->ls_recovery_done);
ls->ls_recovery_result = -1;
- mutex_init(&ls->ls_cb_mutex);
+ spin_lock_init(&ls->ls_cb_lock);
INIT_LIST_HEAD(&ls->ls_cb_delay);
ls->ls_recoverd_task = NULL;
@@ -726,7 +722,7 @@ static int __dlm_new_lockspace(const char *name, const char *cluster,
if (!ls_count) {
dlm_scand_stop();
dlm_midcomms_shutdown();
- dlm_lowcomms_stop();
+ dlm_midcomms_stop();
}
out:
mutex_unlock(&ls_lock);
@@ -929,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force)
if (!error)
ls_count--;
if (!ls_count)
- dlm_lowcomms_stop();
+ dlm_midcomms_stop();
mutex_unlock(&ls_lock);
return error;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 59f64c596233..8b80ca0cd65f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,41 +63,49 @@
#define NEEDED_RMEM (4*1024*1024)
-/* Number of messages to send before rescheduling */
-#define MAX_SEND_MSG_COUNT 25
-#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
- struct mutex sock_mutex;
+ /* this semaphore is used to allow parallel recv/send in read
+ * lock mode. When we release a sock we need to held the write lock.
+ *
+ * However this is locking code and not nice. When we remove the
+ * othercon handling we can look into other mechanism to synchronize
+ * io handling to call sock_release() at the right time.
+ */
+ struct rw_semaphore sock_lock;
unsigned long flags;
-#define CF_READ_PENDING 1
-#define CF_WRITE_PENDING 2
-#define CF_INIT_PENDING 4
+#define CF_APP_LIMITED 0
+#define CF_RECV_PENDING 1
+#define CF_SEND_PENDING 2
+#define CF_RECV_INTR 3
+#define CF_IO_STOP 4
#define CF_IS_OTHERCON 5
-#define CF_CLOSE 6
-#define CF_APP_LIMITED 7
-#define CF_CLOSING 8
-#define CF_SHUTDOWN 9
-#define CF_CONNECTED 10
-#define CF_RECONNECT 11
-#define CF_DELAY_CONNECT 12
-#define CF_EOF 13
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
- atomic_t writequeue_cnt;
int retries;
-#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
+ /* due some connect()/accept() races we currently have this cross over
+ * connection attempt second connection for one node.
+ *
+ * There is a solution to avoid the race by introducing a connect
+ * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to
+ * connect. Otherside can connect but will only be considered that
+ * the other side wants to have a reconnect.
+ *
+ * However changing to this behaviour will break backwards compatible.
+ * In a DLM protocol major version upgrade we should remove this!
+ */
struct connection *othercon;
- struct connection *sendcon;
- struct work_struct rwork; /* Receive workqueue */
- struct work_struct swork; /* Send workqueue */
- wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
- unsigned char *rx_buf;
- int rx_buflen;
+ struct work_struct rwork; /* receive worker */
+ struct work_struct swork; /* send worker */
+ unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE];
int rx_leftover;
+ int mark;
+ int addr_count;
+ int curr_addr_index;
+ struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT];
+ spinlock_t addrs_lock;
struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -136,13 +144,12 @@ struct dlm_msg {
struct kref ref;
};
-struct dlm_node_addr {
- struct list_head list;
+struct processqueue_entry {
+ unsigned char *buf;
int nodeid;
- int mark;
- int addr_count;
- int curr_addr_index;
- struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+ int buflen;
+
+ struct list_head list;
};
struct dlm_proto_ops {
@@ -157,10 +164,6 @@ struct dlm_proto_ops {
int (*listen_validate)(void);
void (*listen_sockopts)(struct socket *sock);
int (*listen_bind)(struct socket *sock);
- /* What to do to shutdown */
- void (*shutdown_action)(struct connection *con);
- /* What to do to eof check */
- bool (*eof_condition)(struct connection *con);
};
static struct listen_sock_callbacks {
@@ -170,17 +173,13 @@ static struct listen_sock_callbacks {
void (*sk_write_space)(struct sock *);
} listen_sock;
-static LIST_HEAD(dlm_node_addrs);
-static DEFINE_SPINLOCK(dlm_node_addrs_spin);
-
static struct listen_connection listen_con;
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
-int dlm_allow_conn;
/* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
+static struct workqueue_struct *io_workqueue;
+static struct workqueue_struct *process_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
static DEFINE_SPINLOCK(connections_lock);
@@ -188,8 +187,45 @@ DEFINE_STATIC_SRCU(connections_srcu);
static const struct dlm_proto_ops *dlm_proto_ops;
+#define DLM_IO_SUCCESS 0
+#define DLM_IO_END 1
+#define DLM_IO_EOF 2
+#define DLM_IO_RESCHED 3
+
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
+static void process_dlm_messages(struct work_struct *work);
+
+static DECLARE_WORK(process_work, process_dlm_messages);
+static DEFINE_SPINLOCK(processqueue_lock);
+static bool process_dlm_messages_pending;
+static LIST_HEAD(processqueue);
+
+bool dlm_lowcomms_is_running(void)
+{
+ return !!listen_con.sock;
+}
+
+static void lowcomms_queue_swork(struct connection *con)
+{
+ assert_spin_locked(&con->writequeue_lock);
+
+ if (!test_bit(CF_IO_STOP, &con->flags) &&
+ !test_bit(CF_APP_LIMITED, &con->flags) &&
+ !test_and_set_bit(CF_SEND_PENDING, &con->flags))
+ queue_work(io_workqueue, &con->swork);
+}
+
+static void lowcomms_queue_rwork(struct connection *con)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk));
+#endif
+
+ if (!test_bit(CF_IO_STOP, &con->flags) &&
+ !test_and_set_bit(CF_RECV_PENDING, &con->flags))
+ queue_work(io_workqueue, &con->rwork);
+}
static void writequeue_entry_ctor(void *data)
{
@@ -214,15 +250,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con)
{
struct writequeue_entry *e;
- if (list_empty(&con->writequeue))
- return NULL;
-
- e = list_first_entry(&con->writequeue, struct writequeue_entry,
- list);
+ e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry,
+ list);
/* if len is zero nothing is to send, if there are users filling
* buffers we wait until the users are done so we can send more.
*/
- if (e->users || e->len == 0)
+ if (!e || e->users || e->len == 0)
return NULL;
return e;
@@ -240,28 +273,15 @@ static struct connection *__find_con(int nodeid, int r)
return NULL;
}
-static bool tcp_eof_condition(struct connection *con)
-{
- return atomic_read(&con->writequeue_cnt);
-}
-
-static int dlm_con_init(struct connection *con, int nodeid)
+static void dlm_con_init(struct connection *con, int nodeid)
{
- con->rx_buflen = dlm_config.ci_buffer_size;
- con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
- if (!con->rx_buf)
- return -ENOMEM;
-
con->nodeid = nodeid;
- mutex_init(&con->sock_mutex);
+ init_rwsem(&con->sock_lock);
INIT_LIST_HEAD(&con->writequeue);
spin_lock_init(&con->writequeue_lock);
- atomic_set(&con->writequeue_cnt, 0);
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
- init_waitqueue_head(&con->shutdown_wait);
-
- return 0;
+ spin_lock_init(&con->addrs_lock);
}
/*
@@ -271,7 +291,7 @@ static int dlm_con_init(struct connection *con, int nodeid)
static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
struct connection *con, *tmp;
- int r, ret;
+ int r;
r = nodeid_hash(nodeid);
con = __find_con(nodeid, r);
@@ -282,11 +302,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
if (!con)
return NULL;
- ret = dlm_con_init(con, nodeid);
- if (ret) {
- kfree(con);
- return NULL;
- }
+ dlm_con_init(con, nodeid);
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
@@ -298,7 +314,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
tmp = __find_con(nodeid, r);
if (tmp) {
spin_unlock(&connections_lock);
- kfree(con->rx_buf);
kfree(con);
return tmp;
}
@@ -309,29 +324,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return con;
}
-/* Loop round all connections */
-static void foreach_conn(void (*conn_func)(struct connection *c))
-{
- int i;
- struct connection *con;
-
- for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(con, &connection_hash[i], list)
- conn_func(con);
- }
-}
-
-static struct dlm_node_addr *find_node_addr(int nodeid)
-{
- struct dlm_node_addr *na;
-
- list_for_each_entry(na, &dlm_node_addrs, list) {
- if (na->nodeid == nodeid)
- return na;
- }
- return NULL;
-}
-
static int addr_compare(const struct sockaddr_storage *x,
const struct sockaddr_storage *y)
{
@@ -365,40 +357,47 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
unsigned int *mark)
{
struct sockaddr_storage sas;
- struct dlm_node_addr *na;
+ struct connection *con;
+ int idx;
if (!dlm_local_count)
return -1;
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (na && na->addr_count) {
- memcpy(&sas, na->addr[na->curr_addr_index],
- sizeof(struct sockaddr_storage));
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
- if (try_new_addr) {
- na->curr_addr_index++;
- if (na->curr_addr_index == na->addr_count)
- na->curr_addr_index = 0;
- }
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
}
- spin_unlock(&dlm_node_addrs_spin);
- if (!na)
- return -EEXIST;
+ memcpy(&sas, &con->addr[con->curr_addr_index],
+ sizeof(struct sockaddr_storage));
- if (!na->addr_count)
- return -ENOENT;
+ if (try_new_addr) {
+ con->curr_addr_index++;
+ if (con->curr_addr_index == con->addr_count)
+ con->curr_addr_index = 0;
+ }
- *mark = na->mark;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
if (sas_out)
memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
- if (!sa_out)
+ if (!sa_out) {
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
+ }
- if (dlm_local_addr[0]->ss_family == AF_INET) {
+ if (dlm_local_addr[0].ss_family == AF_INET) {
struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
@@ -408,43 +407,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
ret6->sin6_addr = in6->sin6_addr;
}
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid,
unsigned int *mark)
{
- struct dlm_node_addr *na;
- int rv = -EEXIST;
- int addr_i;
-
- spin_lock(&dlm_node_addrs_spin);
- list_for_each_entry(na, &dlm_node_addrs, list) {
- if (!na->addr_count)
- continue;
-
- for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
- if (addr_compare(na->addr[addr_i], addr)) {
- *nodeid = na->nodeid;
- *mark = na->mark;
- rv = 0;
- goto unlock;
+ struct connection *con;
+ int i, idx, addr_i;
+
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ WARN_ON_ONCE(!con->addr_count);
+
+ spin_lock(&con->addrs_lock);
+ for (addr_i = 0; addr_i < con->addr_count; addr_i++) {
+ if (addr_compare(&con->addr[addr_i], addr)) {
+ *nodeid = con->nodeid;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+ }
}
+ spin_unlock(&con->addrs_lock);
}
}
-unlock:
- spin_unlock(&dlm_node_addrs_spin);
- return rv;
+ srcu_read_unlock(&connections_srcu, idx);
+
+ return -ENOENT;
}
-/* caller need to held dlm_node_addrs_spin lock */
-static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
- const struct sockaddr_storage *addr)
+static bool dlm_lowcomms_con_has_addr(const struct connection *con,
+ const struct sockaddr_storage *addr)
{
int i;
- for (i = 0; i < na->addr_count; i++) {
- if (addr_compare(na->addr[i], addr))
+ for (i = 0; i < con->addr_count; i++) {
+ if (addr_compare(&con->addr[i], addr))
return true;
}
@@ -453,118 +455,82 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
{
- struct sockaddr_storage *new_addr;
- struct dlm_node_addr *new_node, *na;
- bool ret;
-
- new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
- if (!new_node)
- return -ENOMEM;
+ struct connection *con;
+ bool ret, idx;
- new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
- if (!new_addr) {
- kfree(new_node);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, GFP_NOFS);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOMEM;
}
- memcpy(new_addr, addr, len);
-
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (!na) {
- new_node->nodeid = nodeid;
- new_node->addr[0] = new_addr;
- new_node->addr_count = 1;
- new_node->mark = dlm_config.ci_mark;
- list_add(&new_node->list, &dlm_node_addrs);
- spin_unlock(&dlm_node_addrs_spin);
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ memcpy(&con->addr[0], addr, sizeof(*addr));
+ con->addr_count = 1;
+ con->mark = dlm_config.ci_mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
- ret = dlm_lowcomms_na_has_addr(na, addr);
+ ret = dlm_lowcomms_con_has_addr(con, addr);
if (ret) {
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_addr);
- kfree(new_node);
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return -EEXIST;
}
- if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_addr);
- kfree(new_node);
+ if (con->addr_count >= DLM_MAX_ADDR_COUNT) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOSPC;
}
- na->addr[na->addr_count++] = new_addr;
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_node);
+ memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr));
+ srcu_read_unlock(&connections_srcu, idx);
+ spin_unlock(&con->addrs_lock);
return 0;
}
/* Data available on socket or listen socket received a connect */
static void lowcomms_data_ready(struct sock *sk)
{
- struct connection *con;
-
- con = sock2con(sk);
- if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
- queue_work(recv_workqueue, &con->rwork);
-}
-
-static void lowcomms_listen_data_ready(struct sock *sk)
-{
- if (!dlm_allow_conn)
- return;
+ struct connection *con = sock2con(sk);
- queue_work(recv_workqueue, &listen_con.rwork);
+ set_bit(CF_RECV_INTR, &con->flags);
+ lowcomms_queue_rwork(con);
}
static void lowcomms_write_space(struct sock *sk)
{
- struct connection *con;
-
- con = sock2con(sk);
- if (!con)
- return;
-
- if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
- log_print("connected to node %d", con->nodeid);
- queue_work(send_workqueue, &con->swork);
- return;
- }
+ struct connection *con = sock2con(sk);
clear_bit(SOCK_NOSPACE, &con->sock->flags);
+ spin_lock_bh(&con->writequeue_lock);
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
con->sock->sk->sk_write_pending--;
clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
}
- queue_work(send_workqueue, &con->swork);
-}
-
-static inline void lowcomms_connect_sock(struct connection *con)
-{
- if (test_bit(CF_CLOSE, &con->flags))
- return;
- queue_work(send_workqueue, &con->swork);
- cond_resched();
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
}
static void lowcomms_state_change(struct sock *sk)
{
/* SCTP layer is not calling sk_data_ready when the connection
- * is done, so we catch the signal through here. Also, it
- * doesn't switch socket state when entering shutdown, so we
- * skip the write in that case.
+ * is done, so we catch the signal through here.
*/
- if (sk->sk_shutdown) {
- if (sk->sk_shutdown == RCV_SHUTDOWN)
- lowcomms_data_ready(sk);
- } else if (sk->sk_state == TCP_ESTABLISHED) {
- lowcomms_write_space(sk);
- }
+ if (sk->sk_shutdown == RCV_SHUTDOWN)
+ lowcomms_data_ready(sk);
+}
+
+static void lowcomms_listen_data_ready(struct sock *sk)
+{
+ queue_work(io_workqueue, &listen_con.rwork);
}
int dlm_lowcomms_connect_node(int nodeid)
@@ -576,47 +542,49 @@ int dlm_lowcomms_connect_node(int nodeid)
return 0;
idx = srcu_read_lock(&connections_srcu);
- con = nodeid2con(nodeid, GFP_NOFS);
- if (!con) {
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
- return -ENOMEM;
+ return -ENOENT;
}
- lowcomms_connect_sock(con);
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+ up_read(&con->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
+ cond_resched();
return 0;
}
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
{
- struct dlm_node_addr *na;
+ struct connection *con;
+ int idx;
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (!na) {
- spin_unlock(&dlm_node_addrs_spin);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOENT;
}
- na->mark = mark;
- spin_unlock(&dlm_node_addrs_spin);
-
+ spin_lock(&con->addrs_lock);
+ con->mark = mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
static void lowcomms_error_report(struct sock *sk)
{
- struct connection *con;
- void (*orig_report)(struct sock *) = NULL;
+ struct connection *con = sock2con(sk);
struct inet_sock *inet;
- con = sock2con(sk);
- if (con == NULL)
- goto out;
-
- orig_report = listen_sock.sk_error_report;
-
inet = inet_sk(sk);
switch (sk->sk_family) {
case AF_INET:
@@ -642,66 +610,25 @@ static void lowcomms_error_report(struct sock *sk)
"invalid socket family %d set, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
sk->sk_family, sk->sk_err, sk->sk_err_soft);
- goto out;
- }
-
- /* below sendcon only handling */
- if (test_bit(CF_IS_OTHERCON, &con->flags))
- con = con->sendcon;
-
- switch (sk->sk_err) {
- case ECONNREFUSED:
- set_bit(CF_DELAY_CONNECT, &con->flags);
- break;
- default:
break;
}
- if (!test_and_set_bit(CF_RECONNECT, &con->flags))
- queue_work(send_workqueue, &con->swork);
+ dlm_midcomms_unack_msg_resend(con->nodeid);
-out:
- if (orig_report)
- orig_report(sk);
+ listen_sock.sk_error_report(sk);
}
-/* Note: sk_callback_lock must be locked before calling this function. */
-static void save_listen_callbacks(struct socket *sock)
+static void restore_callbacks(struct sock *sk)
{
- struct sock *sk = sock->sk;
-
- listen_sock.sk_data_ready = sk->sk_data_ready;
- listen_sock.sk_state_change = sk->sk_state_change;
- listen_sock.sk_write_space = sk->sk_write_space;
- listen_sock.sk_error_report = sk->sk_error_report;
-}
-
-static void restore_callbacks(struct socket *sock)
-{
- struct sock *sk = sock->sk;
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(sk));
+#endif
- lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = listen_sock.sk_data_ready;
sk->sk_state_change = listen_sock.sk_state_change;
sk->sk_write_space = listen_sock.sk_write_space;
sk->sk_error_report = listen_sock.sk_error_report;
- release_sock(sk);
-}
-
-static void add_listen_sock(struct socket *sock, struct listen_connection *con)
-{
- struct sock *sk = sock->sk;
-
- lock_sock(sk);
- save_listen_callbacks(sock);
- con->sock = sock;
-
- sk->sk_user_data = con;
- sk->sk_allocation = GFP_NOFS;
- /* Install a data_ready callback */
- sk->sk_data_ready = lowcomms_listen_data_ready;
- release_sock(sk);
}
/* Make a socket active */
@@ -713,10 +640,10 @@ static void add_sock(struct socket *sock, struct connection *con)
con->sock = sock;
sk->sk_user_data = con;
- /* Install a data_ready callback */
sk->sk_data_ready = lowcomms_data_ready;
sk->sk_write_space = lowcomms_write_space;
- sk->sk_state_change = lowcomms_state_change;
+ if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
+ sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
sk->sk_error_report = lowcomms_error_report;
release_sock(sk);
@@ -727,7 +654,7 @@ static void add_sock(struct socket *sock, struct connection *con)
static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
int *addr_len)
{
- saddr->ss_family = dlm_local_addr[0]->ss_family;
+ saddr->ss_family = dlm_local_addr[0].ss_family;
if (saddr->ss_family == AF_INET) {
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
in4_addr->sin_port = cpu_to_be16(port);
@@ -773,43 +700,67 @@ static void free_entry(struct writequeue_entry *e)
}
list_del(&e->list);
- atomic_dec(&e->con->writequeue_cnt);
kref_put(&e->ref, dlm_page_release);
}
static void dlm_close_sock(struct socket **sock)
{
- if (*sock) {
- restore_callbacks(*sock);
- sock_release(*sock);
- *sock = NULL;
+ lock_sock((*sock)->sk);
+ restore_callbacks((*sock)->sk);
+ release_sock((*sock)->sk);
+
+ sock_release(*sock);
+ *sock = NULL;
+}
+
+static void allow_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ clear_bit(CF_IO_STOP, &con->othercon->flags);
+ clear_bit(CF_IO_STOP, &con->flags);
+}
+
+static void stop_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ stop_connection_io(con->othercon);
+
+ down_write(&con->sock_lock);
+ if (con->sock) {
+ lock_sock(con->sock->sk);
+ restore_callbacks(con->sock->sk);
+
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
}
+ up_write(&con->sock_lock);
+
+ cancel_work_sync(&con->swork);
+ cancel_work_sync(&con->rwork);
}
/* Close a remote connection and tidy up */
-static void close_connection(struct connection *con, bool and_other,
- bool tx, bool rx)
+static void close_connection(struct connection *con, bool and_other)
{
- bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
struct writequeue_entry *e;
- if (tx && !closing && cancel_work_sync(&con->swork)) {
- log_print("canceled swork for node %d", con->nodeid);
- clear_bit(CF_WRITE_PENDING, &con->flags);
- }
- if (rx && !closing && cancel_work_sync(&con->rwork)) {
- log_print("canceled rwork for node %d", con->nodeid);
- clear_bit(CF_READ_PENDING, &con->flags);
+ if (con->othercon && and_other)
+ close_connection(con->othercon, false);
+
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ up_write(&con->sock_lock);
+ return;
}
- mutex_lock(&con->sock_mutex);
dlm_close_sock(&con->sock);
- if (con->othercon && and_other) {
- /* Will only re-enter once. */
- close_connection(con->othercon, false, tx, rx);
- }
-
/* if we send a writequeue entry only a half way, we drop the
* whole entry because reconnection and that we not start of the
* middle of a msg which will confuse the other end.
@@ -821,200 +772,209 @@ static void close_connection(struct connection *con, bool and_other,
* our policy is to start on a clean state when disconnects, we don't
* know what's send/received on transport layer in this case.
*/
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
if (!list_empty(&con->writequeue)) {
e = list_first_entry(&con->writequeue, struct writequeue_entry,
list);
if (e->dirty)
free_entry(e);
}
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
con->rx_leftover = 0;
con->retries = 0;
clear_bit(CF_APP_LIMITED, &con->flags);
- clear_bit(CF_CONNECTED, &con->flags);
- clear_bit(CF_DELAY_CONNECT, &con->flags);
- clear_bit(CF_RECONNECT, &con->flags);
- clear_bit(CF_EOF, &con->flags);
- mutex_unlock(&con->sock_mutex);
- clear_bit(CF_CLOSING, &con->flags);
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ up_write(&con->sock_lock);
}
-static void shutdown_connection(struct connection *con)
+static struct processqueue_entry *new_processqueue_entry(int nodeid,
+ int buflen)
{
- int ret;
-
- flush_work(&con->swork);
+ struct processqueue_entry *pentry;
- mutex_lock(&con->sock_mutex);
- /* nothing to shutdown */
- if (!con->sock) {
- mutex_unlock(&con->sock_mutex);
- return;
- }
+ pentry = kmalloc(sizeof(*pentry), GFP_NOFS);
+ if (!pentry)
+ return NULL;
- set_bit(CF_SHUTDOWN, &con->flags);
- ret = kernel_sock_shutdown(con->sock, SHUT_WR);
- mutex_unlock(&con->sock_mutex);
- if (ret) {
- log_print("Connection %p failed to shutdown: %d will force close",
- con, ret);
- goto force_close;
- } else {
- ret = wait_event_timeout(con->shutdown_wait,
- !test_bit(CF_SHUTDOWN, &con->flags),
- DLM_SHUTDOWN_WAIT_TIMEOUT);
- if (ret == 0) {
- log_print("Connection %p shutdown timed out, will force close",
- con);
- goto force_close;
- }
+ pentry->buf = kmalloc(buflen, GFP_NOFS);
+ if (!pentry->buf) {
+ kfree(pentry);
+ return NULL;
}
- return;
+ pentry->nodeid = nodeid;
+ return pentry;
+}
-force_close:
- clear_bit(CF_SHUTDOWN, &con->flags);
- close_connection(con, false, true, true);
+static void free_processqueue_entry(struct processqueue_entry *pentry)
+{
+ kfree(pentry->buf);
+ kfree(pentry);
}
-static void dlm_tcp_shutdown(struct connection *con)
+struct dlm_processed_nodes {
+ int nodeid;
+
+ struct list_head list;
+};
+
+static void add_processed_node(int nodeid, struct list_head *processed_nodes)
{
- if (con->othercon)
- shutdown_connection(con->othercon);
- shutdown_connection(con);
+ struct dlm_processed_nodes *n;
+
+ list_for_each_entry(n, processed_nodes, list) {
+ /* we already remembered this node */
+ if (n->nodeid == nodeid)
+ return;
+ }
+
+ /* if it's fails in worst case we simple don't send an ack back.
+ * We try it next time.
+ */
+ n = kmalloc(sizeof(*n), GFP_NOFS);
+ if (!n)
+ return;
+
+ n->nodeid = nodeid;
+ list_add(&n->list, processed_nodes);
}
-static int con_realloc_receive_buf(struct connection *con, int newlen)
+static void process_dlm_messages(struct work_struct *work)
{
- unsigned char *newbuf;
+ struct dlm_processed_nodes *n, *n_tmp;
+ struct processqueue_entry *pentry;
+ LIST_HEAD(processed_nodes);
- newbuf = kmalloc(newlen, GFP_NOFS);
- if (!newbuf)
- return -ENOMEM;
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (WARN_ON_ONCE(!pentry)) {
+ spin_unlock(&processqueue_lock);
+ return;
+ }
- /* copy any leftover from last receive */
- if (con->rx_leftover)
- memmove(newbuf, con->rx_buf, con->rx_leftover);
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
- /* swap to new buffer space */
- kfree(con->rx_buf);
- con->rx_buflen = newlen;
- con->rx_buf = newbuf;
+ for (;;) {
+ dlm_process_incoming_buffer(pentry->nodeid, pentry->buf,
+ pentry->buflen);
+ add_processed_node(pentry->nodeid, &processed_nodes);
+ free_processqueue_entry(pentry);
+
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (!pentry) {
+ process_dlm_messages_pending = false;
+ spin_unlock(&processqueue_lock);
+ break;
+ }
- return 0;
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
+ }
+
+ /* send ack back after we processed couple of messages */
+ list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) {
+ list_del(&n->list);
+ dlm_midcomms_receive_done(n->nodeid);
+ kfree(n);
+ }
}
/* Data received from remote end */
-static int receive_from_sock(struct connection *con)
+static int receive_from_sock(struct connection *con, int buflen)
{
+ struct processqueue_entry *pentry;
+ int ret, buflen_real;
struct msghdr msg;
struct kvec iov;
- int ret, buflen;
- mutex_lock(&con->sock_mutex);
+ pentry = new_processqueue_entry(con->nodeid, buflen);
+ if (!pentry)
+ return DLM_IO_RESCHED;
- if (con->sock == NULL) {
- ret = -EAGAIN;
- goto out_close;
- }
-
- /* realloc if we get new buffer size to read out */
- buflen = dlm_config.ci_buffer_size;
- if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
- ret = con_realloc_receive_buf(con, buflen);
- if (ret < 0)
- goto out_resched;
- }
+ memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover);
- for (;;) {
- /* calculate new buffer parameter regarding last receive and
- * possible leftover bytes
- */
- iov.iov_base = con->rx_buf + con->rx_leftover;
- iov.iov_len = con->rx_buflen - con->rx_leftover;
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
- ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
- msg.msg_flags);
- trace_dlm_recv(con->nodeid, ret);
- if (ret == -EAGAIN)
- break;
- else if (ret <= 0)
- goto out_close;
-
- /* new buflen according readed bytes and leftover from last receive */
- buflen = ret + con->rx_leftover;
- ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
- if (ret < 0)
- goto out_close;
-
- /* calculate leftover bytes from process and put it into begin of
- * the receive buffer, so next receive we have the full message
- * at the start address of the receive buffer.
- */
- con->rx_leftover = buflen - ret;
- if (con->rx_leftover) {
- memmove(con->rx_buf, con->rx_buf + ret,
- con->rx_leftover);
+ /* calculate new buffer parameter regarding last receive and
+ * possible leftover bytes
+ */
+ iov.iov_base = pentry->buf + con->rx_leftover;
+ iov.iov_len = buflen - con->rx_leftover;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ clear_bit(CF_RECV_INTR, &con->flags);
+again:
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
+ trace_dlm_recv(con->nodeid, ret);
+ if (ret == -EAGAIN) {
+ lock_sock(con->sock->sk);
+ if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) {
+ release_sock(con->sock->sk);
+ goto again;
}
+
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ release_sock(con->sock->sk);
+ free_processqueue_entry(pentry);
+ return DLM_IO_END;
+ } else if (ret == 0) {
+ /* close will clear CF_RECV_PENDING */
+ free_processqueue_entry(pentry);
+ return DLM_IO_EOF;
+ } else if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
}
- dlm_midcomms_receive_done(con->nodeid);
- mutex_unlock(&con->sock_mutex);
- return 0;
+ /* new buflen according readed bytes and leftover from last receive */
+ buflen_real = ret + con->rx_leftover;
+ ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf,
+ buflen_real);
+ if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
+ }
-out_resched:
- if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
- queue_work(recv_workqueue, &con->rwork);
- mutex_unlock(&con->sock_mutex);
- return -EAGAIN;
-
-out_close:
- if (ret == 0) {
- log_print("connection %p got EOF from %d",
- con, con->nodeid);
-
- if (dlm_proto_ops->eof_condition &&
- dlm_proto_ops->eof_condition(con)) {
- set_bit(CF_EOF, &con->flags);
- mutex_unlock(&con->sock_mutex);
- } else {
- mutex_unlock(&con->sock_mutex);
- close_connection(con, false, true, false);
+ pentry->buflen = ret;
- /* handling for tcp shutdown */
- clear_bit(CF_SHUTDOWN, &con->flags);
- wake_up(&con->shutdown_wait);
- }
+ /* calculate leftover bytes from process and put it into begin of
+ * the receive buffer, so next receive we have the full message
+ * at the start address of the receive buffer.
+ */
+ con->rx_leftover = buflen_real - ret;
+ memmove(con->rx_leftover_buf, pentry->buf + ret,
+ con->rx_leftover);
- /* signal to breaking receive worker */
- ret = -1;
- } else {
- mutex_unlock(&con->sock_mutex);
+ spin_lock(&processqueue_lock);
+ list_add_tail(&pentry->list, &processqueue);
+ if (!process_dlm_messages_pending) {
+ process_dlm_messages_pending = true;
+ queue_work(process_workqueue, &process_work);
}
- return ret;
+ spin_unlock(&processqueue_lock);
+
+ return DLM_IO_SUCCESS;
}
/* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct listen_connection *con)
+static int accept_from_sock(void)
{
- int result;
struct sockaddr_storage peeraddr;
- struct socket *newsock;
- int len, idx;
- int nodeid;
+ int len, idx, result, nodeid;
struct connection *newcon;
- struct connection *addcon;
+ struct socket *newsock;
unsigned int mark;
- if (!con->sock)
- return -ENOTCONN;
-
- result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
- if (result < 0)
+ result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK);
+ if (result == -EAGAIN)
+ return DLM_IO_END;
+ else if (result < 0)
goto accept_err;
/* Get the connected socket's peer */
@@ -1062,16 +1022,16 @@ static int accept_from_sock(struct listen_connection *con)
* In this case we store the incoming one in "othercon"
*/
idx = srcu_read_lock(&connections_srcu);
- newcon = nodeid2con(nodeid, GFP_NOFS);
- if (!newcon) {
+ newcon = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!newcon)) {
srcu_read_unlock(&connections_srcu, idx);
- result = -ENOMEM;
+ result = -ENOENT;
goto accept_err;
}
sock_set_mark(newsock->sk, mark);
- mutex_lock(&newcon->sock_mutex);
+ down_write(&newcon->sock_lock);
if (newcon->sock) {
struct connection *othercon = newcon->othercon;
@@ -1079,63 +1039,50 @@ static int accept_from_sock(struct listen_connection *con)
othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
- mutex_unlock(&newcon->sock_mutex);
+ up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
- result = dlm_con_init(othercon, nodeid);
- if (result < 0) {
- kfree(othercon);
- mutex_unlock(&newcon->sock_mutex);
- srcu_read_unlock(&connections_srcu, idx);
- goto accept_err;
- }
-
- lockdep_set_subclass(&othercon->sock_mutex, 1);
- set_bit(CF_IS_OTHERCON, &othercon->flags);
+ dlm_con_init(othercon, nodeid);
+ lockdep_set_subclass(&othercon->sock_lock, 1);
newcon->othercon = othercon;
- othercon->sendcon = newcon;
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
} else {
/* close other sock con if we have something new */
- close_connection(othercon, false, true, false);
+ close_connection(othercon, false);
}
- mutex_lock(&othercon->sock_mutex);
+ down_write(&othercon->sock_lock);
add_sock(newsock, othercon);
- addcon = othercon;
- mutex_unlock(&othercon->sock_mutex);
+
+ /* check if we receved something while adding */
+ lock_sock(othercon->sock->sk);
+ lowcomms_queue_rwork(othercon);
+ release_sock(othercon->sock->sk);
+ up_write(&othercon->sock_lock);
}
else {
/* accept copies the sk after we've saved the callbacks, so we
don't want to save them a second time or comm errors will
result in calling sk_error_report recursively. */
add_sock(newsock, newcon);
- addcon = newcon;
- }
-
- set_bit(CF_CONNECTED, &addcon->flags);
- mutex_unlock(&newcon->sock_mutex);
-
- /*
- * Add it to the active queue in case we got data
- * between processing the accept adding the socket
- * to the read_sockets list
- */
- if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
- queue_work(recv_workqueue, &addcon->rwork);
+ /* check if we receved something while adding */
+ lock_sock(newcon->sock->sk);
+ lowcomms_queue_rwork(newcon);
+ release_sock(newcon->sock->sk);
+ }
+ up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
- return 0;
+ return DLM_IO_SUCCESS;
accept_err:
if (newsock)
sock_release(newsock);
- if (result != -EAGAIN)
- log_print("error accepting connection from node: %d", result);
return result;
}
@@ -1167,7 +1114,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
int i, addr_len, result = 0;
for (i = 0; i < dlm_local_count; i++) {
- memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+ memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
make_sockaddr(&localaddr, port, &addr_len);
if (!i)
@@ -1187,7 +1134,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
/* Get local addresses */
static void init_local(void)
{
- struct sockaddr_storage sas, *addr;
+ struct sockaddr_storage sas;
int i;
dlm_local_count = 0;
@@ -1195,21 +1142,10 @@ static void init_local(void)
if (dlm_our_addr(&sas, i))
break;
- addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
- if (!addr)
- break;
- dlm_local_addr[dlm_local_count++] = addr;
+ memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas));
}
}
-static void deinit_local(void)
-{
- int i;
-
- for (i = 0; i < dlm_local_count; i++)
- kfree(dlm_local_addr[i]);
-}
-
static struct writequeue_entry *new_writequeue_entry(struct connection *con)
{
struct writequeue_entry *entry;
@@ -1240,7 +1176,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
{
struct writequeue_entry *e;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
if (!list_empty(&con->writequeue)) {
e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
if (DLM_WQ_REMAIN_BYTES(e) >= len) {
@@ -1263,14 +1199,13 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
- atomic_inc(&con->writequeue_cnt);
if (cb)
cb(data);
list_add_tail(&e->list, &con->writequeue);
out:
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
return e;
};
@@ -1319,13 +1254,13 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
len < sizeof(struct dlm_header)) {
BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
log_print("failed to allocate a buffer of size %d", len);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return NULL;
}
idx = srcu_read_lock(&connections_srcu);
- con = nodeid2con(nodeid, allocation);
- if (!con) {
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
return NULL;
}
@@ -1350,7 +1285,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
struct connection *con = e->con;
int users;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
kref_get(&msg->ref);
list_add(&msg->list, &e->msgs);
@@ -1359,13 +1294,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
goto out;
e->len = DLM_WQ_LENGTH_BYTES(e);
- spin_unlock(&con->writequeue_lock);
- queue_work(send_workqueue, &con->swork);
- return;
+ lowcomms_queue_swork(con);
out:
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
return;
}
@@ -1387,7 +1320,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg)
kref_put(&msg->ref, dlm_msg_release);
}
-/* does not held connections_srcu, usage workqueue only */
+/* does not held connections_srcu, usage lowcomms_error_report only */
int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
{
struct dlm_msg *msg_resend;
@@ -1413,90 +1346,79 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
}
/* Send a message */
-static void send_to_sock(struct connection *con)
+static int send_to_sock(struct connection *con)
{
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset, ret;
- int count = 0;
- mutex_lock(&con->sock_mutex);
- if (con->sock == NULL)
- goto out_connect;
-
- spin_lock(&con->writequeue_lock);
- for (;;) {
- e = con_next_wq(con);
- if (!e)
- break;
+ spin_lock_bh(&con->writequeue_lock);
+ e = con_next_wq(con);
+ if (!e) {
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ return DLM_IO_END;
+ }
- len = e->len;
- offset = e->offset;
- BUG_ON(len == 0 && e->users == 0);
- spin_unlock(&con->writequeue_lock);
-
- ret = kernel_sendpage(con->sock, e->page, offset, len,
- msg_flags);
- trace_dlm_send(con->nodeid, ret);
- if (ret == -EAGAIN || ret == 0) {
- if (ret == -EAGAIN &&
- test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
- !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
- /* Notify TCP that we're limited by the
- * application window size.
- */
- set_bit(SOCK_NOSPACE, &con->sock->flags);
- con->sock->sk->sk_write_pending++;
- }
- cond_resched();
- goto out;
- } else if (ret < 0)
- goto out;
+ len = e->len;
+ offset = e->offset;
+ WARN_ON_ONCE(len == 0 && e->users == 0);
+ spin_unlock_bh(&con->writequeue_lock);
- /* Don't starve people filling buffers */
- if (++count >= MAX_SEND_MSG_COUNT) {
- cond_resched();
- count = 0;
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
+ trace_dlm_send(con->nodeid, ret);
+ if (ret == -EAGAIN || ret == 0) {
+ lock_sock(con->sock->sk);
+ spin_lock_bh(&con->writequeue_lock);
+ if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
+ !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+ /* Notify TCP that we're limited by the
+ * application window size.
+ */
+ set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags);
+ con->sock->sk->sk_write_pending++;
+
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+
+ /* wait for write_space() event */
+ return DLM_IO_END;
}
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
- spin_lock(&con->writequeue_lock);
- writequeue_entry_complete(e, ret);
- }
- spin_unlock(&con->writequeue_lock);
-
- /* close if we got EOF */
- if (test_and_clear_bit(CF_EOF, &con->flags)) {
- mutex_unlock(&con->sock_mutex);
- close_connection(con, false, false, true);
-
- /* handling for tcp shutdown */
- clear_bit(CF_SHUTDOWN, &con->flags);
- wake_up(&con->shutdown_wait);
- } else {
- mutex_unlock(&con->sock_mutex);
+ return DLM_IO_RESCHED;
+ } else if (ret < 0) {
+ return ret;
}
- return;
-
-out:
- mutex_unlock(&con->sock_mutex);
- return;
+ spin_lock_bh(&con->writequeue_lock);
+ writequeue_entry_complete(e, ret);
+ spin_unlock_bh(&con->writequeue_lock);
-out_connect:
- mutex_unlock(&con->sock_mutex);
- queue_work(send_workqueue, &con->swork);
- cond_resched();
+ return DLM_IO_SUCCESS;
}
static void clean_one_writequeue(struct connection *con)
{
struct writequeue_entry *e, *safe;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
list_for_each_entry_safe(e, safe, &con->writequeue, list) {
free_entry(e);
}
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
+}
+
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ WARN_ON_ONCE(!list_empty(&con->writequeue));
+ WARN_ON_ONCE(con->sock);
+ kfree(con);
}
/* Called from recovery when it knows that a node has
@@ -1504,286 +1426,311 @@ static void clean_one_writequeue(struct connection *con)
int dlm_lowcomms_close(int nodeid)
{
struct connection *con;
- struct dlm_node_addr *na;
int idx;
log_print("closing connection to node %d", nodeid);
+
idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0);
- if (con) {
- set_bit(CF_CLOSE, &con->flags);
- close_connection(con, true, true, true);
- clean_one_writequeue(con);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ stop_connection_io(con);
+ log_print("io handling for node: %d stopped", nodeid);
+ close_connection(con, true);
+
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ clean_one_writequeue(con);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
if (con->othercon)
- clean_one_writequeue(con->othercon);
+ call_srcu(&connections_srcu, &con->othercon->rcu, connection_release);
}
srcu_read_unlock(&connections_srcu, idx);
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (na) {
- list_del(&na->list);
- while (na->addr_count--)
- kfree(na->addr[na->addr_count]);
- kfree(na);
- }
- spin_unlock(&dlm_node_addrs_spin);
+ /* for debugging we print when we are done to compare with other
+ * messages in between. This function need to be correctly synchronized
+ * with io handling
+ */
+ log_print("closing connection to node %d done", nodeid);
return 0;
}
-/* Receive workqueue function */
+/* Receive worker function */
static void process_recv_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, rwork);
+ int ret, buflen;
+
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ return;
+ }
+
+ buflen = READ_ONCE(dlm_config.ci_buffer_size);
+ do {
+ ret = receive_from_sock(con, buflen);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
- clear_bit(CF_READ_PENDING, &con->flags);
- receive_from_sock(con);
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_EOF:
+ close_connection(con, false);
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ cond_resched();
+ queue_work(io_workqueue, &con->rwork);
+ /* CF_RECV_PENDING not cleared */
+ break;
+ default:
+ if (ret < 0) {
+ if (test_bit(CF_IS_OTHERCON, &con->flags)) {
+ close_connection(con, false);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+
+ /* CF_RECV_PENDING cleared for othercon
+ * we trigger send queue if not already done
+ * and process_send_sockets will handle it
+ */
+ break;
+ }
+
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void process_listen_recv_socket(struct work_struct *work)
{
- accept_from_sock(&listen_con);
+ int ret;
+
+ if (WARN_ON_ONCE(!listen_con.sock))
+ return;
+
+ do {
+ ret = accept_from_sock();
+ } while (ret == DLM_IO_SUCCESS);
+
+ if (ret < 0)
+ log_print("critical error accepting connection: %d", ret);
}
-static void dlm_connect(struct connection *con)
+static int dlm_connect(struct connection *con)
{
struct sockaddr_storage addr;
int result, addr_len;
struct socket *sock;
unsigned int mark;
- /* Some odd races can cause double-connects, ignore them */
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
-
- if (con->sock) {
- log_print("node %d already connected.", con->nodeid);
- return;
- }
-
memset(&addr, 0, sizeof(addr));
result = nodeid_to_addr(con->nodeid, &addr, NULL,
dlm_proto_ops->try_new_addr, &mark);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ return result;
}
/* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0)
- goto socket_err;
+ return result;
sock_set_mark(sock->sk, mark);
dlm_proto_ops->sockopts(sock);
- add_sock(sock, con);
-
result = dlm_proto_ops->bind(sock);
- if (result < 0)
- goto add_sock_err;
+ if (result < 0) {
+ sock_release(sock);
+ return result;
+ }
+
+ add_sock(sock, con);
log_print_ratelimited("connecting to %d", con->nodeid);
make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
addr_len);
- if (result < 0)
- goto add_sock_err;
-
- return;
-
-add_sock_err:
- dlm_close_sock(&con->sock);
+ switch (result) {
+ case -EINPROGRESS:
+ /* not an error */
+ fallthrough;
+ case 0:
+ break;
+ default:
+ if (result < 0)
+ dlm_close_sock(&con->sock);
-socket_err:
- /*
- * Some errors are fatal and this list might need adjusting. For other
- * errors we try again until the max number of retries is reached.
- */
- if (result != -EHOSTUNREACH &&
- result != -ENETUNREACH &&
- result != -ENETDOWN &&
- result != -EINVAL &&
- result != -EPROTONOSUPPORT) {
- log_print("connect %d try %d error %d", con->nodeid,
- con->retries, result);
- msleep(1000);
- lowcomms_connect_sock(con);
+ break;
}
+
+ return result;
}
-/* Send workqueue function */
+/* Send worker function */
static void process_send_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, swork);
+ int ret;
- WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
-
- clear_bit(CF_WRITE_PENDING, &con->flags);
+ WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags));
- if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
- close_connection(con, false, false, true);
- dlm_midcomms_unack_msg_resend(con->nodeid);
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ ret = dlm_connect(con);
+ switch (ret) {
+ case 0:
+ break;
+ case -EINPROGRESS:
+ /* avoid spamming resched on connection
+ * we might can switch to a state_change
+ * event based mechanism if established
+ */
+ msleep(100);
+ break;
+ default:
+ /* CF_SEND_PENDING not cleared */
+ up_write(&con->sock_lock);
+ log_print("connect to node %d try %d error %d",
+ con->nodeid, con->retries++, ret);
+ msleep(1000);
+ /* For now we try forever to reconnect. In
+ * future we should send a event to cluster
+ * manager to fence itself after certain amount
+ * of retries.
+ */
+ queue_work(io_workqueue, &con->swork);
+ return;
+ }
+ }
+ downgrade_write(&con->sock_lock);
}
- if (con->sock == NULL) {
- if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
- msleep(1000);
+ do {
+ ret = send_to_sock(con);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
- mutex_lock(&con->sock_mutex);
- dlm_connect(con);
- mutex_unlock(&con->sock_mutex);
- }
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_SEND_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ /* CF_SEND_PENDING not cleared */
+ cond_resched();
+ queue_work(io_workqueue, &con->swork);
+ break;
+ default:
+ if (ret < 0) {
+ close_connection(con, false);
+
+ /* CF_SEND_PENDING cleared */
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ break;
+ }
- if (!list_empty(&con->writequeue))
- send_to_sock(con);
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void work_stop(void)
{
- if (recv_workqueue) {
- destroy_workqueue(recv_workqueue);
- recv_workqueue = NULL;
+ if (io_workqueue) {
+ destroy_workqueue(io_workqueue);
+ io_workqueue = NULL;
}
- if (send_workqueue) {
- destroy_workqueue(send_workqueue);
- send_workqueue = NULL;
+ if (process_workqueue) {
+ destroy_workqueue(process_workqueue);
+ process_workqueue = NULL;
}
}
static int work_start(void)
{
- recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
- if (!recv_workqueue) {
- log_print("can't start dlm_recv");
+ io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM,
+ 0);
+ if (!io_workqueue) {
+ log_print("can't start dlm_io");
return -ENOMEM;
}
- send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
- if (!send_workqueue) {
- log_print("can't start dlm_send");
- destroy_workqueue(recv_workqueue);
- recv_workqueue = NULL;
+ /* ordered dlm message process queue,
+ * should be converted to a tasklet
+ */
+ process_workqueue = alloc_ordered_workqueue("dlm_process",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM);
+ if (!process_workqueue) {
+ log_print("can't start dlm_process");
+ destroy_workqueue(io_workqueue);
+ io_workqueue = NULL;
return -ENOMEM;
}
return 0;
}
-static void shutdown_conn(struct connection *con)
-{
- if (dlm_proto_ops->shutdown_action)
- dlm_proto_ops->shutdown_action(con);
-}
-
void dlm_lowcomms_shutdown(void)
{
- int idx;
-
- /* Set all the flags to prevent any
- * socket activity.
- */
- dlm_allow_conn = 0;
-
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
+ /* stop lowcomms_listen_data_ready calls */
+ lock_sock(listen_con.sock->sk);
+ listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready;
+ release_sock(listen_con.sock->sk);
+ cancel_work_sync(&listen_con.rwork);
dlm_close_sock(&listen_con.sock);
- idx = srcu_read_lock(&connections_srcu);
- foreach_conn(shutdown_conn);
- srcu_read_unlock(&connections_srcu, idx);
-}
-
-static void _stop_conn(struct connection *con, bool and_other)
-{
- mutex_lock(&con->sock_mutex);
- set_bit(CF_CLOSE, &con->flags);
- set_bit(CF_READ_PENDING, &con->flags);
- set_bit(CF_WRITE_PENDING, &con->flags);
- if (con->sock && con->sock->sk) {
- lock_sock(con->sock->sk);
- con->sock->sk->sk_user_data = NULL;
- release_sock(con->sock->sk);
- }
- if (con->othercon && and_other)
- _stop_conn(con->othercon, false);
- mutex_unlock(&con->sock_mutex);
-}
-
-static void stop_conn(struct connection *con)
-{
- _stop_conn(con, true);
+ flush_workqueue(process_workqueue);
}
-static void connection_release(struct rcu_head *rcu)
+void dlm_lowcomms_shutdown_node(int nodeid, bool force)
{
- struct connection *con = container_of(rcu, struct connection, rcu);
-
- kfree(con->rx_buf);
- kfree(con);
-}
+ struct connection *con;
+ int idx;
-static void free_conn(struct connection *con)
-{
- close_connection(con, true, true, true);
- spin_lock(&connections_lock);
- hlist_del_rcu(&con->list);
- spin_unlock(&connections_lock);
- if (con->othercon) {
- clean_one_writequeue(con->othercon);
- call_srcu(&connections_srcu, &con->othercon->rcu,
- connection_release);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return;
}
- clean_one_writequeue(con);
- call_srcu(&connections_srcu, &con->rcu, connection_release);
-}
-static void work_flush(void)
-{
- int ok;
- int i;
- struct connection *con;
-
- do {
- ok = 1;
- foreach_conn(stop_conn);
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
- for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_rcu(con, &connection_hash[i],
- list) {
- ok &= test_bit(CF_READ_PENDING, &con->flags);
- ok &= test_bit(CF_WRITE_PENDING, &con->flags);
- if (con->othercon) {
- ok &= test_bit(CF_READ_PENDING,
- &con->othercon->flags);
- ok &= test_bit(CF_WRITE_PENDING,
- &con->othercon->flags);
- }
- }
- }
- } while (!ok);
+ flush_work(&con->swork);
+ stop_connection_io(con);
+ WARN_ON_ONCE(!force && !list_empty(&con->writequeue));
+ close_connection(con, true);
+ clean_one_writequeue(con);
+ if (con->othercon)
+ clean_one_writequeue(con->othercon);
+ allow_connection_io(con);
+ srcu_read_unlock(&connections_srcu, idx);
}
void dlm_lowcomms_stop(void)
{
- int idx;
-
- idx = srcu_read_lock(&connections_srcu);
- work_flush();
- foreach_conn(free_conn);
- srcu_read_unlock(&connections_srcu, idx);
work_stop();
- deinit_local();
-
dlm_proto_ops = NULL;
}
@@ -1799,7 +1746,7 @@ static int dlm_listen_for_all(void)
if (result < 0)
return result;
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
log_print("Can't create comms socket: %d", result);
@@ -1813,14 +1760,22 @@ static int dlm_listen_for_all(void)
if (result < 0)
goto out;
- save_listen_callbacks(sock);
- add_listen_sock(sock, &listen_con);
+ lock_sock(sock->sk);
+ listen_sock.sk_data_ready = sock->sk->sk_data_ready;
+ listen_sock.sk_write_space = sock->sk->sk_write_space;
+ listen_sock.sk_error_report = sock->sk->sk_error_report;
+ listen_sock.sk_state_change = sock->sk->sk_state_change;
+
+ listen_con.sock = sock;
+
+ sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_data_ready = lowcomms_listen_data_ready;
+ release_sock(sock->sk);
- INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
result = sock->ops->listen(sock, 5);
if (result < 0) {
dlm_close_sock(&listen_con.sock);
- goto out;
+ return result;
}
return 0;
@@ -1838,7 +1793,7 @@ static int dlm_tcp_bind(struct socket *sock)
/* Bind to our cluster-known address connecting to avoid
* routing problems.
*/
- memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+ memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
@@ -1854,17 +1809,7 @@ static int dlm_tcp_bind(struct socket *sock)
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
struct sockaddr *addr, int addr_len)
{
- int ret;
-
- ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
- switch (ret) {
- case -EINPROGRESS:
- fallthrough;
- case 0:
- return 0;
- }
-
- return ret;
+ return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
}
static int dlm_tcp_listen_validate(void)
@@ -1895,8 +1840,8 @@ static int dlm_tcp_listen_bind(struct socket *sock)
int addr_len;
/* Bind to our port */
- make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0],
+ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
+ return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
addr_len);
}
@@ -1909,8 +1854,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = {
.listen_validate = dlm_tcp_listen_validate,
.listen_sockopts = dlm_tcp_listen_sockopts,
.listen_bind = dlm_tcp_listen_bind,
- .shutdown_action = dlm_tcp_shutdown,
- .eof_condition = tcp_eof_condition,
};
static int dlm_sctp_bind(struct socket *sock)
@@ -1931,13 +1874,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
sock_set_sndtimeo(sock->sk, 5);
ret = sock->ops->connect(sock, addr, addr_len, 0);
sock_set_sndtimeo(sock->sk, 0);
- if (ret < 0)
- return ret;
-
- if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("connected to node %d", con->nodeid);
-
- return 0;
+ return ret;
}
static int dlm_sctp_listen_validate(void)
@@ -1977,11 +1914,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = {
int dlm_lowcomms_start(void)
{
- int error = -EINVAL;
- int i;
-
- for (i = 0; i < CONN_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&connection_hash[i]);
+ int error;
init_local();
if (!dlm_local_count) {
@@ -1990,13 +1923,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
-
error = work_start();
if (error)
- goto fail_local;
-
- dlm_allow_conn = 1;
+ goto fail;
/* Start listening */
switch (dlm_config.ci_protocol) {
@@ -2022,25 +1951,38 @@ int dlm_lowcomms_start(void)
fail_listen:
dlm_proto_ops = NULL;
fail_proto_ops:
- dlm_allow_conn = 0;
- dlm_close_sock(&listen_con.sock);
work_stop();
-fail_local:
- deinit_local();
fail:
return error;
}
+void dlm_lowcomms_init(void)
+{
+ int i;
+
+ for (i = 0; i < CONN_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&connection_hash[i]);
+
+ INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
+}
+
void dlm_lowcomms_exit(void)
{
- struct dlm_node_addr *na, *safe;
+ struct connection *con;
+ int i, idx;
- spin_lock(&dlm_node_addrs_spin);
- list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
- list_del(&na->list);
- while (na->addr_count--)
- kfree(na->addr[na->addr_count]);
- kfree(na);
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ if (con->othercon)
+ call_srcu(&connections_srcu, &con->othercon->rcu,
+ connection_release);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ }
}
- spin_unlock(&dlm_node_addrs_spin);
+ srcu_read_unlock(&connections_srcu, idx);
}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 29369feea991..3e8dca66183b 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -29,12 +29,14 @@ static inline int nodeid_hash(int nodeid)
return nodeid & (CONN_HASH_SIZE-1);
}
-/* switch to check if dlm is running */
-extern int dlm_allow_conn;
+/* check if dlm is running */
+bool dlm_lowcomms_is_running(void);
int dlm_lowcomms_start(void);
void dlm_lowcomms_shutdown(void);
+void dlm_lowcomms_shutdown_node(int nodeid, bool force);
void dlm_lowcomms_stop(void);
+void dlm_lowcomms_init(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 1c5be4b70ac1..a77338be3237 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -17,7 +17,7 @@
#include "user.h"
#include "memory.h"
#include "config.h"
-#include "lowcomms.h"
+#include "midcomms.h"
#define CREATE_TRACE_POINTS
#include <trace/events/dlm.h>
@@ -30,6 +30,8 @@ static int __init init_dlm(void)
if (error)
goto out;
+ dlm_midcomms_init();
+
error = dlm_lockspace_init();
if (error)
goto out_mem;
@@ -66,6 +68,7 @@ static int __init init_dlm(void)
out_lockspace:
dlm_lockspace_exit();
out_mem:
+ dlm_midcomms_exit();
dlm_memory_exit();
out:
return error;
@@ -79,7 +82,7 @@ static void __exit exit_dlm(void)
dlm_config_exit();
dlm_memory_exit();
dlm_lockspace_exit();
- dlm_lowcomms_exit();
+ dlm_midcomms_exit();
dlm_unregister_debugfs();
}
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 2af2ccfe43a9..923c01a8a0aa 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
node = &rv->nodes[i];
if (dlm_is_member(ls, node->nodeid))
continue;
- dlm_add_member(ls, node);
+ error = dlm_add_member(ls, node);
+ if (error)
+ return error;
+
log_rinfo(ls, "add member %d", node->nodeid);
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ce35c3c19aeb..eb7a08641fcf 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -14,12 +14,14 @@
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
+#include "ast.h"
static struct kmem_cache *writequeue_cache;
static struct kmem_cache *mhandle_cache;
static struct kmem_cache *msg_cache;
static struct kmem_cache *lkb_cache;
static struct kmem_cache *rsb_cache;
+static struct kmem_cache *cb_cache;
int __init dlm_memory_init(void)
@@ -46,8 +48,16 @@ int __init dlm_memory_init(void)
if (!rsb_cache)
goto rsb;
+ cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
+ __alignof__(struct dlm_callback), 0,
+ NULL);
+ if (!rsb_cache)
+ goto cb;
+
return 0;
+cb:
+ kmem_cache_destroy(rsb_cache);
rsb:
kmem_cache_destroy(msg_cache);
msg:
@@ -67,6 +77,7 @@ void dlm_memory_exit(void)
kmem_cache_destroy(msg_cache);
kmem_cache_destroy(lkb_cache);
kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(cb_cache);
}
char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -115,12 +126,17 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
kfree(ua);
}
}
+
+ /* drop references if they are set */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+
kmem_cache_free(lkb_cache, lkb);
}
-struct dlm_mhandle *dlm_allocate_mhandle(void)
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation)
{
- return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+ return kmem_cache_alloc(mhandle_cache, allocation);
}
void dlm_free_mhandle(struct dlm_mhandle *mhandle)
@@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg)
{
kmem_cache_free(msg_cache, msg);
}
+
+struct dlm_callback *dlm_allocate_cb(void)
+{
+ return kmem_cache_alloc(cb_cache, GFP_ATOMIC);
+}
+
+void dlm_free_cb(struct dlm_callback *cb)
+{
+ kmem_cache_free(cb_cache, cb);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 7bd3f1a391ca..6b29563d24f7 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,12 +20,14 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
void dlm_free_lkb(struct dlm_lkb *l);
char *dlm_allocate_lvb(struct dlm_ls *ls);
void dlm_free_lvb(char *l);
-struct dlm_mhandle *dlm_allocate_mhandle(void);
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation);
void dlm_free_mhandle(struct dlm_mhandle *mhandle);
struct writequeue_entry *dlm_allocate_writequeue(void);
void dlm_free_writequeue(struct writequeue_entry *writequeue);
struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
void dlm_free_msg(struct dlm_msg *msg);
+struct dlm_callback *dlm_allocate_cb(void);
+void dlm_free_cb(struct dlm_callback *cb);
#endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 6489bc22ad61..fc015a6abe17 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -132,6 +132,7 @@
*/
#define DLM_DEBUG_FENCE_TERMINATION 0
+#include <trace/events/dlm.h>
#include <net/tcp.h>
#include "dlm_internal.h"
@@ -194,7 +195,7 @@ struct midcomms_node {
};
struct dlm_mhandle {
- const struct dlm_header *inner_hd;
+ const union dlm_packet *inner_p;
struct midcomms_node *node;
struct dlm_opts *opts;
struct dlm_msg *msg;
@@ -305,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node)
pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
rcu_read_lock();
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) {
dlm_mhandle_delete(node, mh);
}
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
rcu_read_unlock();
}
@@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node,
m_header->h_cmd = DLM_FIN;
pr_debug("sending fin msg to node %d\n", node->nodeid);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
return 0;
@@ -436,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
}
}
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) {
if (before(mh->seq, seq)) {
dlm_mhandle_delete(node, mh);
@@ -445,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
break;
}
}
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
rcu_read_unlock();
}
@@ -468,12 +469,26 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
}
+static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p)
+{
+ switch (p->header.h_cmd) {
+ case DLM_MSG:
+ trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message);
+ break;
+ case DLM_RCOM:
+ trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom);
+ break;
+ default:
+ break;
+ }
+}
+
static void dlm_midcomms_receive_buffer(union dlm_packet *p,
struct midcomms_node *node,
uint32_t seq)
@@ -525,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
@@ -533,7 +548,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
break;
default:
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer_3_2_trace(seq, p);
dlm_receive_buffer(p, node->nodeid);
set_bit(DLM_NODE_ULP_DELIVERED, &node->flags);
break;
@@ -754,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
goto out;
}
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
dlm_receive_buffer(p, nodeid);
break;
case DLM_OPTS:
@@ -874,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid)
dlm_receive_buffer(p, nodeid);
}
-/*
- * Called from the low-level comms layer to process a buffer of
- * commands.
- */
-
-int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
const unsigned char *ptr = buf;
const struct dlm_header *hd;
@@ -914,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
+ ret += msglen;
+ len -= msglen;
+ ptr += msglen;
+ }
+
+ return ret;
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ */
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+{
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
+ uint16_t msglen;
+ int ret = 0;
+
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ msglen = le16_to_cpu(hd->h_length);
+ if (msglen > len)
+ break;
+
switch (hd->h_version) {
case cpu_to_le32(DLM_VERSION_3_1):
dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
@@ -1030,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data)
atomic_inc(&mh->node->send_queue_cnt);
- spin_lock(&mh->node->send_queue_lock);
+ spin_lock_bh(&mh->node->send_queue_lock);
list_add_tail_rcu(&mh->list, &mh->node->send_queue);
- spin_unlock(&mh->node->send_queue_lock);
+ spin_unlock_bh(&mh->node->send_queue_lock);
mh->seq = mh->node->seq_send++;
}
@@ -1055,7 +1092,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node
dlm_fill_opts_header(opts, len, mh->seq);
*ppc += sizeof(*opts);
- mh->inner_hd = (const struct dlm_header *)*ppc;
+ mh->inner_p = (const union dlm_packet *)*ppc;
return msg;
}
@@ -1079,9 +1116,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
}
/* this is a bug, however we going on and hope it will be resolved */
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
- mh = dlm_allocate_mhandle();
+ mh = dlm_allocate_mhandle(allocation);
if (!mh)
goto err;
@@ -1111,7 +1148,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
break;
default:
dlm_free_mhandle(mh);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
goto err;
}
@@ -1130,11 +1167,32 @@ err:
}
#endif
-static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
+static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh,
+ const void *name, int namelen)
+{
+ switch (mh->inner_p->header.h_cmd) {
+ case DLM_MSG:
+ trace_dlm_send_message(mh->node->nodeid, mh->seq,
+ &mh->inner_p->message,
+ name, namelen);
+ break;
+ case DLM_RCOM:
+ trace_dlm_send_rcom(mh->node->nodeid, mh->seq,
+ &mh->inner_p->rcom);
+ break;
+ default:
+ /* nothing to trace */
+ break;
+ }
+}
+
+static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh,
+ const void *name, int namelen)
{
/* nexthdr chain for fast lookup */
- mh->opts->o_nextcmd = mh->inner_hd->h_cmd;
+ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd;
mh->committed = true;
+ dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen);
dlm_lowcomms_commit_msg(mh->msg);
}
@@ -1142,8 +1200,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
* dlm_midcomms_get_mhandle
*/
#ifndef __CHECKER__
-void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh,
+ const void *name, int namelen)
{
+
switch (mh->node->version) {
case DLM_VERSION_3_1:
srcu_read_unlock(&nodes_srcu, mh->idx);
@@ -1154,12 +1214,12 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
- dlm_midcomms_commit_msg_3_2(mh);
+ dlm_midcomms_commit_msg_3_2(mh, name, namelen);
srcu_read_unlock(&nodes_srcu, mh->idx);
break;
default:
srcu_read_unlock(&nodes_srcu, mh->idx);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
break;
}
}
@@ -1167,12 +1227,27 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
int dlm_midcomms_start(void)
{
+ return dlm_lowcomms_start();
+}
+
+void dlm_midcomms_stop(void)
+{
+ dlm_lowcomms_stop();
+}
+
+void dlm_midcomms_init(void)
+{
int i;
for (i = 0; i < CONN_HASH_SIZE; i++)
INIT_HLIST_HEAD(&node_hash[i]);
- return dlm_lowcomms_start();
+ dlm_lowcomms_init();
+}
+
+void dlm_midcomms_exit(void)
+{
+ dlm_lowcomms_exit();
}
static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
@@ -1201,7 +1276,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
@@ -1319,7 +1394,7 @@ static void midcomms_node_release(struct rcu_head *rcu)
{
struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
- WARN_ON(atomic_read(&node->send_queue_cnt));
+ WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
kfree(node);
}
@@ -1372,11 +1447,13 @@ static void midcomms_shutdown(struct midcomms_node *node)
pr_debug("active shutdown timed out for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
midcomms_node_reset(node);
+ dlm_lowcomms_shutdown_node(node->nodeid, true);
return;
}
pr_debug("active shutdown done for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
+ dlm_lowcomms_shutdown_node(node->nodeid, false);
}
void dlm_midcomms_shutdown(void)
@@ -1384,6 +1461,8 @@ void dlm_midcomms_shutdown(void)
struct midcomms_node *node;
int i, idx;
+ dlm_lowcomms_shutdown();
+
mutex_lock(&close_lock);
idx = srcu_read_lock(&nodes_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
@@ -1401,8 +1480,6 @@ void dlm_midcomms_shutdown(void)
}
srcu_read_unlock(&nodes_srcu, idx);
mutex_unlock(&close_lock);
-
- dlm_lowcomms_shutdown();
}
int dlm_midcomms_close(int nodeid)
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 82bcd9661922..bea1cee4279c 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -14,12 +14,17 @@
struct midcomms_node;
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len);
int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc);
-void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh);
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
+ int namelen);
int dlm_midcomms_close(int nodeid);
int dlm_midcomms_start(void);
+void dlm_midcomms_stop(void);
+void dlm_midcomms_init(void);
+void dlm_midcomms_exit(void);
void dlm_midcomms_shutdown(void);
void dlm_midcomms_add_member(int nodeid);
void dlm_midcomms_remove_member(int nodeid);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f19860315043..b76d52e2f6bd 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
{
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
}
static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
@@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rf = (struct rcom_config *) rc->rc_buf;
rf->rf_lvblen = cpu_to_le32(~0U);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
return 0;
}
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 036a9a0078f6..8be2893ad15b 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
e->nodeid = nodeid;
- memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length));
+ memcpy(&e->request, ms, sizeof(*ms));
+ memcpy(&e->request.m_extra, ms->m_extra, length);
atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index c5d27bccc3dc..35129505ddda 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -25,6 +25,7 @@
#include "user.h"
#include "ast.h"
#include "config.h"
+#include "memory.h"
static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
@@ -175,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status)
being removed and then remove that lkb from the orphans list and free it */
void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq)
+ int status, uint32_t sbflags)
{
struct dlm_ls *ls;
struct dlm_user_args *ua;
@@ -209,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
spin_lock(&proc->asts_spin);
- rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
- if (rv < 0) {
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
spin_unlock(&proc->asts_spin);
+ WARN_ON_ONCE(1);
goto out;
- }
-
- if (list_empty(&lkb->lkb_cb_list)) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
kref_get(&lkb->lkb_ref);
list_add_tail(&lkb->lkb_cb_list, &proc->asts);
wake_up_interruptible(&proc->wait);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
spin_unlock(&proc->asts_spin);
@@ -800,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
struct dlm_user_proc *proc = file->private_data;
struct dlm_lkb *lkb;
DECLARE_WAITQUEUE(wait, current);
- struct dlm_callback cb;
- int rv, resid, copy_lvb = 0;
+ struct dlm_callback *cb;
+ int rv, copy_lvb = 0;
int old_mode, new_mode;
if (count == sizeof(struct dlm_device_version)) {
@@ -857,53 +864,58 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
without removing lkb_cb_list; so empty lkb_cb_list is always
consistent with empty lkb_callbacks */
- lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list);
+ lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list);
/* rem_lkb_callback sets a new lkb_last_cast */
- old_mode = lkb->lkb_last_cast.mode;
+ old_mode = lkb->lkb_last_cast->mode;
- rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
- if (rv < 0) {
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ switch (rv) {
+ case DLM_DEQUEUE_CALLBACK_EMPTY:
/* this shouldn't happen; lkb should have been removed from
- list when resid was zero */
+ * list when last item was dequeued
+ */
log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
list_del_init(&lkb->lkb_cb_list);
spin_unlock(&proc->asts_spin);
/* removes ref for proc->asts, may cause lkb to be freed */
dlm_put_lkb(lkb);
+ WARN_ON_ONCE(1);
goto try_another;
- }
- if (!resid)
+ case DLM_DEQUEUE_CALLBACK_LAST:
list_del_init(&lkb->lkb_cb_list);
- spin_unlock(&proc->asts_spin);
-
- if (cb.flags & DLM_CB_SKIP) {
- /* removes ref for proc->asts, may cause lkb to be freed */
- if (!resid)
- dlm_put_lkb(lkb);
- goto try_another;
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+ break;
+ case DLM_DEQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
+ spin_unlock(&proc->asts_spin);
- if (cb.flags & DLM_CB_BAST) {
- trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode);
- } else if (cb.flags & DLM_CB_CAST) {
- new_mode = cb.mode;
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ new_mode = cb->mode;
- if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
+ if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr &&
dlm_lvb_operations[old_mode + 1][new_mode + 1])
copy_lvb = 1;
- lkb->lkb_lksb->sb_status = cb.sb_status;
- lkb->lkb_lksb->sb_flags = cb.sb_flags;
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
}
rv = copy_result_to_user(lkb->lkb_ua,
test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
- cb.flags, cb.mode, copy_lvb, buf, count);
+ cb->flags, cb->mode, copy_lvb, buf, count);
+
+ kref_put(&cb->ref, dlm_release_callback);
/* removes ref for proc->asts, may cause lkb to be freed */
- if (!resid)
+ if (rv == DLM_DEQUEUE_CALLBACK_LAST)
dlm_put_lkb(lkb);
return rv;
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 6b9bce6b96e0..33059452d79e 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -7,7 +7,7 @@
#define __USER_DOT_H__
void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq);
+ int status, uint32_t sbflags);
int dlm_user_init(void);
void dlm_user_exit(void);
int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c214fe0981bd..f3cd00fac9c3 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -18,6 +18,8 @@
#include <linux/fs_stack.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/fileattr.h>
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
@@ -1120,6 +1122,28 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns,
return rc;
}
+static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry),
+ posix_acl_xattr_name(type));
+}
+
+static int ecryptfs_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct posix_acl *acl,
+ int type)
+{
+ int rc;
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct inode *lower_inode = d_inode(lower_dentry);
+
+ rc = vfs_set_acl(&init_user_ns, lower_dentry,
+ posix_acl_xattr_name(type), acl);
+ if (!rc)
+ fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+ return rc;
+}
+
const struct inode_operations ecryptfs_symlink_iops = {
.get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
@@ -1143,6 +1167,8 @@ const struct inode_operations ecryptfs_dir_iops = {
.listxattr = ecryptfs_listxattr,
.fileattr_get = ecryptfs_fileattr_get,
.fileattr_set = ecryptfs_fileattr_set,
+ .get_acl = ecryptfs_get_acl,
+ .set_acl = ecryptfs_set_acl,
};
const struct inode_operations ecryptfs_main_iops = {
@@ -1152,6 +1178,8 @@ const struct inode_operations ecryptfs_main_iops = {
.listxattr = ecryptfs_listxattr,
.fileattr_get = ecryptfs_fileattr_get,
.fileattr_set = ecryptfs_fileattr_set,
+ .get_acl = ecryptfs_get_acl,
+ .set_acl = ecryptfs_set_acl,
};
static int ecryptfs_xattr_get(const struct xattr_handler *handler,
@@ -1182,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
};
const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
&ecryptfs_xattr_handler,
NULL
};
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fe8ac0e163f7..f57f921683d7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -13,9 +13,7 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
if (buf->kmap_type == EROFS_KMAP)
- kunmap(buf->page);
- else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
- kunmap_atomic(buf->base);
+ kunmap_local(buf->base);
buf->base = NULL;
buf->kmap_type = EROFS_NO_KMAP;
}
@@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
}
if (buf->kmap_type == EROFS_NO_KMAP) {
if (type == EROFS_KMAP)
- buf->base = kmap(page);
- else if (type == EROFS_KMAP_ATOMIC)
- buf->base = kmap_atomic(page);
+ buf->base = kmap_local_page(page);
buf->kmap_type = type;
} else if (buf->kmap_type != type) {
DBG_BUGON(1);
@@ -403,6 +399,8 @@ const struct address_space_operations erofs_raw_access_aops = {
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
};
#ifdef CONFIG_FS_DAX
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index af5ed6b9c54d..014e20962376 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -11,265 +11,201 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock);
static LIST_HEAD(erofs_domain_list);
static struct vfsmount *erofs_pseudo_mnt;
-static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+struct erofs_fscache_request {
+ struct erofs_fscache_request *primary;
+ struct netfs_cache_resources cache_resources;
+ struct address_space *mapping; /* The mapping being accessed */
+ loff_t start; /* Start position */
+ size_t len; /* Length of the request */
+ size_t submitted; /* Length of submitted */
+ short error; /* 0 or error that occurred */
+ refcount_t ref;
+};
+
+static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
loff_t start, size_t len)
{
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
- if (!rreq)
+ req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
+ if (!req)
return ERR_PTR(-ENOMEM);
- rreq->start = start;
- rreq->len = len;
- rreq->mapping = mapping;
- rreq->inode = mapping->host;
- INIT_LIST_HEAD(&rreq->subrequests);
- refcount_set(&rreq->ref, 1);
- return rreq;
-}
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
-static void erofs_fscache_put_request(struct netfs_io_request *rreq)
-{
- if (!refcount_dec_and_test(&rreq->ref))
- return;
- if (rreq->cache_resources.ops)
- rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ return req;
}
-static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
+ size_t len)
{
- if (!refcount_dec_and_test(&subreq->ref))
- return;
- erofs_fscache_put_request(subreq->rreq);
- kfree(subreq);
-}
+ struct erofs_fscache_request *req;
-static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
+ /* use primary request for the first submission */
+ if (!primary->submitted) {
+ refcount_inc(&primary->ref);
+ return primary;
+ }
- while (!list_empty(&rreq->subrequests)) {
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- list_del(&subreq->rreq_link);
- erofs_fscache_put_subrequest(subreq);
+ req = erofs_fscache_req_alloc(primary->mapping,
+ primary->start + primary->submitted, len);
+ if (!IS_ERR(req)) {
+ req->primary = primary;
+ refcount_inc(&primary->ref);
}
+ return req;
}
-static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
{
- struct netfs_io_subrequest *subreq;
struct folio *folio;
- unsigned int iopos = 0;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- bool subreq_failed = false;
+ bool failed = req->error;
+ pgoff_t start_page = req->start / PAGE_SIZE;
+ pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- subreq_failed = (subreq->error < 0);
+ XA_STATE(xas, &req->mapping->i_pages, start_page);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
- unsigned int pgpos, pgend;
- bool pg_failed = false;
-
if (xas_retry(&xas, folio))
continue;
-
- pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
- pgend = pgpos + folio_size(folio);
-
- for (;;) {
- if (!subreq) {
- pg_failed = true;
- break;
- }
-
- pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
- break;
-
- iopos += subreq->len;
- if (!list_is_last(&subreq->rreq_link,
- &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
- if (pgend == iopos)
- break;
- }
-
- if (!pg_failed)
+ if (!failed)
folio_mark_uptodate(folio);
-
folio_unlock(folio);
}
rcu_read_unlock();
}
-static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+static void erofs_fscache_req_put(struct erofs_fscache_request *req)
{
- erofs_fscache_rreq_unlock_folios(rreq);
- erofs_fscache_clear_subrequests(rreq);
- erofs_fscache_put_request(rreq);
+ if (refcount_dec_and_test(&req->ref)) {
+ if (req->cache_resources.ops)
+ req->cache_resources.ops->end_operation(&req->cache_resources);
+ if (!req->primary)
+ erofs_fscache_req_complete(req);
+ else
+ erofs_fscache_req_put(req->primary);
+ kfree(req);
+ }
}
-static void erofc_fscache_subreq_complete(void *priv,
+static void erofs_fscache_subreq_complete(void *priv,
ssize_t transferred_or_error, bool was_async)
{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error))
- subreq->error = transferred_or_error;
+ struct erofs_fscache_request *req = priv;
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
-
- erofs_fscache_put_subrequest(subreq);
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ if (req->primary)
+ req->primary->error = transferred_or_error;
+ else
+ req->error = transferred_or_error;
+ }
+ erofs_fscache_req_put(req);
}
/*
- * Read data from fscache and fill the read data into page cache described by
- * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes
- * the start physical address in the cache file.
+ * Read data from fscache (cookie, pstart, len), and fill the read data into
+ * page cache described by (req->mapping, lstart, len). @pstart describeis the
+ * start physical address in the cache file.
*/
static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct netfs_io_request *rreq, loff_t pstart)
+ struct erofs_fscache_request *req, loff_t pstart, size_t len)
{
enum netfs_io_source source;
- struct super_block *sb = rreq->mapping->host->i_sb;
- struct netfs_io_subrequest *subreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct super_block *sb = req->mapping->host->i_sb;
+ struct netfs_cache_resources *cres = &req->cache_resources;
struct iov_iter iter;
- loff_t start = rreq->start;
- size_t len = rreq->len;
+ loff_t lstart = req->start + req->submitted;
size_t done = 0;
int ret;
- atomic_set(&rreq->nr_outstanding, 1);
+ DBG_BUGON(len > req->len - req->submitted);
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
- goto out;
+ return ret;
while (done < len) {
- subreq = kzalloc(sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- refcount_inc(&rreq->ref);
- } else {
- ret = -ENOMEM;
- goto out;
- }
-
- subreq->start = pstart + done;
- subreq->len = len - done;
- subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
+ loff_t sstart = pstart + done;
+ size_t slen = len - done;
+ unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- source = cres->ops->prepare_read(subreq, LLONG_MAX);
- if (WARN_ON(subreq->len == 0))
+ source = cres->ops->prepare_ondemand_read(cres,
+ sstart, &slen, LLONG_MAX, &flags, 0);
+ if (WARN_ON(slen == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)",
- source);
- ret = -EIO;
- subreq->error = ret;
- erofs_fscache_put_subrequest(subreq);
- goto out;
+ erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ return -EIO;
}
- atomic_inc(&rreq->nr_outstanding);
+ refcount_inc(&req->ref);
+ iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
+ lstart + done, slen);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
- start + done, subreq->len);
-
- ret = fscache_read(cres, subreq->start, &iter,
- NETFS_READ_HOLE_FAIL,
- erofc_fscache_subreq_complete, subreq);
+ ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
+ erofs_fscache_subreq_complete, req);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
erofs_err(sb, "failed to fscache_read (ret %d)", ret);
- goto out;
+ return ret;
}
- done += subreq->len;
+ done += slen;
}
-out:
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
-
- return ret;
+ DBG_BUGON(done != len);
+ return 0;
}
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
struct super_block *sb = folio_mapping(folio)->host->i_sb;
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
struct erofs_map_dev mdev = {
.m_deviceid = 0,
.m_pa = folio_pos(folio),
};
ret = erofs_map_dev(sb, &mdev);
- if (ret)
- goto out;
+ if (ret) {
+ folio_unlock(folio);
+ return ret;
+ }
- rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
- if (IS_ERR(rreq)) {
- ret = PTR_ERR(rreq);
- goto out;
+ if (IS_ERR(req)) {
+ folio_unlock(folio);
+ return PTR_ERR(req);
}
- return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, mdev.m_pa);
-out:
- folio_unlock(folio);
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa, folio_size(folio));
+ if (ret)
+ req->error = ret;
+
+ erofs_fscache_req_put(req);
return ret;
}
-/*
- * Read into page cache in the range described by (@pos, @len).
- *
- * On return, the caller is responsible for page unlocking if the output @unlock
- * is true, or the callee will take this responsibility through netfs_io_request
- * interface.
- *
- * The return value is the number of bytes successfully handled, or negative
- * error code on failure. The only exception is that, the length of the range
- * instead of the error code is returned on failure after netfs_io_request is
- * allocated, so that .readahead() could advance rac accordingly.
- */
-static int erofs_fscache_data_read(struct address_space *mapping,
- loff_t pos, size_t len, bool *unlock)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
{
+ struct address_space *mapping = primary->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
struct iov_iter iter;
+ loff_t pos = primary->start + primary->submitted;
size_t count;
int ret;
- *unlock = true;
-
map.m_la = pos;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
if (ret)
@@ -290,24 +226,26 @@ static int erofs_fscache_data_read(struct address_space *mapping,
if (IS_ERR(src))
return PTR_ERR(src);
- iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE);
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
if (copy_to_iter(src + offset, size, &iter) != size) {
erofs_put_metabuf(&buf);
return -EFAULT;
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- return PAGE_SIZE;
+ primary->submitted += PAGE_SIZE;
+ return 0;
}
+ count = primary->len - primary->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- count = len;
- iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- return count;
+ primary->submitted += count;
+ return 0;
}
- count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
DBG_BUGON(!count || count % PAGE_SIZE);
mdev = (struct erofs_map_dev) {
@@ -318,64 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping,
if (ret)
return ret;
- rreq = erofs_fscache_alloc_request(mapping, pos, count);
- if (IS_ERR(rreq))
- return PTR_ERR(rreq);
+ req = erofs_fscache_req_chain(primary, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- *unlock = false;
- erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, mdev.m_pa + (pos - map.m_la));
- return count;
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa + (pos - map.m_la), count);
+ erofs_fscache_req_put(req);
+ primary->submitted += count;
+ return ret;
}
-static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+static int erofs_fscache_data_read(struct erofs_fscache_request *req)
{
- bool unlock;
int ret;
- DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+ do {
+ ret = erofs_fscache_data_read_slice(req);
+ if (ret)
+ req->error = ret;
+ } while (!ret && req->submitted < req->len);
- ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
- folio_size(folio), &unlock);
- if (unlock) {
- if (ret > 0)
- folio_mark_uptodate(folio);
+ return ret;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fscache_request *req;
+ int ret;
+
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(req)) {
folio_unlock(folio);
+ return PTR_ERR(req);
}
- return ret < 0 ? ret : 0;
+
+ ret = erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+ return ret;
}
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct folio *folio;
- size_t len, done = 0;
- loff_t start, pos;
- bool unlock;
- int ret, size;
+ struct erofs_fscache_request *req;
if (!readahead_count(rac))
return;
- start = readahead_pos(rac);
- len = readahead_length(rac);
+ req = erofs_fscache_req_alloc(rac->mapping,
+ readahead_pos(rac), readahead_length(rac));
+ if (IS_ERR(req))
+ return;
- do {
- pos = start + done;
- ret = erofs_fscache_data_read(rac->mapping, pos,
- len - done, &unlock);
- if (ret <= 0)
- return;
+ /* The request completion will drop refs on the folios. */
+ while (readahead_folio(rac))
+ ;
- size = ret;
- while (size) {
- folio = readahead_folio(rac);
- size -= folio_size(folio);
- if (unlock) {
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
- }
- } while ((done += ret) < len);
+ erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
}
static const struct address_space_operations erofs_fscache_meta_aops = {
@@ -494,7 +433,8 @@ static int erofs_fscache_register_domain(struct super_block *sb)
static
struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
struct fscache_volume *volume = EROFS_SB(sb)->volume;
struct erofs_fscache *ctx;
@@ -516,7 +456,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
fscache_use_cookie(cookie, false);
ctx->cookie = cookie;
- if (need_inode) {
+ if (flags & EROFS_REG_COOKIE_NEED_INODE) {
struct inode *const inode = new_inode(sb);
if (!inode) {
@@ -554,14 +494,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
static
struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
int err;
struct inode *inode;
struct erofs_fscache *ctx;
struct erofs_domain *domain = EROFS_SB(sb)->domain;
- ctx = erofs_fscache_acquire_cookie(sb, name, need_inode);
+ ctx = erofs_fscache_acquire_cookie(sb, name, flags);
if (IS_ERR(ctx))
return ctx;
@@ -589,7 +530,8 @@ out:
static
struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
struct inode *inode;
struct erofs_fscache *ctx;
@@ -602,23 +544,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
ctx = inode->i_private;
if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
continue;
- igrab(inode);
+ if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
+ igrab(inode);
+ } else {
+ erofs_err(sb, "%s already exists in domain %s", name,
+ domain->domain_id);
+ ctx = ERR_PTR(-EEXIST);
+ }
spin_unlock(&psb->s_inode_list_lock);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
spin_unlock(&psb->s_inode_list_lock);
- ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
+ ctx = erofs_fscache_domain_init_cookie(sb, name, flags);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
if (EROFS_SB(sb)->domain_id)
- return erofs_domain_register_cookie(sb, name, need_inode);
- return erofs_fscache_acquire_cookie(sb, name, need_inode);
+ return erofs_domain_register_cookie(sb, name, flags);
+ return erofs_fscache_acquire_cookie(sb, name, flags);
}
void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
@@ -647,6 +596,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
int ret;
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
+ unsigned int flags;
if (sbi->domain_id)
ret = erofs_fscache_register_domain(sb);
@@ -655,8 +605,20 @@ int erofs_fscache_register_fs(struct super_block *sb)
if (ret)
return ret;
- /* acquired domain/volume will be relinquished in kill_sb() on error */
- fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true);
+ /*
+ * When shared domain is enabled, using NEED_NOEXIST to guarantee
+ * the primary data blob (aka fsid) is unique in the shared domain.
+ *
+ * For non-shared-domain case, fscache_acquire_volume() invoked by
+ * erofs_fscache_register_volume() has already guaranteed
+ * the uniqueness of primary data blob.
+ *
+ * Acquired domain/volume will be relinquished in kill_sb() on error.
+ */
+ flags = EROFS_REG_COOKIE_NEED_INODE;
+ if (sbi->domain_id)
+ flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
+ fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index ad2a82f2eb4c..d3b8736fa124 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
inode->i_fop = &erofs_dir_fops;
+ inode_nohighmem(inode);
break;
case S_IFLNK:
err = erofs_fill_symlink(inode, kaddr, ofs);
@@ -295,6 +296,7 @@ static int erofs_fill_inode(struct inode *inode)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+ mapping_set_large_folios(inode->i_mapping);
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (erofs_is_fscache_mode(inode->i_sb))
inode->i_mapping->a_ops = &erofs_fscache_access_aops;
@@ -371,7 +373,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
@@ -379,12 +381,12 @@ const struct inode_operations erofs_symlink_iops = {
.get_link = page_get_link,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
};
const struct inode_operations erofs_fast_symlink_iops = {
.get_link = simple_get_link,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 05dc68627722..bb8501c0ff5b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap() to map the buffer */
- EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
+ EROFS_KMAP, /* use kmap_local_page() to map the buffer */
};
struct erofs_buf {
@@ -604,13 +603,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+/* flags for erofs_fscache_register_cookie() */
+#define EROFS_REG_COOKIE_NEED_INODE 1
+#define EROFS_REG_COOKIE_NEED_NOEXIST 2
+
/* fscache.c */
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name, bool need_inode);
+ char *name,
+ unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
extern const struct address_space_operations erofs_fscache_access_aops;
@@ -623,7 +627,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
static inline
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
return ERR_PTR(-EOPNOTSUPP);
}
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 0dc34721080c..b64a108fac92 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -228,6 +228,6 @@ const struct inode_operations erofs_dir_iops = {
.lookup = erofs_lookup,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1c7dcca702b3..481788c24a68 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
}
if (erofs_is_fscache_mode(sb)) {
- fscache = erofs_fscache_register_cookie(sb, dif->path, false);
+ fscache = erofs_fscache_register_cookie(sb, dif->path, 0);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
dif->fscache = fscache;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 8106bcb5a38d..a62fb8a3318a 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it)
it->blkaddr += erofs_blknr(it->ofs);
it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
it->ofs = erofs_blkoff(it->ofs);
@@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
return vi->xattr_isize - xattr_header_sz;
@@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
@@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index b792d424d774..ccf7c55d477f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
-/* how to allocate cached pages for a pcluster */
-enum z_erofs_cache_alloctype {
- DONTALLOC, /* don't allocate any cached pages */
- /*
- * try to use cached I/O if page allocation succeeds or fallback
- * to in-place I/O instead to avoid any direct reclaim.
- */
- TRYALLOC,
-};
-
/*
* tagged pointer with 1-bit tag for all compressed pages
* tag 0 - the page is just found with an extra page reference
@@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend {
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
.mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
+static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+{
+ unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
+
+ if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
+ return false;
+
+ if (fe->backmost)
+ return true;
+
+ if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
+ fe->map.m_la < fe->headoffset)
+ return true;
+
+ return false;
+}
+
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- enum z_erofs_cache_alloctype type,
struct page **pagepool)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
+ bool shouldalloc = z_erofs_should_alloc_cache(fe);
bool standalone = true;
/*
* optimistic allocation without direct reclaim since inplace I/O
@@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
} else {
/* I/O is needed, no possible to decompress directly */
standalone = false;
- switch (type) {
- case TRYALLOC:
- newpage = erofs_allocpage(pagepool, gfp);
- if (!newpage)
- continue;
- set_page_private(newpage,
- Z_EROFS_PREALLOCATED_PAGE);
- t = tag_compressed_page_justfound(newpage);
- break;
- default: /* DONTALLOC */
+ if (!shouldalloc)
continue;
- }
+
+ /*
+ * try to use cached I/O if page allocation
+ * succeeds or fallback to in-place I/O instead
+ * to avoid any direct reclaim.
+ */
+ newpage = erofs_allocpage(pagepool, gfp);
+ if (!newpage)
+ continue;
+ set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
+ t = tag_compressed_page_justfound(newpage);
}
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
@@ -488,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
struct erofs_workgroup *grp;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ if (!(map->m_flags & EROFS_MAP_ENCODED) ||
+ (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -637,20 +646,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
return true;
}
-static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
- unsigned int cachestrategy,
- erofs_off_t la)
-{
- if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
- return false;
-
- if (fe->backmost)
- return true;
-
- return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
- la < fe->headoffset;
-}
-
static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
struct page *page, unsigned int pageofs,
unsigned int len)
@@ -687,12 +682,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
-
- enum z_erofs_cache_alloctype cache_strategy;
unsigned int cur, end, spiltted;
int err = 0;
@@ -746,13 +738,7 @@ repeat:
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
- if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
- map->m_la))
- cache_strategy = TRYALLOC;
- else
- cache_strategy = DONTALLOC;
-
- z_erofs_bind_cache(fe, cache_strategy, pagepool);
+ z_erofs_bind_cache(fe, pagepool);
}
hitted:
/*
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 0bb66927e3d0..0150570c33aa 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int advise, type;
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
@@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
out:
pos += lcn * (1 << amortizedshift);
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
@@ -694,10 +694,15 @@ static int z_erofs_do_map_blocks(struct inode *inode,
map->m_pa = blknr_to_addr(m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
- goto out;
+ goto unmap_out;
}
if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+ if (map->m_llen > map->m_plen) {
+ DBG_BUGON(1);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
map->m_algorithmformat =
Z_EROFS_COMPRESSION_INTERLACED;
@@ -718,14 +723,12 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
}
+
unmap_out:
erofs_unmap_metabuf(&m.map->buf);
-
-out:
erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
__func__, map->m_la, map->m_pa,
map->m_llen, map->m_plen, map->m_flags);
-
return err;
}
diff --git a/fs/exec.c b/fs/exec.c
index a0b1f0337a62..ab913243a367 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -64,6 +64,7 @@
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
+#include <linux/time_namespace.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -171,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
exit:
fput(file);
out:
- return error;
+ return error;
}
#endif /* #ifdef CONFIG_USELIB */
@@ -199,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
struct page *page;
int ret;
- unsigned int gup_flags = FOLL_FORCE;
+ unsigned int gup_flags = 0;
#ifdef CONFIG_STACK_GROWSUP
if (write) {
@@ -842,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
* will align it up.
*/
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
+
+ stack_expand = min(rlim_stack, stack_size + stack_expand);
+
#ifdef CONFIG_STACK_GROWSUP
- if (stack_size + stack_expand > rlim_stack)
- stack_base = vma->vm_start + rlim_stack;
- else
- stack_base = vma->vm_end + stack_expand;
+ stack_base = vma->vm_start + stack_expand;
#else
- if (stack_size + stack_expand > rlim_stack)
- stack_base = vma->vm_end - rlim_stack;
- else
- stack_base = vma->vm_start - stack_expand;
+ stack_base = vma->vm_end - stack_expand;
#endif
current->mm->start_stack = bprm->p;
ret = expand_stack(vma, stack_base);
@@ -1297,6 +1295,10 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
+ retval = exec_task_namespaces();
+ if (retval)
+ goto out_unlock;
+
#ifdef CONFIG_POSIX_TIMERS
spin_lock_irq(&me->sighand->siglock);
posix_cpu_timers_exit(me);
@@ -1568,6 +1570,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
if (task_no_new_privs(current))
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+ /*
+ * If another task is sharing our fs, we cannot safely
+ * suid exec because the differently privileged task
+ * will be able to manipulate the current directory, etc.
+ * It would be nice to force an unshare instead...
+ */
t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
@@ -1591,8 +1599,8 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
struct user_namespace *mnt_userns;
struct inode *inode = file_inode(file);
unsigned int mode;
- kuid_t uid;
- kgid_t gid;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
if (!mnt_may_suid(file->f_path.mnt))
return;
@@ -1611,23 +1619,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
- uid = i_uid_into_mnt(mnt_userns, inode);
- gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
- if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
- !kgid_has_mapping(bprm->cred->user_ns, gid))
+ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
+ !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))
return;
if (mode & S_ISUID) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->euid = uid;
+ bprm->cred->euid = vfsuid_into_kuid(vfsuid);
}
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->egid = gid;
+ bprm->cred->egid = vfsgid_into_kgid(vfsgid);
}
}
@@ -1748,6 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm)
return retval;
}
+/* binfmt handlers will call back into begin_new_exec() on success. */
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
@@ -1806,6 +1815,11 @@ static int bprm_execve(struct linux_binprm *bprm,
if (retval)
return retval;
+ /*
+ * Check for unsafe execution states before exec_binprm(), which
+ * will call back into begin_new_exec(), into bprm_creds_from_file(),
+ * where setuid-ness is evaluated.
+ */
check_unsafe_exec(bprm);
current->in_execve = 1;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index bf298967c5b8..440d5f1e9d47 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
* inode->i_mutex: down
*/
int
-ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 925ab6287d35..3841becb94ff 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size)
/* acl.c */
extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu);
-extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 5dc0a31f4a08..eca60b747c6b 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -667,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group,
{
ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group);
ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group);
- ext2_grpblk_t start, end;
+ ext2_grpblk_t start, end;
unsigned long num = 0;
start = 0;
@@ -1481,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
desc_count, bitmap_count);
return bitmap_count;
#else
- for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
- desc = ext2_get_group_desc (sb, i, NULL);
- if (!desc)
- continue;
- desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+ for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+ desc = ext2_get_group_desc(sb, i, NULL);
+ if (!desc)
+ continue;
+ desc_count += le16_to_cpu(desc->bg_free_blocks_count);
}
return desc_count;
#endif
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 8f597753ac12..e5cbc27ba459 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
struct address_space *mapping = page->mapping;
struct inode *dir = mapping->host;
- int err = 0;
inode_inc_iversion(dir);
block_write_end(NULL, mapping, pos, len, len, page, NULL);
@@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
-
- if (IS_DIRSYNC(dir)) {
- err = write_one_page(page);
- if (!err)
- err = sync_inode_metadata(dir, 1);
- } else {
- unlock_page(page);
- }
-
- return err;
+ unlock_page(page);
}
static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
@@ -413,7 +403,7 @@ found:
return de;
}
-/**
+/*
* Return the '..' directory entry and the page in which the entry was found
* (as a parameter - p).
*
@@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ext2_get_block);
}
+
+static int ext2_handle_dirsync(struct inode *dir)
+{
+ int err;
+
+ err = filemap_write_and_wait(dir->i_mapping);
+ if (!err)
+ err = sync_inode_metadata(dir, 1);
+ return err;
+}
+
void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
struct page *page, void *page_addr, struct inode *inode,
int update_times)
@@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
BUG_ON(err);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type(de, inode);
- err = ext2_commit_chunk(page, pos, len);
+ ext2_commit_chunk(page, pos, len);
if (update_times)
dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
+ ext2_handle_dirsync(dir);
}
/*
@@ -566,10 +568,11 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
- err = ext2_commit_chunk(page, pos, rec_len);
+ ext2_commit_chunk(page, pos, rec_len);
dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
+ err = ext2_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
ext2_put_page(page, page_addr);
@@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page,
if (pde)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
- err = ext2_commit_chunk(page, pos, to - from);
+ ext2_commit_chunk(page, pos, to - from);
inode->i_ctime = inode->i_mtime = current_time(inode);
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
+ err = ext2_handle_dirsync(inode);
out:
return err;
}
@@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
memcpy (de->name, "..\0", 4);
ext2_set_de_type (de, inode);
kunmap_atomic(kaddr);
- err = ext2_commit_chunk(page, 0, chunk_size);
+ ext2_commit_chunk(page, 0, chunk_size);
+ err = ext2_handle_dirsync(inode);
fail:
put_page(page);
return err;
@@ -679,7 +684,7 @@ int ext2_empty_dir (struct inode * inode)
page = ext2_get_page(inode, i, 0, &page_addr);
if (IS_ERR(page))
- goto not_empty;
+ return 0;
kaddr = page_addr;
de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index eb97aa3d700e..6b4bebe982ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.fiemap = ext2_fiemap,
.fileattr_get = ext2_fileattr_get,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index f4944c4dee60..78b8686d9a4a 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -277,7 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- parent_group = prandom_u32_max(ngroups);
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
desc = ext2_get_group_desc (sb, group, NULL);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 918ab2f9e4c0..69aed9e2359e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
-static int ext2_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, ext2_get_block, wbc);
-}
-
static int ext2_read_folio(struct file *file, struct folio *folio)
{
return mpage_read_folio(folio, ext2_get_block);
@@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = ext2_read_folio,
.readahead = ext2_readahead,
- .writepage = ext2_writepage,
.write_begin = ext2_write_begin,
.write_end = ext2_write_end,
.bmap = ext2_bmap,
@@ -1652,7 +1646,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
setattr_copy(&init_user_ns, inode, iattr);
if (iattr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
mark_inode_dirty(inode);
return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 9125eab85146..c056957221a2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.tmpfile = ext2_tmpfile,
.fileattr_get = ext2_fileattr_get,
@@ -438,6 +438,6 @@ const struct inode_operations ext2_special_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 03f2af98b1b4..69c88facfe90 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1648,7 +1648,7 @@ static int __init init_ext2_fs(void)
err = init_inodecache();
if (err)
return err;
- err = register_filesystem(&ext2_fs_type);
+ err = register_filesystem(&ext2_fs_type);
if (err)
goto out;
return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 57e82e25f8e2..a9f89539aeee 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
int
-ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
handle_t *handle;
int error, credits, retries = 0;
size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
int update_mode = 0;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 3219669732bf..09c4a8a3b716 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size)
/* acl.c */
struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu);
-int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8d5453852f98..140e1eb300d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -558,7 +558,7 @@ enum {
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
static inline void ext4_check_flag_values(void)
@@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode,
typedef enum {
EXT4_IGET_NORMAL = 0,
EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
- EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */
+ EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */
} ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -2999,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
@@ -3619,8 +3621,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode);
+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);
@@ -3756,8 +3758,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
- int len,
- bool keep_towrite);
+ int len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 8e1fb18f465e..77f318ec8abb 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb)
return 0;
}
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+handle_t *__ext4_journal_start_sb(struct inode *inode,
+ struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks,
int revoke_creds)
{
journal_t *journal;
int err;
-
- trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
- _RET_IP_);
+ if (inode)
+ trace_ext4_journal_start_inode(inode, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
+ else
+ trace_ext4_journal_start_sb(sb, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index db2ae4a2b38d..0c77697d5e90 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks,
- int revoke_creds);
+handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
+ unsigned int line, int type, int blocks,
+ int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
@@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode,
int blocks, int rsv_blocks,
int revoke_creds)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks,
rsv_blocks, revoke_creds);
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6c399a8b22b3..9de1c9d1a13d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2635,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unwritten, ex_ee_len);
path[depth].p_ext = ex;
- a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block+ex_ee_len - 1 < end ?
- ex_ee_block+ex_ee_len - 1 : end;
+ a = max(ex_ee_block, start);
+ b = min(ex_ee_block + ex_ee_len - 1, end);
ext_debug(inode, " border %u:%u\n", a, b);
@@ -5567,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
- len_lblk, SHIFT_RIGHT);
+ max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
@@ -5799,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
struct ext4_extent *extent;
ext4_lblk_t first_lblk, first_lclu, last_lclu;
+ /*
+ * if data can be stored inline, the logical cluster isn't
+ * mapped - no physical clusters have been allocated, and the
+ * file has no extents
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return 0;
+
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
if (IS_ERR(path)) {
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index cd0a861853e3..7bc221038c6c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
int __init ext4_init_es(void)
{
- ext4_es_cachep = kmem_cache_create("ext4_extent_status",
- sizeof(struct extent_status),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
@@ -1371,7 +1369,7 @@ retry:
if (count_reserved)
count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
&orig_es, &rc);
- goto out;
+ goto out_get_reserved;
}
if (len1 > 0) {
@@ -1413,6 +1411,7 @@ retry:
}
}
+out_get_reserved:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
@@ -1807,9 +1806,7 @@ static void ext4_print_pending_tree(struct inode *inode)
int __init ext4_init_pending(void)
{
- ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
- sizeof(struct pending_reservation),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
if (ext4_pending_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0f6d0a80467d..4594b62f147b 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
struct __track_dentry_update_args *dentry_update =
(struct __track_dentry_update_args *)arg;
struct dentry *dentry = dentry_update->dentry;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
mutex_unlock(&ei->i_fc_lock);
+
+ if (IS_ENCRYPTED(dir)) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ NULL);
+ mutex_lock(&ei->i_fc_lock);
+ return -EOPNOTSUPP;
+ }
+
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
node->fcd_op = dentry_update->op;
- node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_parent = dir->i_ino;
node->fcd_ino = inode->i_ino;
if (dentry->d_name.len > DNAME_INLINE_LEN) {
node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
- ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -666,18 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
/* Ext4 commit path routines */
-/* memzero and update CRC */
-static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
- u32 *crc)
-{
- void *ret;
-
- ret = memset(dst, 0, len);
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
- return ret;
-}
-
/*
* Allocate len bytes on a fast commit buffer.
*
@@ -691,62 +688,60 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
*/
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
- struct ext4_fc_tl *tl;
+ struct ext4_fc_tl tl;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh;
int bsize = sbi->s_journal->j_blocksize;
int ret, off = sbi->s_fc_bytes % bsize;
- int pad_len;
+ int remaining;
+ u8 *dst;
/*
- * After allocating len, we should have space at least for a 0 byte
- * padding.
+ * If 'len' is too long to fit in any block alongside a PAD tlv, then we
+ * cannot fulfill the request.
*/
- if (len + EXT4_FC_TAG_BASE_LEN > bsize)
+ if (len > bsize - EXT4_FC_TAG_BASE_LEN)
return NULL;
- if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
- /*
- * Only allocate from current buffer if we have enough space for
- * this request AND we have space to add a zero byte padding.
- */
- if (!sbi->s_fc_bh) {
- ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
- if (ret)
- return NULL;
- sbi->s_fc_bh = bh;
- }
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ dst = sbi->s_fc_bh->b_data + off;
+
+ /*
+ * Allocate the bytes in the current block if we can do so while still
+ * leaving enough space for a PAD tlv.
+ */
+ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
+ if (len <= remaining) {
sbi->s_fc_bytes += len;
- return sbi->s_fc_bh->b_data + off;
+ return dst;
}
- /* Need to add PAD tag */
- tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
- tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
- pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
- tl->fc_len = cpu_to_le16(pad_len);
- if (crc)
- *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
- if (pad_len > 0)
- ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+
+ /*
+ * Else, terminate the current block with a PAD tlv, then allocate a new
+ * block and allocate the bytes at the start of that new block.
+ */
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ tl.fc_len = cpu_to_le16(remaining);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
+ *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+
ext4_fc_submit_bh(sb, false);
ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
if (ret)
return NULL;
sbi->s_fc_bh = bh;
- sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ sbi->s_fc_bytes += bsize - off + len;
return sbi->s_fc_bh->b_data;
}
-/* memcpy to fc reserved space and update CRC */
-static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
- int len, u32 *crc)
-{
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
- return memcpy(dst, src, len);
-}
-
/*
* Complete a fast commit by writing tail tag.
*
@@ -774,16 +769,20 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
off = sbi->s_fc_bytes % bsize;
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
- tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
- ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
+ crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
- ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+ memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
+ dst += sizeof(tail.fc_crc);
+ memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
ext4_fc_submit_bh(sb, true);
@@ -807,8 +806,8 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
tl.fc_tag = cpu_to_le16(tag);
tl.fc_len = cpu_to_le16(len);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
- ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
return true;
}
@@ -830,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
- ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ memcpy(dst, &fcd, sizeof(fcd));
dst += sizeof(fcd);
- ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
+ memcpy(dst, fc_dentry->fcd_name.name, dlen);
return true;
}
@@ -872,15 +871,11 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
if (!dst)
goto err;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
- goto err;
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- goto err;
+ memcpy(dst, &fc_inode, sizeof(fc_inode));
dst += sizeof(fc_inode);
- if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
- inode_len, crc))
- goto err;
+ memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
ret = 0;
err:
brelse(iloc.bh);
@@ -986,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
finish_wait(&ei->i_fc_wait, &wait);
}
spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_submit_inode_data(ei->jinode);
+ ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
spin_lock(&sbi->s_fc_lock);
@@ -1388,7 +1383,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
return 0;
}
- ret = __ext4_unlink(NULL, old_parent, &entry, inode);
+ ret = __ext4_unlink(old_parent, &entry, inode, NULL);
/* -ENOENT ok coz it might not exist anymore. */
if (ret == -ENOENT)
ret = 0;
@@ -1977,32 +1972,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb)
kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
-static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
- u8 *val, u8 *end)
+static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
+ int tag, int len)
{
- if (val + tl->fc_len > end)
- return false;
-
- /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do
- * journal rescan before do CRC check. Other tags length check will
- * rely on CRC check.
- */
- switch (tl->fc_tag) {
+ switch (tag) {
case EXT4_FC_TAG_ADD_RANGE:
- return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
- case EXT4_FC_TAG_TAIL:
- return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
- case EXT4_FC_TAG_HEAD:
- return (sizeof(struct ext4_fc_head) == tl->fc_len);
+ return len == sizeof(struct ext4_fc_add_range);
case EXT4_FC_TAG_DEL_RANGE:
+ return len == sizeof(struct ext4_fc_del_range);
+ case EXT4_FC_TAG_CREAT:
case EXT4_FC_TAG_LINK:
case EXT4_FC_TAG_UNLINK:
- case EXT4_FC_TAG_CREAT:
+ len -= sizeof(struct ext4_fc_dentry_info);
+ return len >= 1 && len <= EXT4_NAME_LEN;
case EXT4_FC_TAG_INODE:
+ len -= sizeof(struct ext4_fc_inode);
+ return len >= EXT4_GOOD_OLD_INODE_SIZE &&
+ len <= sbi->s_inode_size;
case EXT4_FC_TAG_PAD:
- default:
- return true;
+ return true; /* padding can have any length */
+ case EXT4_FC_TAG_TAIL:
+ return len >= sizeof(struct ext4_fc_tail);
+ case EXT4_FC_TAG_HEAD:
+ return len == sizeof(struct ext4_fc_head);
}
+ return false;
}
/*
@@ -2040,7 +2034,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
state = &sbi->s_fc_replay_state;
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
if (state->fc_replay_expected_off == 0) {
state->fc_cur_tag = 0;
@@ -2061,11 +2055,12 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
- if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
+ if (tl.fc_len > end - val ||
+ !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -ECANCELED;
goto out_err;
@@ -2178,9 +2173,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
#endif
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
@@ -2249,17 +2244,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
-static const char *fc_ineligible_reasons[] = {
- "Extended attributes changed",
- "Cross rename",
- "Journal flag changed",
- "Insufficient memory",
- "Swap boot",
- "Resize",
- "Dir renamed",
- "Falloc range op",
- "Data journalling",
- "FC Commit Failed"
+static const char * const fc_ineligible_reasons[] = {
+ [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
+ [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
+ [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
+ [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
+ [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
+ [EXT4_FC_REASON_RESIZE] = "Resize",
+ [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
+ [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
+ [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
+ [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};
int ext4_fc_info_show(struct seq_file *seq, void *v)
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index a6154c3ed135..2fadb2c4780c 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -58,7 +58,7 @@ struct ext4_fc_dentry_info {
__u8 fc_dname[];
};
-/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+/* Value structure for EXT4_FC_TAG_INODE. */
struct ext4_fc_inode {
__le32 fc_ino;
__u8 fc_raw_inode[];
@@ -96,6 +96,7 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_REASON_ENCRYPTED_FILENAME,
EXT4_FC_REASON_MAX
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a7a597c727e6..7ac0a81bd371 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -955,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e9bc46684106..63f9bb6e8851 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -465,7 +465,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
parent_group = hinfo.hash % ngroups;
} else
- parent_group = prandom_u32_max(ngroups);
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
@@ -870,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
struct super_block *sb = dir->i_sb;
int nblocks = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -1076,8 +1076,8 @@ repeat_in_this_group:
if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
- handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks, 0,
+ handle = __ext4_journal_start_sb(NULL, dir->i_sb,
+ line_no, handle_type, nblocks, 0,
ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 860fc5119009..c68bebe7ff4b 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ unsigned int key;
int ret = -EIO;
*err = 0;
@@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
+ key = le32_to_cpu(p->key);
+ if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+ /* the block was out of range */
+ ret = -EFSCORRUPTED;
+ goto failure;
+ }
+ bh = sb_getblk(sb, key);
if (unlikely(!bh)) {
ret = -ENOMEM;
goto failure;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index a4fbe825694b..2b42ececa46d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer,
BUG_ON(len > EXT4_I(inode)->i_inline_size);
- cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
- len : EXT4_MIN_INLINE_DATA_SIZE;
+ cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);
raw_inode = ext4_raw_inode(iloc);
memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2b5ef1b64249..9d9f414f99fe 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode)
/*
* For inodes with journalled data, transaction commit could have
- * dirtied the inode. Flush worker is ignoring it because of I_FREEING
- * flag but we still need to remove the inode from the writeback lists.
+ * dirtied the inode. And for inodes with dioread_nolock, unwritten
+ * extents converting worker could merge extents and also have dirtied
+ * the inode. Flush worker is ignoring it because of I_FREEING flag but
+ * we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list)) {
- WARN_ON_ONCE(!ext4_should_journal_data(inode));
+ if (!list_empty_careful(&inode->i_io_list))
inode_io_list_del(inode);
- }
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -335,6 +335,12 @@ stop_handle:
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ /*
+ * Check out some where else accidentally dirty the evicting inode,
+ * which may probably cause inode use-after-free issues later.
+ */
+ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
+
if (!list_empty(&EXT4_I(inode)->i_fc_list))
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -1309,7 +1315,8 @@ static int ext4_write_end(struct file *file,
trace_ext4_write_end(inode, pos, len, copied);
- if (ext4_has_inline_data(inode))
+ if (ext4_has_inline_data(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
return ext4_write_inline_data_end(inode, pos, len, copied, page);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -1543,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free)
*/
struct mpage_da_data {
+ /* These are input fields for ext4_do_writepages() */
struct inode *inode;
struct writeback_control *wbc;
+ unsigned int can_map:1; /* Can writepages call map blocks? */
+ /* These are internal state of ext4_do_writepages() */
pgoff_t first_page; /* The first page to write */
pgoff_t next_page; /* Current page to examine */
pgoff_t last_page; /* Last page to examine */
@@ -2009,7 +2019,6 @@ static int ext4_writepage(struct page *page,
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
struct ext4_io_submit io_submit;
- bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
folio_invalidate(folio, 0, folio_size(folio));
@@ -2067,7 +2076,6 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return 0;
}
- keep_towrite = true;
}
if (PageChecked(page) && ext4_should_journal_data(inode))
@@ -2084,7 +2092,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
- ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
+ ret = ext4_bio_write_page(&io_submit, page, len);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
@@ -2118,7 +2126,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
- err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
@@ -2551,18 +2559,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
+/* Return true if the page needs to be written as part of transaction commit */
+static bool ext4_page_nomap_can_writeout(struct page *page)
+{
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh))
+ return true;
+ } while ((bh = bh->b_this_page) != head);
+ return false;
+}
+
/*
* mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
- * and underlying extent to map
+ * needing mapping, submit mapped pages
*
* @mpd - where to look for pages
*
* Walk dirty pages in the mapping. If they are fully mapped, submit them for
- * IO immediately. When we find a page which isn't mapped we start accumulating
- * extent of buffers underlying these pages that needs mapping (formed by
- * either delayed or unwritten buffers). We also lock the pages containing
- * these buffers. The extent found is returned in @mpd structure (starting at
- * mpd->lblk with length mpd->len blocks).
+ * IO immediately. If we cannot map blocks, we submit just already mapped
+ * buffers in the page for IO and keep page dirty. When we can map blocks and
+ * we find a page which isn't mapped we start accumulating extent of buffers
+ * underlying these pages that needs mapping (formed by either delayed or
+ * unwritten buffers). We also lock the pages containing these buffers. The
+ * extent found is returned in @mpd structure (starting at mpd->lblk with
+ * length mpd->len blocks).
*
* Note that this function can attach bios to one io_end structure which are
* neither logically nor physically contiguous. Although it may seem as an
@@ -2653,14 +2676,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- /* Add all dirty buffers to mpd */
- lblk = ((ext4_lblk_t)page->index) <<
- (PAGE_SHIFT - blkbits);
- head = page_buffers(page);
- err = mpage_process_page_bufs(mpd, head, head, lblk);
- if (err <= 0)
- goto out;
- err = 0;
+ /*
+ * Writeout for transaction commit where we cannot
+ * modify metadata is simple. Just submit the page.
+ */
+ if (!mpd->can_map) {
+ if (ext4_page_nomap_can_writeout(page)) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
+ goto out;
+ } else {
+ unlock_page(page);
+ mpd->first_page++;
+ }
+ } else {
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head,
+ lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ }
left--;
}
pagevec_release(&pvec);
@@ -2673,25 +2712,27 @@ out:
return err;
}
-static int ext4_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc,
+ void *data)
{
+ return ext4_writepage(page, wbc);
+}
+
+static int ext4_do_writepages(struct mpage_da_data *mpd)
+{
+ struct writeback_control *wbc = mpd->wbc;
pgoff_t writeback_index = 0;
long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
int cycled = 1;
handle_t *handle = NULL;
- struct mpage_da_data mpd;
- struct inode *inode = mapping->host;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
struct blk_plug plug;
bool give_up_on_write = false;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
-
- percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2703,7 +2744,9 @@ static int ext4_writepages(struct address_space *mapping,
goto out_writepages;
if (ext4_should_journal_data(inode)) {
- ret = generic_writepages(mapping, wbc);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL);
+ blk_finish_plug(&plug);
goto out_writepages;
}
@@ -2757,19 +2800,18 @@ static int ext4_writepages(struct address_space *mapping,
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd.first_page = writeback_index;
- mpd.last_page = -1;
+ mpd->first_page = writeback_index;
+ mpd->last_page = -1;
} else {
- mpd.first_page = wbc->range_start >> PAGE_SHIFT;
- mpd.last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->first_page = wbc->range_start >> PAGE_SHIFT;
+ mpd->last_page = wbc->range_end >> PAGE_SHIFT;
}
- mpd.inode = inode;
- mpd.wbc = wbc;
- ext4_io_submit_init(&mpd.io_submit, wbc);
+ ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ tag_pages_for_writeback(mapping, mpd->first_page,
+ mpd->last_page);
blk_start_plug(&plug);
/*
@@ -2778,31 +2820,32 @@ retry:
* in the block layer on device congestion while having transaction
* started.
*/
- mpd.do_map = 0;
- mpd.scanned_until_end = 0;
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->do_map = 0;
+ mpd->scanned_until_end = 0;
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
goto unplug;
}
- ret = mpage_prepare_extent_to_map(&mpd);
+ ret = mpage_prepare_extent_to_map(mpd);
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
+ mpage_release_unused_pages(mpd, false);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
- ext4_put_io_end_defer(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_io_submit(&mpd->io_submit);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret < 0)
goto unplug;
- while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
+ while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
break;
}
+ WARN_ON_ONCE(!mpd->can_map);
/*
* We have two constraints: We find one extent to map and we
* must always write out whole page (makes a difference when
@@ -2822,16 +2865,16 @@ retry:
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
break;
}
- mpd.do_map = 1;
+ mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
- ret = mpage_prepare_extent_to_map(&mpd);
- if (!ret && mpd.map.m_len)
- ret = mpage_map_and_submit_extent(handle, &mpd,
+ trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+ ret = mpage_prepare_extent_to_map(mpd);
+ if (!ret && mpd->map.m_len)
+ ret = mpage_map_and_submit_extent(handle, mpd,
&give_up_on_write);
/*
* Caution: If the handle is synchronous,
@@ -2846,12 +2889,12 @@ retry:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
- mpd.do_map = 0;
+ mpd->do_map = 0;
}
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, give_up_on_write);
+ mpage_release_unused_pages(mpd, give_up_on_write);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
+ ext4_io_submit(&mpd->io_submit);
/*
* Drop our io_end reference we got from init. We have
@@ -2861,11 +2904,11 @@ retry:
* up doing unwritten extent conversion.
*/
if (handle) {
- ext4_put_io_end_defer(mpd.io_submit.io_end);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
ext4_journal_stop(handle);
} else
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2885,8 +2928,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd.last_page = writeback_index - 1;
- mpd.first_page = 0;
+ mpd->last_page = writeback_index - 1;
+ mpd->first_page = 0;
goto retry;
}
@@ -2896,15 +2939,51 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd.first_page;
+ mapping->writeback_index = mpd->first_page;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct super_block *sb = mapping->host->i_sb;
+ struct mpage_da_data mpd = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .can_map = 1,
+ };
+ int ret;
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ return -EIO;
+
+ percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ ret = ext4_do_writepages(&mpd);
+ percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
+
+ return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct mpage_da_data mpd = {
+ .inode = jinode->i_vfs_inode,
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+ return ext4_do_writepages(&mpd);
+}
+
static int ext4_dax_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -3646,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
static const struct address_space_operations ext4_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
@@ -3664,7 +3742,6 @@ static const struct address_space_operations ext4_aops = {
static const struct address_space_operations ext4_journalled_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
@@ -3673,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.invalidate_folio = ext4_journalled_invalidate_folio,
.release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
+ .migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
@@ -3681,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = {
static const struct address_space_operations ext4_da_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
@@ -4225,7 +4302,8 @@ int ext4_truncate(struct inode *inode)
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
- if (ext4_inode_attach_jinode(inode) < 0)
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
goto out_trace;
}
@@ -4473,9 +4551,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
- block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ block = ext4_inode_table(sb, gdp);
+ if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+ (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+ ext4_error(sb, "Invalid inode table block %llu in "
+ "block_group %u", block, iloc->block_group);
+ return -EFSCORRUPTED;
+ }
+ block += (inode_offset / inodes_per_block);
+
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
@@ -5044,8 +5130,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
- brelse(iloc.bh);
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ ext4_error_inode(inode, function, line, 0,
+ "bad inode without EXT4_IGET_BAD flag");
+ ret = -EUCLEAN;
+ goto bad_inode;
+ }
+ brelse(iloc.bh);
unlock_new_inode(inode);
return inode;
@@ -5550,7 +5642,7 @@ out_mmap_sem:
ext4_orphan_del(NULL, inode);
if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
err_out:
if (error)
@@ -5853,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode,
return 0;
}
+ /*
+ * We may need to allocate external xattr block so we need quotas
+ * initialized. Here we can be called with various locks held so we
+ * cannot affort to initialize quotas ourselves. So just bail.
+ */
+ if (dquot_initialize_needed(inode))
+ return -EAGAIN;
+
/* try to expand with EAs present */
error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
raw_inode, handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 95dfea28bf4e..8067ccda34e4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -374,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
blkcnt_t blocks;
unsigned short bytes;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO,
+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
@@ -424,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
- if (inode_bl->i_nlink == 0) {
+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
@@ -731,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
if (ext4_is_quota_file(inode))
return err;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
@@ -746,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
brelse(iloc.bh);
}
- err = dquot_initialize(inode);
- if (err)
- return err;
-
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
@@ -1153,19 +1154,22 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
if (fsuuid.fsu_len == 0) {
fsuuid.fsu_len = UUID_SIZE;
- if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+ if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len,
+ sizeof(fsuuid.fsu_len)))
return -EFAULT;
- return -EINVAL;
+ return 0;
}
- if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0)
return -EINVAL;
lock_buffer(sbi->s_sbh);
memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
unlock_buffer(sbi->s_sbh);
- if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) ||
+ copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
return -EFAULT;
return 0;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9dad93059945..5b2ae37a8b80 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5204,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
mutex_lock(&ac->ac_lg->lg_mutex);
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
@@ -5253,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
inode_is_open_for_write(ar->inode) ? "" : "non-");
- return 0;
-
}
static noinline_for_stack void
@@ -5591,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out;
}
- *errp = ext4_mb_initialize_context(ac, ar);
- if (*errp) {
- ar->len = 0;
- goto out;
- }
+ ext4_mb_initialize_context(ac, ar);
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
seq = this_cpu_read(discard_pa_seq);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 588cb09c5291..4681fff6665f 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi)
*/
static unsigned int mmp_new_seq(void)
{
- u32 new_seq;
-
- do {
- new_seq = get_random_u32();
- } while (new_seq > EXT4_MMP_SEQ_MAX);
-
- return new_seq;
+ return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1);
}
/*
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c08c0aba1883..dd28453d6ea3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3204,14 +3204,20 @@ end_rmdir:
return retval;
}
-int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode)
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode,
+ struct dentry *dentry /* NULL during fast_commit recovery */)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ handle_t *handle;
int skip_remove_dentry = 0;
+ /*
+ * Keep this outside the transaction; it may have to set up the
+ * directory's encryption key, which isn't GFP_NOFS-safe.
+ */
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
- goto out;
+ goto out_bh;
+ }
+
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
@@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto out;
+ goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto out;
+ goto out_handle;
} else {
retval = 0;
}
@@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
-
-out:
+ if (dentry && !retval)
+ ext4_fc_track_unlink(handle, dentry);
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
- handle_t *handle;
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
@@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (retval)
goto out_trace;
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- goto out_trace;
- }
-
- retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
- if (!retval)
- ext4_fc_track_unlink(handle, dentry);
+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- if (handle)
- ext4_journal_stop(handle);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
@@ -3794,6 +3798,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = dquot_initialize(old.dir);
if (retval)
return retval;
+ retval = dquot_initialize(old.inode);
+ if (retval)
+ return retval;
retval = dquot_initialize(new.dir);
if (retval)
return retval;
@@ -4194,7 +4201,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
@@ -4205,6 +4212,6 @@ const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 69a9cf9137a6..e5b47dda3317 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
/* don't clear list on RO mount w/ errors */
if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
- "clearing orphan list.\n");
+ "clearing orphan list.");
es->s_last_orphan = 0;
}
ext4_debug("Skipping orphan recovery on fs with errors.\n");
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 97fa7b4c645f..beaec6d81074 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -430,25 +430,20 @@ submit_and_retry:
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
- int len,
- bool keep_towrite)
+ int len)
{
struct page *bounce_page = NULL;
struct inode *inode = page->mapping->host;
unsigned block_start;
struct buffer_head *bh, *head;
int ret = 0;
- int nr_submitted = 0;
int nr_to_submit = 0;
struct writeback_control *wbc = io->io_wbc;
+ bool keep_towrite = false;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- if (keep_towrite)
- set_page_writeback_keepwrite(page);
- else
- set_page_writeback(page);
ClearPageError(page);
/*
@@ -482,16 +477,31 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/* A hole? We can safely clear the dirty bit */
if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
+ /*
+ * Keeping dirty some buffer we cannot write? Make sure
+ * to redirty the page and keep TOWRITE tag so that
+ * racing WB_SYNC_ALL writeback does not skip the page.
+ * This happens e.g. when doing writeout for
+ * transaction commit.
+ */
+ if (buffer_dirty(bh)) {
+ if (!PageDirty(page))
+ redirty_page_for_writepage(wbc, page);
+ keep_towrite = true;
+ }
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
set_buffer_async_write(bh);
+ clear_buffer_dirty(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
+ /* Nothing to submit? Just unlock the page... */
+ if (!nr_to_submit)
+ goto unlock;
+
bh = head = page_buffers(page);
/*
@@ -532,27 +542,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
redirty_page_for_writepage(wbc, page);
do {
- clear_buffer_async_write(bh);
+ if (buffer_async_write(bh)) {
+ clear_buffer_async_write(bh);
+ set_buffer_dirty(bh);
+ }
bh = bh->b_this_page;
} while (bh != head);
goto unlock;
}
}
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
+
/* Now submit buffers to write */
do {
if (!buffer_async_write(bh))
continue;
io_submit_add_bh(io, inode,
bounce_page ? bounce_page : page, bh);
- nr_submitted++;
- clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
-
unlock:
unlock_page(page);
- /* Nothing submitted - we have to end page writeback */
- if (!nr_submitted)
- end_page_writeback(page);
return ret;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3d21eae267fc..d5266932ce6c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, iter_all) {
page = bv->bv_page;
- /* PG_error was set if verity failed. */
- if (bio->bi_status || PageError(page)) {
+ if (bio->bi_status)
ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
+ else
SetPageUptodate(page);
- }
unlock_page(page);
}
if (bio->bi_private)
@@ -410,9 +406,8 @@ int ext4_mpage_readpages(struct inode *inode,
int __init ext4_init_post_read_processing(void)
{
- bio_post_read_ctx_cache =
- kmem_cache_create("ext4_bio_post_read_ctx",
- sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
+
if (!bio_post_read_ctx_cache)
goto fail;
bio_post_read_ctx_pool =
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 46b87ffeb304..6b91443d6bf3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1110,6 +1110,16 @@ exit_free:
return err;
}
+static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
+ ext4_group_t group)
+{
+ struct ext4_super_block *es = (struct ext4_super_block *) data;
+
+ es->s_block_group_nr = cpu_to_le16(group);
+ if (ext4_has_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
/*
* Update the backup copies of the ext4 metadata. These don't need to be part
* of the main resize transaction, because e2fsck will re-write them if there
@@ -1158,7 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
while (group < sbi->s_groups_count) {
struct buffer_head *bh;
ext4_fsblk_t backup_block;
- struct ext4_super_block *es;
+ int has_super = ext4_bg_has_super(sb, group);
+ ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group);
/* Out of journal space, and can't get more - abort - so sad */
err = ext4_resize_ensure_credits_batch(handle, 1);
@@ -1168,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
if (meta_bg == 0)
backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
- backup_block = (ext4_group_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group));
+ backup_block = first_block + has_super;
bh = sb_getblk(sb, backup_block);
if (unlikely(!bh)) {
@@ -1187,10 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
- es = (struct ext4_super_block *) bh->b_data;
- es->s_block_group_nr = cpu_to_le16(group);
- if (ext4_has_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ if (has_super && (backup_block == first_block))
+ ext4_set_block_group_nr(sb, bh->b_data, group);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1476,8 +1484,6 @@ static void ext4_update_super(struct super_block *sb,
* active. */
ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
reserved_blocks);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
/* Update the free space counts */
percpu_counter_add(&sbi->s_freeclusters_counter,
@@ -1513,6 +1519,8 @@ static void ext4_update_super(struct super_block *sb,
ext4_calculate_overhead(sb);
es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
"%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
@@ -1596,8 +1604,8 @@ exit_journal:
int meta_bg = ext4_has_feature_meta_bg(sb);
sector_t old_gdb = 0;
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block), 0);
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
+ (char *)es, sizeof(struct ext4_super_block), 0);
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
@@ -1808,7 +1816,7 @@ errout:
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
@@ -1831,7 +1839,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- int err;
ext4_group_t group;
o_blocks_count = ext4_blocks_count(es);
@@ -1886,8 +1893,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
brelse(bh);
- err = ext4_group_extend_no_check(sb, o_blocks_count, add);
- return err;
+ return ext4_group_extend_no_check(sb, o_blocks_count, add);
} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7cdd2138c897..16a343e8047d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -540,8 +540,7 @@ static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
if (ext4_should_journal_data(jinode->i_vfs_inode))
ret = ext4_journalled_submit_inode_data_buffers(jinode);
else
- ret = jbd2_journal_submit_inode_data_buffers(jinode);
-
+ ret = ext4_normal_submit_inode_data_buffers(jinode);
return ret;
}
@@ -1206,7 +1205,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_sysfs(sb);
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
- ext4_msg(sb, KERN_INFO, "unmounting filesystem.");
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
+ &sb->s_uuid);
ext4_unregister_li_request(sb);
ext4_quota_off_umount(sb);
@@ -1323,6 +1323,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
inode_set_iversion(&ei->vfs_inode, 1);
+ ei->i_flags = 0;
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
atomic_set(&ei->i_prealloc_active, 0);
@@ -2247,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
- error = fs_lookup_param(fc, param, 1, &path);
+ error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
if (error) {
ext4_msg(NULL, KERN_ERR, "error: could not find "
"journal device path");
@@ -3778,7 +3779,7 @@ cont_thread:
}
if (!progress) {
elr->lr_next_sched = jiffies +
- prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ);
+ get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
if (time_before(elr->lr_next_sched, next_wakeup))
next_wakeup = elr->lr_next_sched;
@@ -3925,8 +3926,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- elr->lr_next_sched = jiffies + prandom_u32_max(
- EXT4_DEF_LI_MAX_START_DELAY * HZ);
+ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
return elr;
}
@@ -5287,14 +5287,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
+ "journal_async_commit, fs mounted w/o journal");
goto failed_mount3a;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit, fs mounted w/o journal");
+ "journal_checksum, fs mounted w/o journal");
goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
@@ -5655,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
descr = "out journal";
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Quota mode: %s.", descr, ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. "
+ "Quota mode: %s.", &sb->s_uuid, descr,
+ ext4_quota_mode(sb));
/* Update the s_overhead_clusters if necessary */
ext4_update_overhead(sb, false);
@@ -5723,7 +5725,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
- if (!S_ISREG(journal_inode->i_mode)) {
+ if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
return NULL;
@@ -6611,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.",
+ &sb->s_uuid, ext4_quota_mode(sb));
return 0;
}
@@ -6886,6 +6888,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
return err;
}
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
+{
+ switch (type) {
+ case USRQUOTA:
+ return qf_inum == EXT4_USR_QUOTA_INO;
+ case GRPQUOTA:
+ return qf_inum == EXT4_GRP_QUOTA_INO;
+ case PRJQUOTA:
+ return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
+ default:
+ BUG();
+ }
+}
+
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
@@ -6902,9 +6918,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
if (!qf_inums[type])
return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) {
+ ext4_error(sb, "Bad quota inum: %lu, type: %d",
+ qf_inums[type], type);
+ return -EUCLEAN;
+ }
+
qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
- ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ ext4_error(sb, "Bad quota inode: %lu, type: %d",
+ qf_inums[type], type);
return PTR_ERR(qf_inode);
}
@@ -6943,8 +6966,9 @@ int ext4_enable_quotas(struct super_block *sb)
if (err) {
ext4_warning(sb,
"Failed to enable quota tracking "
- "(type=%d, err=%d). Please run "
- "e2fsck to fix.", type, err);
+ "(type=%d, err=%d, ino=%lu). "
+ "Please run e2fsck to fix.", type,
+ err, qf_inums[type]);
for (type--; type >= 0; type--) {
struct inode *inode;
@@ -7031,8 +7055,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
bh = ext4_bread(NULL, inode, blk, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 3c640bd7ecae..30e3b65798b5 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int res;
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 36d6ba7190b6..7decaaf27e82 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1281,7 +1281,7 @@ retry_ref:
ce = mb_cache_entry_get(ea_block_cache, hash,
bh->b_blocknr);
if (ce) {
- ce->e_reusable = 1;
+ set_bit(MBE_REUSABLE_B, &ce->e_flags);
mb_cache_entry_put(ea_block_cache, ce);
}
}
@@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
if (!err)
err = ext4_inode_attach_jinode(ea_inode);
if (err) {
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode,
+ "cleanup dec ref error %d", err);
iput(ea_inode);
return ERR_PTR(err);
}
@@ -1540,7 +1543,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
if (err) {
- ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
iput(ea_inode);
return err;
}
@@ -2042,7 +2046,7 @@ inserted:
}
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
if (ref == EXT4_XATTR_REFCOUNT_MAX)
- ce->e_reusable = 0;
+ clear_bit(MBE_REUSABLE_B, &ce->e_flags);
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
ext4_xattr_block_csum_set(inode, new_bh);
@@ -2070,19 +2074,11 @@ inserted:
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
-
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
-
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
if (error)
goto cleanup;
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
-
ea_idebug(inode, "creating block %llu",
(unsigned long long)block);
@@ -2555,7 +2551,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- buffer = kmalloc(value_size, GFP_NOFS);
+ buffer = kvmalloc(value_size, GFP_NOFS);
b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
if (!is || !bs || !buffer || !b_entry_name) {
error = -ENOMEM;
@@ -2607,7 +2603,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
error = 0;
out:
kfree(b_entry_name);
- kfree(buffer);
+ kvfree(buffer);
if (is)
brelse(is->iloc.bh);
if (bs)
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 5bbc44a5216e..c1c74aa658ae 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns,
return error;
}
-int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
+
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index a26e33cab4ff..ea2bbb3f264b 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -34,7 +34,7 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool);
-extern int f2fs_set_acl(struct user_namespace *, struct inode *,
+extern int f2fs_set_acl(struct user_namespace *, struct dentry *,
struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
struct page *);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d315c2de136f..2b7a5cc4ed66 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1711,50 +1711,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
}
}
-/*
- * Update and unlock the cluster's pagecache pages, and release the reference to
- * the decompress_io_ctx that was being held for I/O completion.
- */
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
- bool in_task)
+static void f2fs_verify_cluster(struct work_struct *work)
{
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, verity_work);
int i;
+ /* Verify, update, and unlock the decompressed pages. */
for (i = 0; i < dic->cluster_size; i++) {
struct page *rpage = dic->rpages[i];
if (!rpage)
continue;
- /* PG_error was set if verity failed. */
- if (failed || PageError(rpage)) {
- ClearPageUptodate(rpage);
- /* will re-read again later */
- ClearPageError(rpage);
- } else {
+ if (fsverity_verify_page(rpage))
SetPageUptodate(rpage);
- }
+ else
+ ClearPageUptodate(rpage);
unlock_page(rpage);
}
- f2fs_put_dic(dic, in_task);
-}
-
-static void f2fs_verify_cluster(struct work_struct *work)
-{
- struct decompress_io_ctx *dic =
- container_of(work, struct decompress_io_ctx, verity_work);
- int i;
-
- /* Verify the cluster's decompressed pages with fs-verity. */
- for (i = 0; i < dic->cluster_size; i++) {
- struct page *rpage = dic->rpages[i];
-
- if (rpage && !fsverity_verify_page(rpage))
- SetPageError(rpage);
- }
-
- __f2fs_decompress_end_io(dic, false, true);
+ f2fs_put_dic(dic, true);
}
/*
@@ -1764,6 +1741,8 @@ static void f2fs_verify_cluster(struct work_struct *work)
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
bool in_task)
{
+ int i;
+
if (!failed && dic->need_verity) {
/*
* Note that to avoid deadlocks, the verity work can't be done
@@ -1773,9 +1752,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
*/
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
- } else {
- __f2fs_decompress_end_io(dic, failed, in_task);
+ return;
+ }
+
+ /* Update and unlock the cluster's pagecache pages. */
+ for (i = 0; i < dic->cluster_size; i++) {
+ struct page *rpage = dic->rpages[i];
+
+ if (!rpage)
+ continue;
+
+ if (failed)
+ ClearPageUptodate(rpage);
+ else
+ SetPageUptodate(rpage);
+ unlock_page(rpage);
}
+
+ /*
+ * Release the reference to the decompress_io_ctx that was being held
+ * for I/O completion.
+ */
+ f2fs_put_dic(dic, in_task);
}
/*
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a71e818cd67b..7af75041bd81 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -116,43 +116,56 @@ struct bio_post_read_ctx {
struct f2fs_sb_info *sbi;
struct work_struct work;
unsigned int enabled_steps;
+ /*
+ * decompression_attempted keeps track of whether
+ * f2fs_end_read_compressed_page() has been called on the pages in the
+ * bio that belong to a compressed cluster yet.
+ */
+ bool decompression_attempted;
block_t fs_blkaddr;
};
+/*
+ * Update and unlock a bio's pages, and free the bio.
+ *
+ * This marks pages up-to-date only if there was no error in the bio (I/O error,
+ * decryption error, or verity error), as indicated by bio->bi_status.
+ *
+ * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk)
+ * aren't marked up-to-date here, as decompression is done on a per-compression-
+ * cluster basis rather than a per-bio basis. Instead, we only must do two
+ * things for each compressed page here: call f2fs_end_read_compressed_page()
+ * with failed=true if an error occurred before it would have normally gotten
+ * called (i.e., I/O error or decryption error, but *not* verity error), and
+ * release the bio's reference to the decompress_io_ctx of the page's cluster.
+ */
static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- /*
- * Update and unlock the bio's pagecache pages, and put the
- * decompression context for any compressed pages.
- */
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
if (f2fs_is_compressed_page(page)) {
- if (bio->bi_status)
+ if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(page, true, 0,
in_task);
f2fs_put_page_dic(page, in_task);
continue;
}
- /* PG_error was set if verity failed. */
- if (bio->bi_status || PageError(page)) {
+ if (bio->bi_status)
ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
+ else
SetPageUptodate(page);
- }
dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ if (ctx)
+ mempool_free(ctx, bio_post_read_ctx_pool);
bio_put(bio);
}
@@ -185,8 +198,10 @@ static void f2fs_verify_bio(struct work_struct *work)
struct page *page = bv->bv_page;
if (!f2fs_is_compressed_page(page) &&
- !fsverity_verify_page(page))
- SetPageError(page);
+ !fsverity_verify_page(page)) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
}
} else {
fsverity_verify_bio(bio);
@@ -245,6 +260,8 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
blkaddr++;
}
+ ctx->decompression_attempted = true;
+
/*
* Optimization: if all the bio's pages are compressed, then scheduling
* the per-bio verity work is unnecessary, as verity will be fully
@@ -1062,6 +1079,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx->sbi = sbi;
ctx->enabled_steps = post_read_steps;
ctx->fs_blkaddr = blkaddr;
+ ctx->decompression_attempted = false;
bio->bi_private = ctx;
}
iostat_alloc_and_bind_ctx(sbi, bio, ctx);
@@ -1089,7 +1107,6 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
bio_put(bio);
return -EFAULT;
}
- ClearPageError(page);
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
__submit_bio(sbi, bio, DATA);
@@ -2141,7 +2158,6 @@ submit_and_realloc:
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
F2FS_BLKSIZE);
- ClearPageError(page);
*last_block_in_bio = block_nr;
goto out;
out:
@@ -2289,7 +2305,6 @@ submit_and_realloc:
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
- ClearPageError(page);
*last_block_in_bio = blkaddr;
}
@@ -2306,7 +2321,6 @@ out:
for (i = 0; i < cc->cluster_size; i++) {
if (cc->rpages[i]) {
ClearPageUptodate(cc->rpages[i]);
- ClearPageError(cc->rpages[i]);
unlock_page(cc->rpages[i]);
}
}
@@ -2403,7 +2417,6 @@ read_single_page:
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
#endif
- SetPageError(page);
zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 82cda1258227..83df6f6173d3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1025,7 +1025,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
__setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -1046,7 +1046,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const struct inode_operations f2fs_file_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 4546e01b2ee0..536d332d9e2e 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -282,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
/* let's select beginning hot/small space first in no_heap mode*/
if (f2fs_need_rand_seg(sbi))
- p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec);
+ p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
else if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a389772fd212..c227113b0f26 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1379,7 +1379,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
@@ -1397,7 +1397,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
const struct inode_operations f2fs_special_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
};
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index acf3d3fa4363..b304692c0cf5 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2534,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
sanity_check_seg_type(sbi, seg_type);
if (f2fs_need_rand_seg(sbi))
- return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec);
+ return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
/* if segs_per_sec is large than 1, we need to keep original policy. */
if (__is_large_section(sbi))
@@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->alloc_type = LFS;
if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
curseg->fragment_remained_chunk =
- prandom_u32_max(sbi->max_fragment_chunk) + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
/* To allocate block chunks in different sizes, use random number */
if (--seg->fragment_remained_chunk <= 0) {
seg->fragment_remained_chunk =
- prandom_u32_max(sbi->max_fragment_chunk) + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
seg->next_blkoff +=
- prandom_u32_max(sbi->max_fragment_hole) + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
}
}
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index af191371c352..3626eb585a98 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -17,7 +17,7 @@ struct fat_fid {
#define FAT_FID_SIZE_WITHOUT_PARENT 3
#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32))
-/**
+/*
* Look up a directory inode given its starting cluster.
*/
static struct inode *fat_dget(struct super_block *sb, int i_logstart)
@@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
return type;
}
-/**
+/*
* Map a NFS file handle to a corresponding dentry.
* The dentry may or may not be connected to the filesystem root.
*/
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ed40ce5742fd..edb3712dcfa5 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse);
* @fc: The filesystem context to log errors through.
* @param: The parameter.
* @want_bdev: T if want a blockdev
+ * @flags: Pathwalk flags passed to filename_lookup()
* @_path: The result of the lookup
*/
int fs_lookup_param(struct fs_context *fc,
struct fs_parameter *param,
bool want_bdev,
+ unsigned int flags,
struct path *_path)
{
struct filename *f;
- unsigned int flags = 0;
bool put_f;
int ret;
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 451d8a077e12..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -605,6 +605,14 @@ again:
set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
queue = true;
}
+ /*
+ * We could race with cookie_lru which may set LRU_DISCARD bit
+ * but has yet to run the cookie state machine. If this happens
+ * and another thread tries to use the cookie, clear LRU_DISCARD
+ * so we don't end up withdrawing the cookie while in use.
+ */
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags))
+ fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear);
break;
case FSCACHE_COOKIE_STATE_FAILED:
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index 3af3b08a9bb3..0d2b8dec8f82 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
* taken into account.
*/
- iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
fscache_write(cres, start, &iter, fscache_wreq_done, wreq);
return;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 337cb29a8dd5..a4850aee2639 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
const char *name;
int ret;
@@ -98,7 +99,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
return ret;
}
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) &&
!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index c7d882a9fe33..a06fbb1a8a5b 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
{
struct fuse_dev *fud = file->private_data;
struct cuse_conn *cc = fc_to_cc(fud->fc);
- int rc;
/* remove from the conntbl, no more access from this point on */
mutex_lock(&cuse_lock);
@@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
cdev_del(cc->cdev);
}
- rc = fuse_dev_release(inode, file); /* puts the base reference */
-
- return rc;
+ return fuse_dev_release(inode, file);
}
static struct file_operations cuse_channel_fops; /* initialized during init */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b4a6e0a1b945..c73d9c4132f6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -2267,8 +2267,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (old->f_op == file->f_op &&
- old->f_cred->user_ns == file->f_cred->user_ns)
+ if (old->f_op == file->f_op)
fud = fuse_get_dev(old);
if (fud) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bb97a384dc5d..cd1a071b625a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (inode && fuse_is_bad(inode))
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
- (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) {
+ (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) {
struct fuse_entry_out outarg;
FUSE_ARGS(args);
struct fuse_forget_link *forget;
@@ -1170,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
- u64 child_nodeid, struct qstr *name)
+ u64 child_nodeid, struct qstr *name, u32 flags)
{
int err = -ENOTDIR;
struct inode *parent;
@@ -1197,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
goto unlock;
fuse_dir_changed(parent);
- fuse_invalidate_entry(entry);
+ if (!(flags & FUSE_EXPIRE_ONLY))
+ d_invalidate(entry);
+ fuse_invalidate_entry_cache(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
inode_lock(d_inode(entry));
@@ -1235,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
return err;
}
+static inline bool fuse_permissible_uidgid(struct fuse_conn *fc)
+{
+ const struct cred *cred = current_cred();
+
+ return (uid_eq(cred->euid, fc->user_id) &&
+ uid_eq(cred->suid, fc->user_id) &&
+ uid_eq(cred->uid, fc->user_id) &&
+ gid_eq(cred->egid, fc->group_id) &&
+ gid_eq(cred->sgid, fc->group_id) &&
+ gid_eq(cred->gid, fc->group_id));
+}
+
/*
* Calling into a user-controlled filesystem gives the filesystem
* daemon ptrace-like capabilities over the current process. This
@@ -1248,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
* for which the owner of the mount has ptrace privilege. This
* excludes processes started by other users, suid or sgid processes.
*/
-int fuse_allow_current_process(struct fuse_conn *fc)
+bool fuse_allow_current_process(struct fuse_conn *fc)
{
- const struct cred *cred;
-
- if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
- return 1;
+ bool allow;
if (fc->allow_other)
- return current_in_userns(fc->user_ns);
+ allow = current_in_userns(fc->user_ns);
+ else
+ allow = fuse_permissible_uidgid(fc);
- cred = current_cred();
- if (uid_eq(cred->euid, fc->user_id) &&
- uid_eq(cred->suid, fc->user_id) &&
- uid_eq(cred->uid, fc->user_id) &&
- gid_eq(cred->egid, fc->group_id) &&
- gid_eq(cred->sgid, fc->group_id) &&
- gid_eq(cred->gid, fc->group_id))
- return 1;
+ if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ allow = true;
- return 0;
+ return allow;
}
static int fuse_access(struct inode *inode, int mask)
@@ -1935,7 +1942,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
@@ -1957,7 +1964,7 @@ static const struct inode_operations fuse_common_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 89f4741728ba..875314ee6f59 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
if (fc->handle_killpriv_v2 &&
- should_remove_suid(file_dentry(file))) {
+ setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
goto writethrough;
}
@@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
return res;
}
+static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
+ bool exclusive_lock =
+ !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
+ iocb->ki_flags & IOCB_APPEND ||
+ fuse_direct_write_extending_i_size(iocb, from);
+
+ /*
+ * Take exclusive lock if
+ * - Parallel direct writes are disabled - a user space decision
+ * - Parallel direct writes are enabled and i_size is being extended.
+ * This might not be needed at all, but needs further investigation.
+ */
+ if (exclusive_lock)
+ inode_lock(inode);
+ else {
+ inode_lock_shared(inode);
+
+ /* A race with truncate might have come up as the decision for
+ * the lock type was done without holding the lock, check again.
+ */
+ if (fuse_direct_write_extending_i_size(iocb, from)) {
+ inode_unlock_shared(inode);
+ inode_lock(inode);
+ exclusive_lock = true;
+ }
+ }
- /* Don't allow parallel writes to the same file */
- inode_lock(inode);
res = generic_write_checks(iocb, from);
if (res > 0) {
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
@@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- inode_unlock(inode);
+ if (exclusive_lock)
+ inode_unlock(inode);
+ else
+ inode_unlock_shared(inode);
return res;
}
@@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
fuse_write_update_attr(inode, pos, ret);
+ /* For extending writes we already hold exclusive lock */
if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 98a9cf531873..c673faefdcb9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1179,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr);
/**
* Is current process allowed to perform filesystem operation?
*/
-int fuse_allow_current_process(struct fuse_conn *fc);
+bool fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
@@ -1220,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
* then the dentry is unhashed (d_delete()).
*/
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
- u64 child_nodeid, struct qstr *name);
+ u64 child_nodeid, struct qstr *name, u32 flags);
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
@@ -1269,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
-int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
/* readdir.c */
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 61d8afcb10a3..fcce94ace2c2 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.in_pages = true;
err = -EFAULT;
- iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+ iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size);
for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
@@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
goto out;
err = -EFAULT;
- iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+ iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred);
for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index e8deaacf1832..dc603479b30e 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -547,9 +547,9 @@ retry_locked:
* Contents of the page are now protected against changing by holding
* the page lock.
*/
- addr = kmap(page);
+ addr = kmap_local_page(page);
res = fuse_parse_cache(ff, addr, size, ctx);
- kunmap(page);
+ kunmap_local(addr);
unlock_page(page);
put_page(page);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 734d1f05d823..3dcde4912413 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -109,9 +109,10 @@ out:
return error;
}
-int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
bool need_unlock = false;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index cd180ca7c959..b8de8c148f5c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,7 +13,7 @@
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 04a201584fa7..1371e067d2a7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1997,7 +1997,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns,
else {
error = gfs2_setattr_simple(inode, attr);
if (!error && attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode,
+ error = posix_acl_chmod(&init_user_ns, dentry,
inode->i_mode);
}
@@ -2149,7 +2149,7 @@ static const struct inode_operations gfs2_file_iops = {
.getattr = gfs2_getattr,
.listxattr = gfs2_listxattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
+ .get_inode_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.fileattr_get = gfs2_fileattr_get,
@@ -2171,7 +2171,7 @@ static const struct inode_operations gfs2_dir_iops = {
.getattr = gfs2_getattr,
.listxattr = gfs2_listxattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
+ .get_inode_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.atomic_open = gfs2_atomic_open,
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c4526f16355d..a0746be3c1de 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
/* panic? */
return -EIO;
+ if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
+ return -EIO;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
/* panic? */
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index 39f5e343bf4d..fdb0edb8a607 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr
if (nls_io) {
wchar_t ch;
- while (srclen > 0) {
+ while (srclen > 0 && dstlen > 0) {
size = nls_io->char2uni(src, srclen, &ch);
if (size < 0) {
ch = '?';
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a5db2e3b2980..6aa919e59483 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -198,6 +198,8 @@ struct hfsplus_sb_info {
#define HFSPLUS_SB_HFSX 3
#define HFSPLUS_SB_CASEFOLD 4
#define HFSPLUS_SB_NOBARRIER 5
+#define HFSPLUS_SB_UID 6
+#define HFSPLUS_SB_GID 7
static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index aeab83ed1c9c..b675581aa9d0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode,
mode = be16_to_cpu(perms->mode);
i_uid_write(inode, be32_to_cpu(perms->owner));
- if (!i_uid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode))
inode->i_uid = sbi->uid;
i_gid_write(inode, be32_to_cpu(perms->group));
- if (!i_gid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode))
inode->i_gid = sbi->gid;
if (dir) {
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 047e05c57560..c94a58762ad6 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!uid_valid(sbi->uid)) {
pr_err("invalid uid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
}
break;
case opt_gid:
@@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!gid_valid(sbi->gid)) {
pr_err("invalid gid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
}
break;
case opt_part:
diff --git a/fs/inode.c b/fs/inode.c
index b608528efd3a..f453eb58fd03 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1949,40 +1949,12 @@ skip_update:
EXPORT_SYMBOL(touch_atime);
/*
- * The logic we want is
- *
- * if suid or (sgid and xgrp)
- * remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
- umode_t mode = d_inode(dentry)->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-/*
* Return mask of changes for notify_change() that need to be done as a
* response to write or truncate. Return 0 if nothing has to be changed.
* Negative value on error (change should be denied).
*/
-int dentry_needs_remove_privs(struct dentry *dentry)
+int dentry_needs_remove_privs(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int mask = 0;
@@ -1991,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
if (IS_NOSEC(inode))
return 0;
- mask = should_remove_suid(dentry);
+ mask = setattr_should_drop_suidgid(mnt_userns, inode);
ret = security_inode_need_killpriv(dentry);
if (ret < 0)
return ret;
@@ -2023,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
- kill = dentry_needs_remove_privs(dentry);
+ kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry);
if (kill < 0)
return kill;
@@ -2071,9 +2043,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
sync_it |= S_VERSION;
- if (!sync_it)
- return 0;
-
return sync_it;
}
@@ -2354,15 +2323,15 @@ EXPORT_SYMBOL(inode_init_owner);
bool inode_owner_or_capable(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- kuid_t i_uid;
+ vfsuid_t vfsuid;
struct user_namespace *ns;
- i_uid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid))
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
return true;
ns = current_user_ns();
- if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
+ if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
}
@@ -2488,6 +2457,28 @@ struct timespec64 current_time(struct inode *inode)
EXPORT_SYMBOL(current_time);
/**
+ * in_group_or_capable - check whether caller is CAP_FSETID privileged
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ * @vfsgid: the new/current vfsgid of @inode
+ *
+ * Check wether @vfsgid is in the caller's group list or if the caller is
+ * privileged with CAP_FSETID over @inode. This can be used to determine
+ * whether the setgid bit can be kept or must be dropped.
+ *
+ * Return: true if the caller is sufficiently privileged, false if not.
+ */
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid)
+{
+ if (vfsgid_in_group_p(vfsgid))
+ return true;
+ if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ return true;
+ return false;
+}
+
+/**
* mode_strip_sgid - handle the sgid bit for non-directories
* @mnt_userns: User namespace of the mount the inode was created from
* @dir: parent directory inode
@@ -2508,11 +2499,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
return mode;
if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
return mode;
- if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
- return mode;
- if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ if (in_group_or_capable(mnt_userns, dir,
+ i_gid_into_vfsgid(mnt_userns, dir)))
return mode;
-
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/fs/internal.h b/fs/internal.h
index 6f0386b34fae..a803cc3cf716 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -150,7 +150,9 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern int dentry_needs_remove_privs(struct dentry *dentry);
+int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry);
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid);
/*
* fs-writeback.c
@@ -225,12 +227,39 @@ struct xattr_ctx {
};
-ssize_t do_getxattr(struct user_namespace *mnt_userns,
+ssize_t do_getxattr(struct mnt_idmap *idmap,
struct dentry *d,
struct xattr_ctx *ctx);
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
-int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx);
+int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, const void *kvalue, size_t size);
+ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, void *kvalue, size_t size);
+#else
+static inline int do_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, const char *acl_name,
+ const void *kvalue, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, const char *acl_name,
+ void *kvalue, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+#endif
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
+
+/*
+ * fs/attr.c
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 885a7a6cc53e..4810438b7856 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -207,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
}
/* Send all the data buffers related to an inode */
-int jbd2_submit_inode_data(struct jbd2_inode *jinode)
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
-
if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
return 0;
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- return jbd2_journal_submit_inode_data_buffers(jinode);
+ return journal->j_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index e945e3484788..8bb58ce5c06c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
return rc;
}
-int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int rc, xprefix;
+ struct inode *inode = d_inode(dentry);
switch (type) {
case ACL_TYPE_ACCESS:
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 9d9fb7cf093e..ca36a6eca594 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu);
-int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index c0aabbcbfd58..f399b390b5f6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.rmdir = jffs2_rmdir,
.mknod = jffs2_mknod,
.rename = jffs2_rename,
- .get_acl = jffs2_get_acl,
+ .get_inode_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index ba86acbe12d3..3cf71befa475 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -64,7 +64,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
- .get_acl = jffs2_get_acl,
+ .get_inode_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 39cec28096a7..66af51c41619 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
rc = jffs2_do_setattr(inode, iattr);
if (!rc && (iattr->ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
return rc;
}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a653f34c6e26..3b667eccc73b 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -94,12 +94,13 @@ out:
return rc;
}
-int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int rc;
tid_t tid;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
tid = txBegin(inode->i_sb, 0);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 332dc9ac47a9..88663465aecd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE)
- rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
return rc;
}
@@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = {
.fileattr_get = jfs_fileattr_get,
.fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
- .get_acl = jfs_get_acl,
+ .get_inode_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 3de40286d31f..f0704a25835f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -8,7 +8,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu);
-int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 6b838d3ae7c2..765838578a72 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap)
struct bmap *bmp;
struct dbmap_disk *dbmp_le;
struct metapage *mp;
- int i;
+ int i, err;
/*
* allocate/initialize the in-memory bmap descriptor
@@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap)
BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
PSIZE, 0);
if (mp == NULL) {
- kfree(bmp);
- return -EIO;
+ err = -EIO;
+ goto err_kfree_bmp;
}
/* copy the on-disk bmap descriptor to its in-memory version. */
@@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap)
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
- release_metapage(mp);
- kfree(bmp);
- return -EINVAL;
+ err = -EINVAL;
+ goto err_release_metapage;
}
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
@@ -194,6 +193,16 @@ int dbMount(struct inode *ipbmap)
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+ if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
+ if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
for (i = 0; i < MAXAG; i++)
bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
@@ -214,6 +223,12 @@ int dbMount(struct inode *ipbmap)
BMAP_LOCK_INIT(bmp);
return (0);
+
+err_release_metapage:
+ release_metapage(mp);
+err_kfree_bmp:
+ kfree(bmp);
+ return err;
}
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
index 1c984214e95e..a0ee4ccea66e 100644
--- a/fs/jfs/jfs_extent.h
+++ b/fs/jfs/jfs_extent.h
@@ -10,9 +10,7 @@
(addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
extern int extAlloc(struct inode *, s64, s64, xad_t *, bool);
-extern int extFill(struct inode *, xad_t *);
extern int extHint(struct inode *, s64, xad_t *);
-extern int extRealloc(struct inode *, s64, xad_t *, bool);
extern int extRecord(struct inode *, xad_t *);
#endif /* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 799d3837e7c2..390cbfce391f 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -310,8 +310,8 @@ int diRead(struct inode *ip)
iagno = INOTOIAG(ip->i_ino);
/* read the iag */
- imap = JFS_IP(ipimap)->i_imap;
IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+ imap = JFS_IP(ipimap)->i_imap;
rc = diIAGRead(imap, iagno, &mp);
IREAD_UNLOCK(ipimap);
if (rc) {
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 48d1f70f786c..b83aae56a1f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount)
truncate_inode_pages(sbi->ipimap->i_mapping, 0);
truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
+
+ IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP);
diUnmount(sbi->ipimap, 1);
if ((rc = diMount(sbi->ipimap))) {
+ IWRITE_UNLOCK(sbi->ipimap);
jfs_err("jfs_mount_rw: diMount failed!");
return rc;
}
+ IWRITE_UNLOCK(sbi->ipimap);
dbUnmount(sbi->ipbmap, 1);
if ((rc = dbMount(sbi->ipbmap))) {
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 3e8b13e6aa01..8ec43f53f686 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb)
/*
* close secondary aggregate inode allocation map
*/
- ipaimap2 = sbi->ipaimap2;
if (ipaimap2) {
diUnmount(ipaimap2, 0);
diFreeSpecial(ipaimap2);
@@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb)
/*
* close aggregate inode allocation map
*/
- ipaimap = sbi->ipaimap;
diUnmount(ipaimap, 0);
diFreeSpecial(ipaimap);
sbi->ipaimap = NULL;
@@ -89,7 +87,7 @@ int jfs_umount(struct super_block *sb)
dbUnmount(ipbmap, 0);
diFreeSpecial(ipbmap);
- sbi->ipimap = NULL;
+ sbi->ipbmap = NULL;
/*
* Make sure all metadata makes it to disk before we mark
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index c50167a7bc50..0d33816d251d 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -25,7 +25,7 @@ struct jfs_ea_list {
struct jfs_ea ea[]; /* Variable length list */
};
-/* Macros for defining maxiumum number of bytes supported for EAs */
+/* Macros for defining maximum number of bytes supported for EAs */
#define MAXEASIZE 65535
#define MAXEALISTSIZE MAXEASIZE
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 142caafc73b1..ad7592191d76 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip,
extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
-extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
- int flag);
extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type);
extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size);
-extern int xtRelocate(tid_t tid, struct inode *ip,
- xad_t * oxad, s64 nxaddr, int xtype);
extern int xtAppend(tid_t tid,
struct inode *ip, int xflag, s64 xoff, int maxblocks,
int *xlenp, s64 * xaddrp, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 9db4f5789c0e..a38d14eed047 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip,
if (ssize <= IDATASIZE) {
ip->i_op = &jfs_fast_symlink_inode_operations;
- ip->i_link = JFS_IP(ip)->i_inline;
+ ip->i_link = JFS_IP(ip)->i_inline_all;
memcpy(ip->i_link, name, ssize);
ip->i_size = ssize - 1;
@@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.fileattr_get = jfs_fileattr_get,
.fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
- .get_acl = jfs_get_acl,
+ .get_inode_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 85d4f44f2ac4..d2f82cb7db1b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
tmp_bh.b_size = i_blocksize(inode);
@@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
inode_lock(inode);
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
tmp_bh.b_size = i_blocksize(inode);
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index b2fc85d440d0..9306e10753f9 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -2487,9 +2487,9 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
fattr->cf_dacls = NULL;
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
- fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS);
+ fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (S_ISDIR(inode->i_mode))
- fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT);
+ fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT);
}
}
@@ -2956,7 +2956,7 @@ int smb2_open(struct ksmbd_work *work)
struct inode *inode = d_inode(path.dentry);
posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns,
- inode,
+ path.dentry,
d_inode(path.dentry->d_parent));
if (posix_acl_rc)
ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc);
@@ -2972,7 +2972,7 @@ int smb2_open(struct ksmbd_work *work)
if (rc) {
if (posix_acl_rc)
ksmbd_vfs_set_init_posix_acl(user_ns,
- inode);
+ path.dentry);
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR)) {
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index b05ff9b146b5..ab5c68cc0e13 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -1289,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
}
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
- posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
+ posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
if (posix_acls && !found) {
unsigned int id = -1;
@@ -1386,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry);
/* Update posix acls */
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) {
- rc = set_posix_acl(user_ns, inode,
+ rc = set_posix_acl(user_ns, path->dentry,
ACL_TYPE_ACCESS, fattr.cf_acls);
if (rc < 0)
ksmbd_debug(SMB,
"Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) {
- rc = set_posix_acl(user_ns, inode,
+ rc = set_posix_acl(user_ns, path->dentry,
ACL_TYPE_DEFAULT, fattr.cf_dacls);
if (rc)
ksmbd_debug(SMB,
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 94b8ed4ef870..ff0e7a4fcd4d 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end,
unsigned char type)
{
struct file_lock *flock;
- struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(file_inode(filp));
int error = 0;
if (!ctx || list_empty_careful(&ctx->flc_posix))
@@ -1321,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns,
sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) ||
!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) {
- err = ksmbd_vfs_remove_xattr(user_ns, dentry, name);
+ err = vfs_remove_acl(user_ns, dentry, name);
if (err)
ksmbd_debug(SMB,
"remove acl xattr failed : %s\n", name);
@@ -1375,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
return NULL;
- posix_acls = get_acl(inode, acl_type);
+ posix_acls = get_inode_acl(inode, acl_type);
if (!posix_acls)
return NULL;
@@ -1824,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock)
}
int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
- struct inode *inode)
+ struct dentry *dentry)
{
struct posix_acl_state acl_state;
struct posix_acl *acls;
+ struct inode *inode = d_inode(dentry);
int rc;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
@@ -1856,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
return -ENOMEM;
}
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
else if (S_ISDIR(inode->i_mode)) {
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
- acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
@@ -1874,16 +1874,17 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
}
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
- struct inode *inode, struct inode *parent_inode)
+ struct dentry *dentry, struct inode *parent_inode)
{
struct posix_acl *acls;
struct posix_acl_entry *pace;
+ struct inode *inode = d_inode(dentry);
int rc, i;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
return -EOPNOTSUPP;
- acls = get_acl(parent_inode, ACL_TYPE_DEFAULT);
+ acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT);
if (!acls)
return -ENOENT;
pace = acls->a_entries;
@@ -1895,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
}
}
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
if (S_ISDIR(inode->i_mode)) {
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT,
acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 593059ca8511..0d73d735cc39 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -160,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns,
struct dentry *dentry,
struct xattr_dos_attrib *da);
int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
- struct inode *inode);
+ struct dentry *dentry);
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
- struct inode *inode,
+ struct dentry *dentry,
struct inode *parent_inode);
#endif /* __KSMBD_VFS_H__ */
diff --git a/fs/libfs.c b/fs/libfs.c
index 682d56345a1c..aada4e7c8713 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -995,8 +995,8 @@ out:
EXPORT_SYMBOL_GPL(simple_attr_read);
/* interpret the buffer as a number to call the set function with */
-ssize_t simple_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct simple_attr *attr;
unsigned long long val;
@@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- ret = kstrtoull(attr->set_buf, 0, &val);
+ if (is_signed)
+ ret = kstrtoll(attr->set_buf, 0, &val);
+ else
+ ret = kstrtoull(attr->set_buf, 0, &val);
if (ret)
goto out;
ret = attr->set(attr->data, val);
@@ -1027,8 +1030,21 @@ out:
mutex_unlock(&attr->mutex);
return ret;
}
+
+ssize_t simple_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(simple_attr_write);
+ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(simple_attr_write_signed);
+
/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index e1c4617de771..720684345817 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -207,7 +207,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct nlm_host *lockhost;
if (!flctx || list_empty_careful(&flctx->flc_posix))
@@ -262,7 +262,7 @@ nlm_file_inuse(struct nlm_file *file)
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
return 1;
diff --git a/fs/locks.c b/fs/locks.c
index 607f94a0e789..8f01bee17715 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type)
struct file_lock_context *ctx;
/* paired with cmpxchg() below */
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (likely(ctx) || type == F_UNLCK)
goto out;
@@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type)
*/
if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
kmem_cache_free(flctx_cache, ctx);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
}
out:
trace_locks_get_lock_context(inode, type, ctx);
@@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list,
void
locks_free_lock_context(struct inode *inode)
{
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(inode);
if (unlikely(ctx)) {
locks_check_ctx_lists(inode);
@@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
void *owner;
void (*func)(void);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
@@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx) {
WARN_ON_ONCE(1);
goto free_lock;
@@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
struct file_lock_context *ctx;
struct file_lock *fl;
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
fl = list_first_entry_or_null(&ctx->flc_lease,
@@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp)
int type = F_UNLCK;
LIST_HEAD(dispose);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
@@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
struct file_lock_context *ctx;
LIST_HEAD(dispose);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
@@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
* throw a warning to let people know that they don't actually work.
*/
if (cmd & LOCK_MAND) {
- pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid);
return 0;
}
@@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
@@ -2295,6 +2296,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
@@ -2561,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2634,7 +2636,7 @@ void locks_remove_file(struct file *filp)
{
struct file_lock_context *ctx;
- ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
+ ctx = locks_inode_context(locks_inode(filp));
if (!ctx)
return;
@@ -2663,12 +2665,36 @@ void locks_remove_file(struct file *filp)
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);
+/**
+ * vfs_inode_has_locks - are any file locks held on @inode?
+ * @inode: inode to check for locks
+ *
+ * Return true if there are any FL_POSIX or FL_FLOCK locks currently
+ * set on @inode.
+ */
+bool vfs_inode_has_locks(struct inode *inode)
+{
+ struct file_lock_context *ctx;
+ bool ret;
+
+ ctx = locks_inode_context(inode);
+ if (!ctx)
+ return false;
+
+ spin_lock(&ctx->flc_lock);
+ ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock);
+ spin_unlock(&ctx->flc_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vfs_inode_has_locks);
+
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -2839,7 +2865,7 @@ void show_fd_locks(struct seq_file *f,
struct file_lock_context *ctx;
int id = 0;
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx)
return;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e272ad738faf..2a4b8b549e93 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -100,8 +100,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
atomic_set(&entry->e_refcnt, 2);
entry->e_key = key;
entry->e_value = value;
- entry->e_reusable = reusable;
- entry->e_referenced = 0;
+ entry->e_flags = 0;
+ if (reusable)
+ set_bit(MBE_REUSABLE_B, &entry->e_flags);
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
@@ -165,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list);
- if (entry->e_key == key && entry->e_reusable &&
+ if (entry->e_key == key &&
+ test_bit(MBE_REUSABLE_B, &entry->e_flags) &&
atomic_inc_not_zero(&entry->e_refcnt))
goto out;
node = node->next;
@@ -284,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
void mb_cache_entry_touch(struct mb_cache *cache,
struct mb_cache_entry *entry)
{
- entry->e_referenced = 1;
+ set_bit(MBE_REFERENCED_B, &entry->e_flags);
}
EXPORT_SYMBOL(mb_cache_entry_touch);
@@ -309,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
entry = list_first_entry(&cache->c_list,
struct mb_cache_entry, e_list);
/* Drop initial hash reference if there is no user */
- if (entry->e_referenced ||
+ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) ||
atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
- entry->e_referenced = 0;
+ clear_bit(MBE_REFERENCED_B, &entry->e_flags);
list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
diff --git a/fs/namei.c b/fs/namei.c
index 9155ecb547ce..720270dc9fe5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns,
acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
if (!acl)
return -EAGAIN;
- /* no ->get_acl() calls in RCU mode... */
+ /* no ->get_inode_acl() calls in RCU mode... */
if (is_uncached_acl(acl))
return -ECHILD;
return posix_acl_permission(mnt_userns, inode, acl, mask);
}
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
@@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
- kuid_t i_uid;
+ vfsuid_t vfsuid;
/* Are we the owner? If so, ACL's don't matter */
- i_uid = i_uid_into_mnt(mnt_userns, inode);
- if (likely(uid_eq(current_fsuid(), i_uid))) {
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
mask &= 7;
mode >>= 6;
return (mask & ~mode) ? -EACCES : 0;
@@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns,
* about? Need to check group ownership if so.
*/
if (mask & (mode ^ (mode >> 3))) {
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (in_group_p(kgid))
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (vfsgid_in_group_p(vfsgid))
mode >>= 3;
}
@@ -581,7 +581,7 @@ struct nameidata {
struct nameidata *saved;
unsigned root_seq;
int dfd;
- kuid_t dir_uid;
+ vfsuid_t dir_vfsuid;
umode_t dir_mode;
} __randomize_layout;
@@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls);
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
struct user_namespace *mnt_userns;
- kuid_t i_uid;
+ vfsuid_t vfsuid;
if (!sysctl_protected_symlinks)
return 0;
mnt_userns = mnt_user_ns(nd->path.mnt);
- i_uid = i_uid_into_mnt(mnt_userns, inode);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
/* Allowed if owner and follower match. */
- if (uid_eq(current_cred()->fsuid, i_uid))
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
return 0;
/* Allowed if parent directory not sticky and world-writable. */
@@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
+ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
return 0;
if (nd->flags & LOOKUP_RCU)
@@ -1183,8 +1183,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link)
struct inode *inode = link->dentry->d_inode;
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
if (!sysctl_protected_hardlinks)
@@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns,
struct nameidata *nd, struct inode *const inode)
{
umode_t dir_mode = nd->dir_mode;
- kuid_t dir_uid = nd->dir_uid;
+ vfsuid_t dir_vfsuid = nd->dir_vfsuid;
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
likely(!(dir_mode & S_ISVTX)) ||
- uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
- uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
+ vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) ||
+ vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid()))
return 0;
if (likely(dir_mode & 0002) ||
@@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
OK:
/* pathname or trailing symlink, done */
if (!depth) {
- nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
+ nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
@@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
{
kuid_t fsuid = current_fsuid();
- if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid))
return 0;
- if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid))
return 0;
return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
}
@@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
BUG_ON(victim->d_parent->d_inode != dir);
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
diff --git a/fs/namespace.c b/fs/namespace.c
index df137ba19d37..ab467ee58341 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+struct mnt_idmap {
+ struct user_namespace *owner;
+ refcount_t count;
+};
+
+/*
+ * Carries the initial idmapping of 0:0:4294967295 which is an identity
+ * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
+ * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
+ */
+struct mnt_idmap nop_mnt_idmap = {
+ .owner = &init_user_ns,
+ .count = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL_GPL(nop_mnt_idmap);
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
@@ -82,6 +98,7 @@ struct mount_kattr {
unsigned int lookup_flags;
bool recurse;
struct user_namespace *mnt_userns;
+ struct mnt_idmap *mnt_idmap;
};
/* /sys/fs */
@@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt)
#endif
}
+/**
+ * mnt_idmap_owner - retrieve owner of the mount's idmapping
+ * @idmap: mount idmapping
+ *
+ * This helper will go away once the conversion to use struct mnt_idmap
+ * everywhere has finished at which point the helper will be unexported.
+ *
+ * Only code that needs to perform permission checks based on the owner of the
+ * idmapping will get access to it. All other code will solely rely on
+ * idmappings. This will get us type safety so it's impossible to conflate
+ * filesystems idmappings with mount idmappings.
+ *
+ * Return: The owner of the idmapping.
+ */
+struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap)
+{
+ return idmap->owner;
+}
+EXPORT_SYMBOL_GPL(mnt_idmap_owner);
+
+/**
+ * mnt_user_ns - retrieve owner of an idmapped mount
+ * @mnt: the relevant vfsmount
+ *
+ * This helper will go away once the conversion to use struct mnt_idmap
+ * everywhere has finished at which point the helper will be unexported.
+ *
+ * Only code that needs to perform permission checks based on the owner of the
+ * idmapping will get access to it. All other code will solely rely on
+ * idmappings. This will get us type safety so it's impossible to conflate
+ * filesystems idmappings with mount idmappings.
+ *
+ * Return: The owner of the idmapped.
+ */
+struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
+{
+ struct mnt_idmap *idmap = mnt_idmap(mnt);
+
+ /* Return the actual owner of the filesystem instead of the nop. */
+ if (idmap == &nop_mnt_idmap &&
+ !initial_idmapping(mnt->mnt_sb->s_user_ns))
+ return mnt->mnt_sb->s_user_ns;
+ return mnt_idmap_owner(idmap);
+}
+EXPORT_SYMBOL_GPL(mnt_user_ns);
+
+/**
+ * alloc_mnt_idmap - allocate a new idmapping for the mount
+ * @mnt_userns: owning userns of the idmapping
+ *
+ * Allocate a new struct mnt_idmap which carries the idmapping of the mount.
+ *
+ * Return: On success a new idmap, on error an error pointer is returned.
+ */
+static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
+{
+ struct mnt_idmap *idmap;
+
+ idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
+ if (!idmap)
+ return ERR_PTR(-ENOMEM);
+
+ idmap->owner = get_user_ns(mnt_userns);
+ refcount_set(&idmap->count, 1);
+ return idmap;
+}
+
+/**
+ * mnt_idmap_get - get a reference to an idmapping
+ * @idmap: the idmap to bump the reference on
+ *
+ * If @idmap is not the @nop_mnt_idmap bump the reference count.
+ *
+ * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed.
+ */
+static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
+{
+ if (idmap != &nop_mnt_idmap)
+ refcount_inc(&idmap->count);
+
+ return idmap;
+}
+
+/**
+ * mnt_idmap_put - put a reference to an idmapping
+ * @idmap: the idmap to put the reference on
+ *
+ * If this is a non-initial idmapping, put the reference count when a mount is
+ * released and free it if we're the last user.
+ */
+static inline void mnt_idmap_put(struct mnt_idmap *idmap)
+{
+ if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
+ put_user_ns(idmap->owner);
+ kfree(idmap);
+ }
+}
+
static struct mount *alloc_vfsmnt(const char *name)
{
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
- mnt->mnt.mnt_userns = &init_user_ns;
+ mnt->mnt.mnt_idmap = &nop_mnt_idmap;
}
return mnt;
@@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
- struct user_namespace *mnt_userns;
-
- mnt_userns = mnt_user_ns(&mnt->mnt);
- if (!initial_idmapping(mnt_userns))
- put_user_ns(mnt_userns);
+ mnt_idmap_put(mnt_idmap(&mnt->mnt));
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p)
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
- struct user_namespace *fs_userns;
if (!fc->root)
return ERR_PTR(-EINVAL);
@@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- fs_userns = mnt->mnt.mnt_sb->s_user_ns;
- if (!initial_idmapping(fs_userns))
- mnt->mnt.mnt_userns = get_user_ns(fs_userns);
-
lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
@@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active);
- mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
- if (!initial_idmapping(mnt->mnt.mnt_userns))
- mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
+ mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
+
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
@@ -3515,8 +3620,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
q = next_mnt(q, new);
if (!q)
break;
+ // an mntns binding we'd skipped?
while (p->mnt.mnt_root != q->mnt.mnt_root)
- p = next_mnt(p, old);
+ p = next_mnt(skip_mnt_tree(p), old);
}
namespace_unlock();
@@ -3981,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
struct vfsmount *m = &mnt->mnt;
struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
- if (!kattr->mnt_userns)
+ if (!kattr->mnt_idmap)
return 0;
/*
* Creating an idmapped mount with the filesystem wide idmapping
* doesn't make sense so block that. We don't allow mushy semantics.
*/
- if (kattr->mnt_userns == fs_userns)
+ if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns)
return -EINVAL;
/*
@@ -4028,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
{
return (!(kattr->attr_set & MNT_READONLY) ||
(mnt->mnt.mnt_flags & MNT_READONLY)) &&
- !kattr->mnt_userns;
+ !kattr->mnt_idmap;
}
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
@@ -4082,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
- struct user_namespace *mnt_userns, *old_mnt_userns;
-
- if (!kattr->mnt_userns)
+ if (!kattr->mnt_idmap)
return;
/*
- * We're the only ones able to change the mount's idmapping. So
- * mnt->mnt.mnt_userns is stable and we can retrieve it directly.
- */
- old_mnt_userns = mnt->mnt.mnt_userns;
-
- mnt_userns = get_user_ns(kattr->mnt_userns);
- /* Pairs with smp_load_acquire() in mnt_user_ns(). */
- smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
-
- /*
- * If this is an idmapped filesystem drop the reference we've taken
- * in vfs_create_mount() before.
+ * Pairs with smp_load_acquire() in mnt_idmap().
+ *
+ * Since we only allow a mount to change the idmapping once and
+ * verified this in can_idmap_mount() we know that the mount has
+ * @nop_mnt_idmap attached to it. So there's no need to drop any
+ * references.
*/
- if (!initial_idmapping(old_mnt_userns))
- put_user_ns(old_mnt_userns);
+ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4136,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
if (path->dentry != mnt->mnt.mnt_root)
return -EINVAL;
+ if (kattr->mnt_userns) {
+ struct mnt_idmap *mnt_idmap;
+
+ mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
+ if (IS_ERR(mnt_idmap))
+ return PTR_ERR(mnt_idmap);
+ kattr->mnt_idmap = mnt_idmap;
+ }
+
if (kattr->propagation) {
/*
* Only take namespace_lock() if we're actually changing
@@ -4323,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr)
{
put_user_ns(kattr->mnt_userns);
kattr->mnt_userns = NULL;
+
+ if (kattr->mnt_idmap)
+ mnt_idmap_put(kattr->mnt_idmap);
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index e374767d1b68..7f753380e047 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
struct iov_iter iter;
- iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
iov_iter_zero(iov_iter_count(&iter), &iter);
@@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
@@ -208,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
continue;
}
- iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
subreq->start, subreq->len);
atomic_inc(&rreq->nr_copy_ops);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ead8a0e06abf..cf7365581031 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state
{
struct inode *inode = state->inode;
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct list_head *list;
int status = 0;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index e861d7bae305..e731c00a9fcb 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_read_operation(&cres, cookie);
if (ret < 0)
@@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 03a4e679fd99..df9ca56db347 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -12,7 +12,7 @@
*/
#ifdef CONFIG_NFS_V3_ACL
extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu);
-extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
struct posix_acl *dfacl);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 93de0b58647a..74d11e3c4205 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -255,23 +255,24 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
}
-int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
+ struct inode *inode = d_inode(dentry);
int status;
if (S_ISDIR(inode->i_mode)) {
switch(type) {
case ACL_TYPE_ACCESS:
- alloc = get_acl(inode, ACL_TYPE_DEFAULT);
+ alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(alloc))
goto fail;
dfacl = alloc;
break;
case ACL_TYPE_DEFAULT:
- alloc = get_acl(inode, ACL_TYPE_ACCESS);
+ alloc = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(alloc))
goto fail;
dfacl = acl;
@@ -312,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
struct posix_acl *acl;
char *p = data + *result;
- acl = get_acl(inode, type);
+ acl = get_inode_acl(inode, type);
if (IS_ERR_OR_NULL(acl))
return 0;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2e7579626cf0..4bf208a0a8e9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -998,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .get_acl = nfs3_get_acl,
+ .get_inode_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
};
@@ -1009,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .get_acl = nfs3_get_acl,
+ .get_inode_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a2d2d5d1b088..dd18344648f3 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1501,7 +1501,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
struct file_lock *fl;
struct nfs4_lock_state *lsp;
int status = 0;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct list_head *list;
if (flctx == NULL)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 317cedfa52bf..16be6dae524f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev,
if (prev) {
if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
return 0;
- flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
+ flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry));
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f41d24b54fd1..80c240e50952 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_lock_context *l_ctx;
- struct file_lock_context *flctx = file_inode(file)->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(file_inode(file));
struct nfs_page *req;
int do_flush, status;
/*
@@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page,
struct inode *inode, unsigned int pagelen)
{
int ret;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct file_lock *fl;
if (file->f_flags & O_DSYNC)
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 13e6e6897f6c..c43c25a8da2e 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
goto out;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
resp->status = nfserrno(PTR_ERR(acl));
goto fail;
@@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
inode_lock(inode);
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS,
argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT,
argp->acl_default);
if (error)
goto out_drop_lock;
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 2fb9ee356455..9daa621817d8 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
resp->mask = argp->mask;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
resp->status = nfserrno(PTR_ERR(acl));
goto fail;
@@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
inode_lock(inode);
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS,
argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT,
argp->acl_default);
out_drop_lock:
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index bb8e2f6d7d03..518203821790 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
unsigned int flags = 0;
int size = 0;
- pacl = get_acl(inode, ACL_TYPE_ACCESS);
+ pacl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (!pacl)
pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
if (S_ISDIR(inode->i_mode)) {
flags = NFS4_ACL_DIR;
- dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(dpacl)) {
error = PTR_ERR(dpacl);
goto rel_pacl;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 836bd825ca4a..da8d0ea66229 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4758,7 +4758,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
static bool nfsd4_deleg_present(const struct inode *inode)
{
- struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
+ struct file_lock_context *ctx = locks_inode_context(inode);
return ctx && !list_empty_careful(&ctx->flc_lease);
}
@@ -5897,7 +5897,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
nf = stp->st_stid.sc_file;
- ctx = nf->fi_inode->i_flctx;
+ ctx = locks_inode_context(nf->fi_inode);
if (!ctx)
continue;
if (locks_owner_has_blockers(ctx, lo))
@@ -7713,7 +7713,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
inode = locks_inode(nf->nf_file);
- flctx = inode->i_flctx;
+ flctx = locks_inode_context(inode);
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 849a720ab43f..08a929607641 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -480,12 +480,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
attr->na_seclabel->data, attr->na_seclabel->len);
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
attr->na_aclerr = set_posix_acl(&init_user_ns,
- inode, ACL_TYPE_ACCESS,
+ dentry, ACL_TYPE_ACCESS,
attr->na_pacl);
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
!attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
attr->na_aclerr = set_posix_acl(&init_user_ns,
- inode, ACL_TYPE_DEFAULT,
+ dentry, ACL_TYPE_DEFAULT,
attr->na_dpacl);
inode_unlock(inode);
if (size_change)
@@ -943,7 +943,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, READ, vec, vlen, *count);
+ iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
host_err = vfs_iter_read(file, &iter, &ppos, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1033,7 +1033,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && !use_wgather)
flags |= RWF_SYNC;
- iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+ iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c8b89b4f94e0..2064e6473d30 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/random.h>
+#include <linux/log2.h>
#include <linux/crc32.h>
#include "nilfs.h"
#include "segment.h"
@@ -193,6 +194,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
}
/**
+ * nilfs_get_blocksize - get block size from raw superblock data
+ * @sb: super block instance
+ * @sbp: superblock raw data buffer
+ * @blocksize: place to store block size
+ *
+ * nilfs_get_blocksize() calculates the block size from the block size
+ * exponent information written in @sbp and stores it in @blocksize,
+ * or aborts with an error message if it's too large.
+ *
+ * Return Value: On success, 0 is returned. If the block size is too
+ * large, -EINVAL is returned.
+ */
+static int nilfs_get_blocksize(struct super_block *sb,
+ struct nilfs_super_block *sbp, int *blocksize)
+{
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+
+ if (unlikely(shift_bits >
+ ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) {
+ nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB",
+ shift_bits);
+ return -EINVAL;
+ }
+ *blocksize = BLOCK_SIZE << shift_bits;
+ return 0;
+}
+
+/**
* load_nilfs - load and recover the nilfs
* @nilfs: the_nilfs structure to be released
* @sb: super block instance used to recover past segment
@@ -245,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
/* verify consistency between two super blocks */
- blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+ err = nilfs_get_blocksize(sb, sbp[0], &blocksize);
+ if (err)
+ goto scan_error;
+
if (blocksize != nilfs->ns_blocksize) {
nilfs_warn(sb,
"blocksize differs between two super blocks (%d != %d)",
blocksize, nilfs->ns_blocksize);
+ err = -EINVAL;
goto scan_error;
}
@@ -443,11 +476,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp)
return crc == le32_to_cpu(sbp->s_sum);
}
-static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
+/**
+ * nilfs_sb2_bad_offset - check the location of the second superblock
+ * @sbp: superblock raw data buffer
+ * @offset: byte offset of second superblock calculated from device size
+ *
+ * nilfs_sb2_bad_offset() checks if the position on the second
+ * superblock is valid or not based on the filesystem parameters
+ * stored in @sbp. If @offset points to a location within the segment
+ * area, or if the parameters themselves are not normal, it is
+ * determined to be invalid.
+ *
+ * Return Value: true if invalid, false if valid.
+ */
+static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
{
- return offset < ((le64_to_cpu(sbp->s_nsegments) *
- le32_to_cpu(sbp->s_blocks_per_segment)) <<
- (le32_to_cpu(sbp->s_log_block_size) + 10));
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+ u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
+ u64 nsegments = le64_to_cpu(sbp->s_nsegments);
+ u64 index;
+
+ if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS ||
+ shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)
+ return true;
+
+ index = offset >> (shift_bits + BLOCK_SIZE_BITS);
+ do_div(index, blocks_per_segment);
+ return index < nsegments;
}
static void nilfs_release_super_block(struct the_nilfs *nilfs)
@@ -586,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
- blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
- if (blocksize < NILFS_MIN_BLOCK_SIZE ||
- blocksize > NILFS_MAX_BLOCK_SIZE) {
+ err = nilfs_get_blocksize(sb, sbp, &blocksize);
+ if (err)
+ goto failed_sbh;
+
+ if (blocksize < NILFS_MIN_BLOCK_SIZE) {
nilfs_err(sb,
"couldn't mount because of unsupported filesystem blocksize %d",
blocksize);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 4f2ffc7ef296..c5e4a886593d 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -802,7 +802,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(mnt_userns, inode, attr);
if (mode != inode->i_mode) {
- err = ntfs_acl_chmod(mnt_userns, inode);
+ err = ntfs_acl_chmod(mnt_userns, dentry);
if (err)
goto out;
@@ -1255,7 +1255,7 @@ const struct inode_operations ntfs_file_inode_operations = {
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.fiemap = ntfs_fiemap,
};
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index bc22cc321a74..053cc0e0f8b5 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -367,7 +367,7 @@ const struct inode_operations ntfs_dir_inode_operations = {
.mknod = ntfs_mknod,
.rename = ntfs_rename,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
@@ -379,7 +379,7 @@ const struct inode_operations ntfs_special_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
};
// clang-format on
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 2c791222c4e2..a4d292809a33 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -843,7 +843,7 @@ int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
/* globals from xattr.c */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
-int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct inode *dir);
@@ -852,7 +852,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
#define ntfs_set_acl NULL
#endif
-int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode);
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry);
int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
int mask);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7de8718c68a9..aafe98ee0b21 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -619,10 +619,10 @@ out:
/*
* ntfs_set_acl - inode_operations::set_acl
*/
-int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
+ return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false);
}
/*
@@ -664,8 +664,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
/*
* ntfs_acl_chmod - Helper for ntfs3_setattr().
*/
-int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
if (!(sb->s_flags & SB_POSIXACL))
@@ -674,7 +675,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- return posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ return posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
/*
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 23a72a423955..9f19cf9a5a9f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
-int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct buffer_head *bh = NULL;
int status, had_lock;
struct ocfs2_lock_holder oh;
+ struct inode *inode = d_inode(dentry);
had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
if (had_lock < 0)
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 95a57c888ab6..a897c4e41b26 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -17,7 +17,7 @@ struct ocfs2_acl_entry {
};
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu);
-int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b13d344d40b6..60b97c92e2b2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
/* negotiate timeout must be less than write timeout. */
schedule_delayed_work(&reg->hr_nego_timeout_work,
msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
- memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
+ bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES);
}
static void o2hb_disarm_timeout(struct o2hb_region *reg)
@@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (reg->hr_last_hb_status)
return;
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
/* lowest node as master node to make negotiate decision. */
master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES);
@@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
config_item_name(&reg->hr_item), reg->hr_bdev);
set_bit(master_node, reg->hr_nego_node_bitmap);
}
- if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
- sizeof(reg->hr_nego_node_bitmap))) {
+ if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
+ O2NM_MAX_NODES)) {
/* check negotiate bitmap every second to do timeout
* approve decision.
*/
@@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* live nodes heartbeat on it. In other words, the region has been
* added to all nodes.
*/
- if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
- sizeof(o2hb_live_node_bitmap)))
+ if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ O2NM_MAX_NODES))
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
@@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* If a node is not configured but is in the livemap, we still need
* to read the slot so as to be able to remove it from the livemap.
*/
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
i = -1;
while ((i = find_next_bit(live_node_bitmap,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
@@ -1437,11 +1437,11 @@ void o2hb_init(void)
for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
INIT_LIST_HEAD(&o2hb_live_slots[i]);
- memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
- memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
- memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
- memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
- memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+ bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES);
+ bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS);
o2hb_dependent_users = 0;
@@ -1450,23 +1450,21 @@ void o2hb_init(void)
/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
- unsigned bytes)
+ unsigned int bits)
{
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memcpy(map, &o2hb_live_node_bitmap, bytes);
+ bitmap_copy(map, o2hb_live_node_bitmap, bits);
}
/*
* get a map of all nodes that are heartbeating in any regions
*/
-void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+void o2hb_fill_node_map(unsigned long *map, unsigned int bits)
{
/* callers want to serialize this map and callbacks so that they
* can trust that they don't miss nodes coming to the party */
down_read(&o2hb_callback_sem);
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(map, bytes);
+ o2hb_fill_node_map_from_callback(map, bits);
spin_unlock(&o2hb_live_lock);
up_read(&o2hb_callback_sem);
}
@@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num)
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
@@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 1d4100abf6f8..8ef8c1b9eeb7 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid,
void o2hb_unregister_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
- unsigned bytes);
+ unsigned int bits);
void o2hb_exit(void);
void o2hb_init(void);
int o2hb_check_node_heartbeating_no_sem(u8 node_num);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7524994e3199..35c05c18de59 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len)
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
int i = -1, out = 0;
- o2net_fill_node_map(map, sizeof(map));
+ o2net_fill_node_map(map, O2NM_MAX_NODES);
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 27fee68f860a..2f61d39e4e50 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
return -EINVAL;
read_lock(&cluster->cl_nodes_lock);
- memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+ bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES);
read_unlock(&cluster->cl_nodes_lock);
return 0;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index f660c0dbdb63..37d222bdfc8c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
struct kvec vec = { .iov_len = len, .iov_base = data, };
struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
- iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len);
return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}
@@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
}
/* Get a map of all nodes to which this node is currently connected to */
-void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+void o2net_fill_node_map(unsigned long *map, unsigned int bits)
{
struct o2net_sock_container *sc;
int node, ret;
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memset(map, 0, bytes);
+ bitmap_zero(map, bits);
for (node = 0; node < O2NM_MAX_NODES; ++node) {
if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
continue;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fd2022712167..20f790a47484 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err)
static inline void dlm_node_iter_init(unsigned long *map,
struct dlm_node_iter *iter)
{
- memcpy(iter->node_map, map, sizeof(iter->node_map));
+ bitmap_copy(iter->node_map, map, O2NM_MAX_NODES);
iter->curnode = -1;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c4eccd499db8..5c04dde99981 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm,
spin_lock(&dlm->spinlock);
/* For now, we restart the process if the node maps have
* changed at all */
- ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
- sizeof(dlm->live_nodes_map));
+ ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map,
+ O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
if (ret)
@@ -1604,13 +1604,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
- o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+ o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES);
spin_lock(&dlm->spinlock);
- memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
-
+ bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES);
__dlm_set_joining_node(dlm, dlm->node_num);
-
spin_unlock(&dlm->spinlock);
node = -1;
@@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
* yes_resp_map. Copy that into our domain map and send a join
* assert message to clean up everyone elses state. */
spin_lock(&dlm->spinlock);
- memcpy(dlm->domain_map, ctxt->yes_resp_map,
- sizeof(ctxt->yes_resp_map));
+ bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES);
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
@@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
dlm->recovery_map, &(dlm->recovery_map[0]));
- memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
- memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
- memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+ bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->domain_map, O2NM_MAX_NODES);
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 227da5b1b6ab..d610da8e2f24 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->type = type;
INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
spin_lock_init(&mle->spinlock);
init_waitqueue_head(&mle->wq);
atomic_set(&mle->woken, 0);
kref_init(&mle->mle_refs);
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
@@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
atomic_inc(&dlm->mle_cur_count[mle->type]);
/* copy off the node_map and register hb callbacks on our copy */
- memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
- memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+ bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES);
+ bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES);
clear_bit(dlm->node_num, mle->vote_map);
clear_bit(dlm->node_num, mle->node_map);
@@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
- memset(res->refmap, 0, sizeof(res->refmap));
+ bitmap_zero(res->refmap, O2NM_MAX_NODES);
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -1036,10 +1036,10 @@ recheck:
spin_lock(&mle->spinlock);
m = mle->master;
- map_changed = (memcmp(mle->vote_map, mle->node_map,
- sizeof(mle->vote_map)) != 0);
- voting_done = (memcmp(mle->vote_map, mle->response_map,
- sizeof(mle->vote_map)) == 0);
+ map_changed = !bitmap_equal(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
+ voting_done = bitmap_equal(mle->vote_map, mle->response_map,
+ O2NM_MAX_NODES);
/* restart if we hit any errors */
if (map_changed) {
@@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
/* now blank out everything, as if we had never
* contacted anyone */
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
/* reset the vote_map to the current node_map */
- memcpy(mle->vote_map, mle->node_map,
- sizeof(mle->node_map));
+ bitmap_copy(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
/* put myself into the maybe map */
if (mle->type != DLM_MLE_BLOCK)
set_bit(dlm->node_num, mle->maybe_map);
@@ -2094,7 +2094,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
flags = item->u.am.flags;
spin_lock(&dlm->spinlock);
- memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+ bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
clear_bit(dlm->node_num, nodemap);
@@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = 0;
}
- memset(iter.node_map, 0, sizeof(iter.node_map));
+ bitmap_zero(iter.node_map, O2NM_MAX_NODES);
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 52ad342fec3e..50da8af988c1 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_reco_node_data *ndata;
spin_lock(&dlm->spinlock);
- memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+ bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES);
/* nodes can only be removed (by dying) after dropping
* this lock, and death will be trapped later, so this should do */
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9c67edd215d5..5c60b6bc85bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
}
- if (file && should_remove_suid(file->f_path.dentry)) {
+ if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
- if (should_remove_suid(dentry)) {
+ if (setattr_should_drop_suidgid(&init_user_ns, inode)) {
if (meta_level == 0) {
ocfs2_inode_unlock_for_extent_tree(inode,
&di_bh,
@@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.listxattr = ocfs2_listxattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
.fileattr_get = ocfs2_fileattr_get,
.fileattr_set = ocfs2_fileattr_set,
@@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 126671e6caed..3fb98b4569a2 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
replay_map->rm_state = REPLAY_DONE;
}
-static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 969d0aa28718..41c382f68529 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
+void ocfs2_free_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 05f32989bad6..a8fd51afb794 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2915,7 +2915,7 @@ const struct inode_operations ocfs2_dir_iops = {
.permission = ocfs2_permission,
.listxattr = ocfs2_listxattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
.fileattr_get = ocfs2_fileattr_get,
.fileattr_set = ocfs2_fileattr_set,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740b64238312..a503c553bab2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
u32 nlink = le16_to_cpu(di->i_links_count);
u32 hi = le16_to_cpu(di->i_links_count_hi);
- if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
- nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+ nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
return nlink;
}
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 88f75f7f02d7..c973c03f6fd8 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -273,17 +273,17 @@ static int o2cb_cluster_check(void)
*/
#define O2CB_MAP_STABILIZE_COUNT 60
for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
- o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ o2hb_fill_node_map(hbmap, O2NM_MAX_NODES);
if (!test_bit(node_num, hbmap)) {
printk(KERN_ERR "o2cb: %s heartbeat has not been "
"started.\n", (o2hb_global_heartbeat_active() ?
"Global" : "Local"));
return -EINVAL;
}
- o2net_fill_node_map(netmap, sizeof(netmap));
+ o2net_fill_node_map(netmap, O2NM_MAX_NODES);
/* Force set the current node to allow easy compare */
set_bit(node_num, netmap);
- if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES))
return 0;
if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 317126261523..a8d5ca98fa57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header;
static int __init ocfs2_stack_glue_init(void)
{
+ int ret;
+
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table);
@@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void)
return -ENOMEM; /* or something. */
}
- return ocfs2_sysfs_init();
+ ret = ocfs2_sysfs_init();
+ if (ret)
+ unregister_sysctl_table(ocfs2_table_header);
+
+ return ret;
}
static void __exit ocfs2_stack_glue_exit(void)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 42c993e53924..0b0e6a132101 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
out_dismount:
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
+ ocfs2_free_replay_slots(osb);
ocfs2_dismount_volume(sb, 1);
goto out;
@@ -1822,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb)
status = ocfs2_truncate_log_init(osb);
if (status < 0) {
mlog_errno(status);
- goto out_system_inodes;
+ goto out_check_volume;
}
ocfs2_super_unlock(osb, 1);
return 0;
+out_check_volume:
+ ocfs2_free_replay_slots(osb);
out_system_inodes:
if (osb->local_alloc_state == OCFS2_LA_ENABLED)
ocfs2_shutdown_local_alloc(osb);
diff --git a/fs/open.c b/fs/open.c
index a81319b6177f..9d0197db15e7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Remove suid, sgid, and file capabilities on truncate too */
- ret = dentry_needs_remove_privs(dentry);
+ ret = dentry_needs_remove_privs(mnt_userns, dentry);
if (ret < 0)
return ret;
if (ret)
@@ -723,10 +723,10 @@ retry_deleg:
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
- if (!S_ISDIR(inode->i_mode))
- newattrs.ia_valid |=
- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
+ setattr_should_drop_sgid(mnt_userns, inode);
/* Continue to send actual fs values, not the mount values. */
error = security_path_chown(
path,
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 605e5a3506ec..c5da2091cefb 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl,
- int type)
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int error = 0;
void *value = NULL;
@@ -119,12 +118,13 @@ out:
return error;
}
-int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
struct iattr iattr;
int rc;
+ struct inode *inode = d_inode(dentry);
memset(&iattr, 0, sizeof iattr);
@@ -153,46 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
rc = __orangefs_set_acl(inode, acl, type);
if (!rc && (iattr.ia_valid == ATTR_MODE))
- rc = __orangefs_setattr(inode, &iattr);
+ rc = __orangefs_setattr_mode(dentry, &iattr);
return rc;
}
-
-int orangefs_init_acl(struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- umode_t mode = inode->i_mode;
- struct iattr iattr;
- int error = 0;
-
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error)
- return error;
-
- if (default_acl) {
- error = __orangefs_set_acl(inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- } else {
- inode->i_default_acl = NULL;
- }
-
- if (acl) {
- if (!error)
- error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- } else {
- inode->i_acl = NULL;
- }
-
- /* If mode of the inode was changed, then do a forcible ->setattr */
- if (mode != inode->i_mode) {
- memset(&iattr, 0, sizeof iattr);
- inode->i_mode = mode;
- iattr.ia_mode = mode;
- iattr.ia_valid |= ATTR_MODE;
- __orangefs_setattr(inode, &iattr);
- }
-
- return error;
-}
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 7a8c0c6e698d..370bd3bbf5e4 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page,
bv.bv_len = wlen;
bv.bv_offset = off % PAGE_SIZE;
WARN_ON(wlen == 0);
- iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
+ iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
@@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
else
ow->bv[i].bv_offset = 0;
}
- iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+ iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
WARN_ON(ow->off >= len);
if (ow->off + ow->len > len)
@@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac)
offset = readahead_pos(rac);
i_pages = &rac->mapping->i_pages;
- iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
+ iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac));
/* read in the pages. */
if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
@@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
bv.bv_page = &folio->page;
bv.bv_len = folio_size(folio);
bv.bv_offset = 0;
- iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio));
+ iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio));
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
folio_size(folio), inode->i_size, NULL, NULL, file);
@@ -828,15 +828,23 @@ again:
spin_unlock(&inode->i_lock);
mark_inode_dirty(inode);
- if (iattr->ia_valid & ATTR_MODE)
- /* change mod on a file that has ACLs */
- ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
-
ret = 0;
out:
return ret;
}
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr)
+{
+ int ret;
+ struct inode *inode = d_inode(dentry);
+
+ ret = __orangefs_setattr(inode, iattr);
+ /* change mode on a file that has ACLs */
+ if (!ret && (iattr->ia_valid & ATTR_MODE))
+ ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+ return ret;
+}
+
/*
* Change attributes of an object referenced by dentry.
*/
@@ -849,7 +857,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ret = setattr_prepare(&init_user_ns, dentry, iattr);
if (ret)
goto out;
- ret = __orangefs_setattr(d_inode(dentry), iattr);
+ ret = __orangefs_setattr_mode(dentry, iattr);
sync_inode_metadata(d_inode(dentry), 1);
out:
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
@@ -967,7 +975,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns,
/* ORANGEFS2 implementation of VFS inode operations for files */
static const struct inode_operations orangefs_file_inode_operations = {
- .get_acl = orangefs_get_acl,
+ .get_inode_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
@@ -1097,8 +1105,9 @@ struct inode *orangefs_iget(struct super_block *sb,
* Allocate an inode for a newly created file and insert it into the inode hash.
*/
struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
- int mode, dev_t dev, struct orangefs_object_kref *ref)
+ umode_t mode, dev_t dev, struct orangefs_object_kref *ref)
{
+ struct posix_acl *acl = NULL, *default_acl = NULL;
unsigned long hash = orangefs_handle_hash(ref);
struct inode *inode;
int error;
@@ -1115,6 +1124,10 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
if (!inode)
return ERR_PTR(-ENOMEM);
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ goto out_iput;
+
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
@@ -1125,6 +1138,19 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_init_iops(inode);
inode->i_rdev = dev;
+ if (default_acl) {
+ error = __orangefs_set_acl(inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_iput;
+ }
+
+ if (acl) {
+ error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
+ goto out_iput;
+ }
+
error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
if (error < 0)
goto out_iput;
@@ -1132,10 +1158,22 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
gossip_debug(GOSSIP_INODE_DEBUG,
"Initializing ACL's for inode %pU\n",
get_khandle_from_ino(inode));
- orangefs_init_acl(inode, dir);
+ if (mode != inode->i_mode) {
+ struct iattr iattr = {
+ .ia_mode = mode,
+ .ia_valid = ATTR_MODE,
+ };
+ inode->i_mode = mode;
+ __orangefs_setattr(inode, &iattr);
+ __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ }
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
return inode;
out_iput:
iput(inode);
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
return ERR_PTR(error);
}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 600e8eee541f..75c1a3dcf68c 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns,
/* ORANGEFS implementation of VFS inode operations for directories */
const struct inode_operations orangefs_dir_inode_operations = {
.lookup = orangefs_lookup,
- .get_acl = orangefs_get_acl,
+ .get_inode_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.create = orangefs_create,
.unlink = orangefs_unlink,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b5940ec1836a..6e0cc01b3a14 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,13 +103,13 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
-extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
extern const struct xattr_handler *orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl,
+ struct dentry *dentry, struct posix_acl *acl,
int type);
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
/*
* orangefs data structures
@@ -356,11 +356,12 @@ void fsid_key_table_finalize(void);
vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
struct inode *orangefs_new_inode(struct super_block *sb,
struct inode *dir,
- int mode,
+ umode_t mode,
dev_t dev,
struct orangefs_object_kref *ref);
int __orangefs_setattr(struct inode *, struct iattr *);
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr);
int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index dd188c7996b3..6708e54b0e30 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO
depends on 64BIT
help
If this config option is enabled then overlay filesystems will use
- unused high bits in undelying filesystem inode numbers to map all
+ unused high bits in underlying filesystem inode numbers to map all
inodes to a unified address space. The mapped 64bit inode numbers
might not be compatible with applications that expect 32bit inodes.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index f436d8847f08..6e4e65ee050d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,6 +44,35 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
+static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path,
+ struct dentry *dentry, const char *acl_name)
+{
+ int err;
+ struct posix_acl *clone, *real_acl = NULL;
+
+ real_acl = ovl_get_acl_path(path, acl_name, false);
+ if (!real_acl)
+ return 0;
+
+ if (IS_ERR(real_acl)) {
+ err = PTR_ERR(real_acl);
+ if (err == -ENODATA || err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ clone = posix_acl_clone(real_acl, GFP_KERNEL);
+ posix_acl_release(real_acl); /* release original acl */
+ if (!clone)
+ return -ENOMEM;
+
+ err = ovl_do_set_acl(ofs, dentry, acl_name, clone);
+
+ /* release cloned acl */
+ posix_acl_release(clone);
+ return err;
+}
+
int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
{
struct dentry *old = oldpath->dentry;
@@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de
error = 0;
continue; /* Discard */
}
+
+ if (is_posix_acl_xattr(name)) {
+ error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name);
+ if (!error)
+ continue;
+ /* POSIX ACLs must be copied. */
+ break;
+ }
+
retry:
size = ovl_do_getxattr(oldpath, name, value, value_size);
if (size == -ERANGE)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6b03457f72bb..f61e37f4c8ff 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -435,28 +435,12 @@ out:
}
static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
- const char *name, const struct posix_acl *acl)
+ const char *acl_name, struct posix_acl *acl)
{
- void *buffer;
- size_t size;
- int err;
-
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
return 0;
- size = posix_acl_xattr_size(acl->a_count);
- buffer = kmalloc(size, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
-
- err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- if (err < 0)
- goto out_free;
-
- err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
-out_free:
- kfree(buffer);
- return err;
+ return ovl_do_set_acl(ofs, upperdentry, acl_name, acl);
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
@@ -592,28 +576,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
goto out_revert_creds;
}
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (override_cred) {
+ if (!attr->hardlink) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_revert_creds;
+ /*
+ * In the creation cases(create, mkdir, mknod, symlink),
+ * ovl should transfer current's fs{u,g}id to underlying
+ * fs. Because underlying fs want to initialize its new
+ * inode owner using current's fs{u,g}id. And in this
+ * case, the @inode is a new inode that is initialized
+ * in inode_init_owner() to current's fs{u,g}id. So use
+ * the inode's i_{u,g}id to override the cred's fs{u,g}id.
+ *
+ * But in the other hardlink case, ovl_link() does not
+ * create a new inode, so just use the ovl mounter's
+ * fs{u,g}id.
+ */
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
- if (!attr->hardlink) {
- err = security_dentry_create_files_as(dentry,
- attr->mode, &dentry->d_name, old_cred,
- override_cred);
- if (err) {
- put_cred(override_cred);
- goto out_revert_creds;
- }
+ err = security_dentry_create_files_as(dentry,
+ attr->mode, &dentry->d_name, old_cred,
+ override_cred);
+ if (err) {
+ put_cred(override_cred);
+ goto out_revert_creds;
}
put_cred(override_creds(override_cred));
put_cred(override_cred);
-
- if (!ovl_dentry_is_whiteout(dentry))
- err = ovl_create_upper(dentry, inode, attr);
- else
- err = ovl_create_over_whiteout(dentry, inode, attr);
}
+
+ if (!ovl_dentry_is_whiteout(dentry))
+ err = ovl_create_upper(dentry, inode, attr);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, attr);
+
out_revert_creds:
revert_creds(old_cred);
return err;
@@ -1311,7 +1309,9 @@ const struct inode_operations ovl_dir_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
.fileattr_get = ovl_fileattr_get,
.fileattr_set = ovl_fileattr_set,
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index e065a5b9a442..a25bb3453dde 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -339,7 +339,7 @@ out_iput:
return dentry;
}
-/* Get the upper or lower dentry in stach whose on layer @idx */
+/* Get the upper or lower dentry in stack whose on layer @idx */
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
/* Get connected upper overlay dir from index */
if (index) {
- struct dentry *upper = ovl_index_upper(ofs, index);
+ struct dentry *upper = ovl_index_upper(ofs, index, true);
dput(index);
if (IS_ERR_OR_NULL(upper))
@@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
/* Then try to get a connected upper dir by index */
if (index && d_is_dir(index)) {
- struct dentry *upper = ovl_index_upper(ofs, index);
+ struct dentry *upper = ovl_index_upper(ofs, index, true);
err = PTR_ERR(upper);
if (IS_ERR_OR_NULL(upper))
@@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
return ERR_PTR(-ENOMEM);
/* Copy unaligned inner fh into aligned buffer */
- memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET);
+ memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET);
return fh;
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index a1a22f58ba18..c9d0c362c7ef 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -34,7 +34,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modificaton nor notify on underlying */
+/* No atime modification nor notify on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
static struct file *ovl_open_realfile(const struct file *file,
@@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
spin_lock(&file->f_lock);
file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags;
+ file->f_iocb_flags = iocb_flags(file);
spin_unlock(&file->f_lock);
return 0;
@@ -517,9 +518,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
const struct cred *old_cred;
int ret;
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+
ret = ovl_real_fdget(file, &real);
if (ret)
- return ret;
+ goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fallocate(real.file, mode, offset, len);
@@ -530,6 +538,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
fdput(real);
+out_unlock:
+ inode_unlock(inode);
+
return ret;
}
@@ -567,14 +578,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
const struct cred *old_cred;
loff_t ret;
+ inode_lock(inode_out);
+ if (op != OVL_DEDUPE) {
+ /* Update mode */
+ ovl_copyattr(inode_out);
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
- return ret;
+ goto out_unlock;
ret = ovl_real_fdget(file_in, &real_in);
if (ret) {
fdput(real_out);
- return ret;
+ goto out_unlock;
}
old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
@@ -603,6 +623,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
fdput(real_in);
fdput(real_out);
+out_unlock:
+ inode_unlock(inode_out);
+
return ret;
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 9e61511de7a7..ee6dfa577c93 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -14,6 +14,8 @@
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
@@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
* of the POSIX ACLs retrieved from the lower layer to this function to not
* alter the POSIX ACLs for the underlying filesystem.
*/
-static void ovl_idmap_posix_acl(struct inode *realinode,
+static void ovl_idmap_posix_acl(const struct inode *realinode,
struct user_namespace *mnt_userns,
struct posix_acl *acl)
{
@@ -485,6 +487,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
}
/*
+ * The @noperm argument is used to skip permission checking and is a temporary
+ * measure. Quoting Miklos from an earlier discussion:
+ *
+ * > So there are two paths to getting an acl:
+ * > 1) permission checking and 2) retrieving the value via getxattr(2).
+ * > This is a similar situation as reading a symlink vs. following it.
+ * > When following a symlink overlayfs always reads the link on the
+ * > underlying fs just as if it was a readlink(2) call, calling
+ * > security_inode_readlink() instead of security_inode_follow_link().
+ * > This is logical: we are reading the link from the underlying storage,
+ * > and following it on overlayfs.
+ * >
+ * > Applying the same logic to acl: we do need to call the
+ * > security_inode_getxattr() on the underlying fs, even if just want to
+ * > check permissions on overlay. This is currently not done, which is an
+ * > inconsistency.
+ * >
+ * > Maybe adding the check to ovl_get_acl() is the right way to go, but
+ * > I'm a little afraid of a performance regression. Will look into that.
+ *
+ * Until we have made a decision allow this helper to take the @noperm
+ * argument. We should hopefully be able to remove it soon.
+ */
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name, bool noperm)
+{
+ struct posix_acl *real_acl, *clone;
+ struct user_namespace *mnt_userns;
+ struct inode *realinode = d_inode(path->dentry);
+
+ mnt_userns = mnt_user_ns(path->mnt);
+
+ if (noperm)
+ real_acl = get_inode_acl(realinode, posix_acl_type(acl_name));
+ else
+ real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name);
+ if (IS_ERR_OR_NULL(real_acl))
+ return real_acl;
+
+ if (!is_idmapped_mnt(path->mnt))
+ return real_acl;
+
+ /*
+ * We cannot alter the ACLs returned from the relevant layer as that
+ * would alter the cached values filesystem wide for the lower
+ * filesystem. Instead we can clone the ACLs and then apply the
+ * relevant idmapping of the layer.
+ */
+ clone = posix_acl_clone(real_acl, GFP_KERNEL);
+ posix_acl_release(real_acl); /* release original acl */
+ if (!clone)
+ return ERR_PTR(-ENOMEM);
+
+ ovl_idmap_posix_acl(realinode, mnt_userns, clone);
+ return clone;
+}
+
+/*
* When the relevant layer is an idmapped mount we need to take the idmapping
* of the layer into account and translate any ACL_{GROUP,USER} values
* according to the idmapped mount.
@@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
*
* This is obviously only relevant when idmapped layers are used.
*/
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
+ bool rcu, bool noperm)
{
struct inode *realinode = ovl_inode_real(inode);
- struct posix_acl *acl, *clone;
+ struct posix_acl *acl;
struct path realpath;
if (!IS_POSIXACL(realinode))
@@ -512,40 +574,115 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
}
if (rcu) {
+ /*
+ * If the layer is idmapped drop out of RCU path walk
+ * so we can clone the ACLs.
+ */
+ if (is_idmapped_mnt(realpath.mnt))
+ return ERR_PTR(-ECHILD);
+
acl = get_cached_acl_rcu(realinode, type);
} else {
const struct cred *old_cred;
old_cred = ovl_override_creds(inode->i_sb);
- acl = get_acl(realinode, type);
+ acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
revert_creds(old_cred);
}
- /*
- * If there are no POSIX ACLs, or we encountered an error,
- * or the layer isn't idmapped we don't need to do anything.
- */
- if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
- return acl;
+
+ return acl;
+}
+
+static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ int err;
+ struct path realpath;
+ const char *acl_name;
+ const struct cred *old_cred;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_dentry_upper(dentry);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+
+ err = ovl_want_write(dentry);
+ if (err)
+ return err;
/*
- * We only get here if the layer is idmapped. So drop out of RCU path
- * walk so we can clone the ACLs. There's no need to release the ACLs
- * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+ * If ACL is to be removed from a lower file, check if it exists in
+ * the first place before copying it up.
*/
- if (rcu)
- return ERR_PTR(-ECHILD);
+ acl_name = posix_acl_xattr_name(type);
+ if (!acl && !upperdentry) {
+ struct posix_acl *real_acl;
- clone = posix_acl_clone(acl, GFP_KERNEL);
- if (!clone)
- clone = ERR_PTR(-ENOMEM);
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry,
+ acl_name);
+ revert_creds(old_cred);
+ if (IS_ERR(real_acl)) {
+ err = PTR_ERR(real_acl);
+ goto out_drop_write;
+ }
+ posix_acl_release(real_acl);
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (acl)
+ err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
- ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
+ err = ovl_do_remove_acl(ofs, realdentry, acl_name);
+ revert_creds(old_cred);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+ return err;
+}
+
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int err;
+ struct inode *inode = d_inode(dentry);
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *realinode = ovl_inode_real(inode);
+
+ if (!IS_POSIXACL(d_inode(workdir)))
+ return -EOPNOTSUPP;
+ if (!realinode->i_op->set_acl)
+ return -EOPNOTSUPP;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ if (!inode_owner_or_capable(&init_user_ns, inode))
+ return -EPERM;
+
/*
- * Since we're not in RCU path walk we always need to release the
- * original ACLs.
+ * Check if sgid bit needs to be cleared (actual setacl operation will
+ * be done with mounter's capabilities and so that won't do it for us).
*/
- posix_acl_release(acl);
- return clone;
+ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
+ !in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
+ struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
+
+ err = ovl_setattr(&init_user_ns, dentry, &iattr);
+ if (err)
+ return err;
+ }
+
+ return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif
@@ -721,7 +858,9 @@ static const struct inode_operations ovl_file_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
.fiemap = ovl_fiemap,
.fileattr_get = ovl_fileattr_get,
@@ -741,7 +880,9 @@ static const struct inode_operations ovl_special_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 0fd1d5fdfc72..46753134533a 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -487,7 +487,8 @@ fail:
}
/* Get upper dentry from index */
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+ bool connected)
{
struct ovl_fh *fh;
struct dentry *upper;
@@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
- upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected);
kfree(fh);
if (IS_ERR_OR_NULL(upper))
@@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
* directly from the index dentry, but for dir index we first need to
* decode the upper directory.
*/
- upper = ovl_index_upper(ofs, index);
+ upper = ovl_index_upper(ofs, index, false);
if (IS_ERR_OR_NULL(upper)) {
err = PTR_ERR(upper);
/*
@@ -1085,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.mnt = ovl_upper_mnt(ofs),
};
+ /*
+ * It's safe to assign upperredirect here: the previous
+ * assignment of happens only if upperdentry is non-NULL, and
+ * this one only if upperdentry is NULL.
+ */
upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
if (IS_ERR(upperredirect)) {
err = PTR_ERR(upperredirect);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index eee8f08d32b6..1df7f850ff3b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,6 +8,8 @@
#include <linux/uuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "ovl_entry.h"
#undef pr_fmt
@@ -108,7 +110,7 @@ struct ovl_fh {
u8 padding[3]; /* make sure fb.fid is 32bit aligned */
union {
struct ovl_fb fb;
- u8 buf[0];
+ DECLARE_FLEX_ARRAY(u8, buf);
};
} __packed;
@@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
}
+static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *acl_name, struct posix_acl *acl)
+{
+ return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl);
+}
+
+static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *acl_name)
+{
+ return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name);
+}
+
static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
struct dentry *olddentry, struct inode *newdir,
struct dentry *newdentry, unsigned int flags)
@@ -401,7 +415,7 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
-u64 ovl_dentry_version_get(struct dentry *dentry);
+u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
@@ -525,7 +539,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
enum ovl_xattr ox, struct dentry *real, bool is_upper,
bool set);
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+ bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct qstr *name);
@@ -570,9 +585,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs);
* lower dir was removed under it and possibly before it was rotated from upper
* to lower layer.
*/
-static inline bool ovl_dir_is_real(struct dentry *dir)
+static inline bool ovl_dir_is_real(struct inode *dir)
{
- return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
+ return !ovl_test_flag(OVL_WHITEOUTS, dir);
}
/* inode.c */
@@ -594,9 +609,33 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
#ifdef CONFIG_FS_POSIX_ACL
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
+ bool rcu, bool noperm);
+static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type,
+ bool rcu)
+{
+ return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true);
+}
+static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false);
+}
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name, bool noperm);
#else
-#define ovl_get_acl NULL
+#define ovl_get_inode_acl NULL
+#define ovl_get_acl NULL
+#define ovl_set_acl NULL
+static inline struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name,
+ bool noperm)
+{
+ return NULL;
+}
#endif
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 2b210640036c..8cd2b9947de1 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -235,15 +235,15 @@ void ovl_dir_cache_free(struct inode *inode)
}
}
-static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
- if (ovl_dir_cache(d_inode(dentry)) == cache)
- ovl_set_dir_cache(d_inode(dentry), NULL);
+ if (ovl_dir_cache(inode) == cache)
+ ovl_set_dir_cache(inode, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
@@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
- struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = file_inode(file);
bool is_real;
- if (cache && ovl_dentry_version_get(dentry) != cache->version) {
- ovl_cache_put(od, dentry);
+ if (cache && ovl_inode_version_get(inode) != cache->version) {
+ ovl_cache_put(od, inode);
od->cache = NULL;
od->cursor = NULL;
}
- is_real = ovl_dir_is_real(dentry);
+ is_real = ovl_dir_is_real(inode);
if (od->is_real != is_real) {
/* is_real can only become false when dir is copied up */
if (WARN_ON(is_real))
@@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
+ struct inode *inode = d_inode(dentry);
- cache = ovl_dir_cache(d_inode(dentry));
- if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ cache = ovl_dir_cache(inode);
+ if (cache && ovl_inode_version_get(inode) == cache->version) {
WARN_ON(!cache->refcount);
cache->refcount++;
return cache;
@@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
return ERR_PTR(res);
}
- cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(d_inode(dentry), cache);
+ cache->version = ovl_inode_version_get(inode);
+ ovl_set_dir_cache(inode, cache);
return cache;
}
@@ -596,16 +597,17 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
int res;
struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_dir_cache *cache;
- cache = ovl_dir_cache(d_inode(dentry));
- if (cache && ovl_dentry_version_get(dentry) == cache->version)
+ cache = ovl_dir_cache(inode);
+ if (cache && ovl_inode_version_get(inode) == cache->version)
return cache;
/* Impure cache is not refcounted, free it here */
- ovl_dir_cache_free(d_inode(dentry));
- ovl_set_dir_cache(d_inode(dentry), NULL);
+ ovl_dir_cache_free(inode);
+ ovl_set_dir_cache(inode, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
@@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
OVL_XATTR_IMPURE);
ovl_drop_write(dentry);
}
- ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
+ ovl_clear_flag(OVL_IMPURE, inode);
kfree(cache);
return NULL;
}
- cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(d_inode(dentry), cache);
+ cache->version = ovl_inode_version_get(inode);
+ ovl_set_dir_cache(inode, cache);
return cache;
}
@@ -675,7 +677,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
static bool ovl_is_impure_dir(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
- struct inode *dir = d_inode(file->f_path.dentry);
+ struct inode *dir = file_inode(file);
/*
* Only upper dir can be impure, but if we are in the middle of
@@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
struct file *realfile;
int err;
- err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb));
+ err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
if (err <= 0)
return err;
@@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
if (od->cache) {
inode_lock(inode);
- ovl_cache_put(od, file->f_path.dentry);
+ ovl_cache_put(od, inode);
inode_unlock(inode);
}
fput(od->realfile);
@@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
return PTR_ERR(realfile);
}
od->realfile = realfile;
- od->is_real = ovl_dir_is_real(file->f_path.dentry);
+ od->is_real = ovl_dir_is_real(inode);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
@@ -1071,14 +1073,10 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
int err;
struct inode *dir = path->dentry->d_inode;
LIST_HEAD(list);
- struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
- .ctx.actor = ovl_fill_merge,
- .dentry = NULL,
+ .ctx.actor = ovl_fill_plain,
.list = &list,
- .root = &root,
- .is_lowest = false,
};
bool incompat = false;
@@ -1159,14 +1157,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct inode *dir = indexdir->d_inode;
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
LIST_HEAD(list);
- struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
- .ctx.actor = ovl_fill_merge,
- .dentry = NULL,
+ .ctx.actor = ovl_fill_plain,
.list = &list,
- .root = &root,
- .is_lowest = false,
};
err = ovl_dir_read(&path, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a29a8afe9b26..85b891152a2c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -139,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
int ret = 1;
- upper = ovl_dentry_upper(dentry);
+ /* Careful in RCU mode */
+ if (!inode)
+ return -ECHILD;
+
+ upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
@@ -813,13 +818,11 @@ retry:
* allowed as upper are limited to "normal" ones, where checking
* for the above two errors is sufficient.
*/
- err = ovl_do_removexattr(ofs, work,
- XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
- err = ovl_do_removexattr(ofs, work,
- XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
@@ -1001,83 +1004,6 @@ static unsigned int ovl_split_lowerdirs(char *str)
return ctr;
}
-static int __maybe_unused
-ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
-}
-
-static int __maybe_unused
-ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct dentry *workdir = ovl_workdir(dentry);
- struct inode *realinode = ovl_inode_real(inode);
- struct posix_acl *acl = NULL;
- int err;
-
- /* Check that everything is OK before copy-up */
- if (value) {
- /* The above comment can be understood in two ways:
- *
- * 1. We just want to check whether the basic POSIX ACL format
- * is ok. For example, if the header is correct and the size
- * is sane.
- * 2. We want to know whether the ACL_{GROUP,USER} entries can
- * be mapped according to the underlying filesystem.
- *
- * Currently, we only check 1. If we wanted to check 2. we
- * would need to pass the mnt_userns and the fs_userns of the
- * underlying filesystem. But frankly, I think checking 1. is
- * enough to start the copy-up.
- */
- acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- err = -EOPNOTSUPP;
- if (!IS_POSIXACL(d_inode(workdir)))
- goto out_acl_release;
- if (!realinode->i_op->set_acl)
- goto out_acl_release;
- if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
- err = acl ? -EACCES : 0;
- goto out_acl_release;
- }
- err = -EPERM;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- goto out_acl_release;
-
- posix_acl_release(acl);
-
- /*
- * Check if sgid bit needs to be cleared (actual setacl operation will
- * be done with mounter's capabilities and so that won't do it for us).
- */
- if (unlikely(inode->i_mode & S_ISGID) &&
- handler->flags == ACL_TYPE_ACCESS &&
- !in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
- struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
-
- err = ovl_setattr(&init_user_ns, dentry, &iattr);
- if (err)
- return err;
- }
-
- err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags);
- return err;
-
-out_acl_release:
- posix_acl_release(acl);
- return err;
-}
-
static int ovl_own_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -1110,22 +1036,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler,
return ovl_xattr_set(dentry, inode, name, value, size, flags);
}
-static const struct xattr_handler __maybe_unused
-ovl_posix_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = ovl_posix_acl_xattr_get,
- .set = ovl_posix_acl_xattr_set,
-};
-
-static const struct xattr_handler __maybe_unused
-ovl_posix_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = ovl_posix_acl_xattr_get,
- .set = ovl_posix_acl_xattr_set,
-};
-
static const struct xattr_handler ovl_own_trusted_xattr_handler = {
.prefix = OVL_XATTR_TRUSTED_PREFIX,
.get = ovl_own_xattr_get,
@@ -1146,8 +1056,8 @@ static const struct xattr_handler ovl_other_xattr_handler = {
static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
- &ovl_posix_acl_access_xattr_handler,
- &ovl_posix_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&ovl_own_trusted_xattr_handler,
&ovl_other_xattr_handler,
@@ -1156,8 +1066,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
static const struct xattr_handler *ovl_user_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
- &ovl_posix_acl_access_xattr_handler,
- &ovl_posix_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&ovl_own_user_xattr_handler,
&ovl_other_xattr_handler,
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 81a57a8d80d9..bde291623c8c 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
* which have been copied up and have origins), so only need to note
* changes to impure entries.
*/
- if (!ovl_dir_is_real(dentry) || impurity)
+ if (!ovl_dir_is_real(inode) || impurity)
OVL_I(inode)->version++;
}
@@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity)
ovl_dir_version_inc(dentry, impurity);
}
-u64 ovl_dentry_version_get(struct dentry *dentry)
+u64 ovl_inode_version_get(struct inode *inode)
{
- struct inode *inode = d_inode(dentry);
-
WARN_ON(!inode_is_locked(inode));
return OVL_I(inode)->version;
}
@@ -1104,13 +1102,18 @@ void ovl_copyattr(struct inode *inode)
struct path realpath;
struct inode *realinode;
struct user_namespace *real_mnt_userns;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
ovl_i_path_real(inode, &realpath);
realinode = d_inode(realpath.dentry);
real_mnt_userns = mnt_user_ns(realpath.mnt);
- inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode);
- inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode);
+ vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode);
+ vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode);
+
+ inode->i_uid = vfsuid_into_kuid(vfsuid);
+ inode->i_gid = vfsgid_into_kgid(vfsgid);
inode->i_mode = realinode->i_mode;
inode->i_atime = realinode->i_atime;
inode->i_mtime = realinode->i_mtime;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 74dc0f571dc9..d7bc81fc0840 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -25,6 +25,11 @@
#include <linux/namei.h>
#include <linux/mnt_idmapping.h>
#include <linux/iversion.h>
+#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
@@ -64,7 +69,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
if (acl == ACL_DONT_CACHE) {
struct posix_acl *ret;
- ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
+ ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU);
if (!IS_ERR(ret))
acl = ret;
}
@@ -106,15 +111,17 @@ void forget_all_cached_acls(struct inode *inode)
}
EXPORT_SYMBOL(forget_all_cached_acls);
-struct posix_acl *get_acl(struct inode *inode, int type)
+static struct posix_acl *__get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ int type)
{
- void *sentinel;
+ struct posix_acl *sentinel;
struct posix_acl **p;
struct posix_acl *acl;
/*
* The sentinel is used to detect when another operation like
- * set_cached_acl() or forget_cached_acl() races with get_acl().
+ * set_cached_acl() or forget_cached_acl() races with get_inode_acl().
* It is guaranteed that is_uncached_acl(sentinel) is true.
*/
@@ -133,25 +140,27 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* current value of the ACL will not be ACL_NOT_CACHED and so our own
* sentinel will not be set; another task will update the cache. We
* could wait for that other task to complete its job, but it's easier
- * to just call ->get_acl to fetch the ACL ourself. (This is going to
- * be an unlikely race.)
+ * to just call ->get_inode_acl to fetch the ACL ourself. (This is
+ * going to be an unlikely race.)
*/
cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
- * Normally, the ACL returned by ->get_acl will be cached.
+ * Normally, the ACL returned by ->get{_inode}_acl will be cached.
* A filesystem can prevent that by calling
- * forget_cached_acl(inode, type) in ->get_acl.
+ * forget_cached_acl(inode, type) in ->get{_inode}_acl.
*
- * If the filesystem doesn't have a get_acl() function at all, we'll
- * just create the negative cache entry.
+ * If the filesystem doesn't have a get{_inode}_ acl() function at all,
+ * we'll just create the negative cache entry.
*/
- if (!inode->i_op->get_acl) {
+ if (dentry && inode->i_op->get_acl) {
+ acl = inode->i_op->get_acl(mnt_userns, dentry, type);
+ } else if (inode->i_op->get_inode_acl) {
+ acl = inode->i_op->get_inode_acl(inode, type, false);
+ } else {
set_cached_acl(inode, type, NULL);
return NULL;
}
- acl = inode->i_op->get_acl(inode, type, false);
-
if (IS_ERR(acl)) {
/*
* Remove our sentinel so that we don't block future attempts
@@ -169,7 +178,12 @@ struct posix_acl *get_acl(struct inode *inode, int type)
posix_acl_release(acl);
return acl;
}
-EXPORT_SYMBOL(get_acl);
+
+struct posix_acl *get_inode_acl(struct inode *inode, int type)
+{
+ return __get_acl(&init_user_ns, NULL, inode, type);
+}
+EXPORT_SYMBOL(get_inode_acl);
/*
* Init a fresh posix_acl
@@ -578,19 +592,20 @@ EXPORT_SYMBOL(__posix_acl_chmod);
* posix_acl_chmod - chmod a posix acl
*
* @mnt_userns: user namespace of the mount @inode was found from
- * @inode: inode to check permissions on
+ * @dentry: dentry to check permissions on
* @mode: the new mode of @inode
*
- * If the inode has been found through an idmapped mount the user namespace of
+ * If the dentry has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
int
- posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode,
+ posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry,
umode_t mode)
{
+ struct inode *inode = d_inode(dentry);
struct posix_acl *acl;
int ret = 0;
@@ -599,7 +614,7 @@ int
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR_OR_NULL(acl)) {
if (acl == ERR_PTR(-EOPNOTSUPP))
return 0;
@@ -609,7 +624,7 @@ int
ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
if (ret)
return ret;
- ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS);
+ ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
return ret;
}
@@ -629,7 +644,7 @@ posix_acl_create(struct inode *dir, umode_t *mode,
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
return 0;
- p = get_acl(dir, ACL_TYPE_DEFAULT);
+ p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
*mode &= ~current_umask();
return 0;
@@ -732,118 +747,32 @@ static int posix_acl_fix_xattr_common(const void *value, size_t size)
return count;
}
-void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
- const struct inode *inode,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- struct user_namespace *fs_userns = i_user_ns(inode);
- int count;
- vfsuid_t vfsuid;
- vfsgid_t vfsgid;
- kuid_t uid;
- kgid_t gid;
-
- if (no_idmapping(mnt_userns, i_user_ns(inode)))
- return;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count <= 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsuid = make_vfsuid(mnt_userns, fs_userns, uid);
- entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
- vfsuid_into_kuid(vfsuid)));
- break;
- case ACL_GROUP:
- gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsgid = make_vfsgid(mnt_userns, fs_userns, gid);
- entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
- vfsgid_into_kgid(vfsgid)));
- break;
- default:
- break;
- }
- }
-}
-
-static void posix_acl_fix_xattr_userns(
- struct user_namespace *to, struct user_namespace *from,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- int count;
- kuid_t uid;
- kgid_t gid;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count <= 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch(le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kuid(to, uid));
- break;
- case ACL_GROUP:
- gid = make_kgid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kgid(to, gid));
- break;
- default:
- break;
- }
- }
-}
-
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
-}
-
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
-}
-
/**
- * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the
- * provided callbacks to map ACL_{GROUP,USER} entries into the
- * appropriate format
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
+ * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
+ * @userns: the filesystem's idmapping
* @value: the uapi representation of POSIX ACLs
* @size: the size of @void
- * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries
- * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries
*
- * The make_posix_acl() helper is an abstraction to translate from uapi format
- * into the VFS format allowing the caller to specific callbacks to map
- * ACL_{GROUP,USER} entries into the expected format. This is used in
- * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code
- * duplication.
+ * Filesystems that store POSIX ACLs in the unaltered uapi format should use
+ * posix_acl_from_xattr() when reading them from the backing store and
+ * converting them into the struct posix_acl VFS format. The helper is
+ * specifically intended to be called from the acl inode operation.
+ *
+ * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
+ * in ACL_{GROUP,USER} entries into idmapping in @userns.
+ *
+ * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
+ * If it did it calling it from the get acl inode operation would return POSIX
+ * ACLs mapped according to an idmapped mount which would mean that the value
+ * couldn't be cached for the filesystem. Idmapped mounts are taken into
+ * account on the fly during permission checking or right at the VFS -
+ * userspace boundary before reporting them to the user.
*
* Return: Allocated struct posix_acl on success, NULL for a valid header but
* without actual POSIX ACL entries, or ERR_PTR() encoded error code.
*/
-static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns, const void *value, size_t size,
- kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *,
- const struct posix_acl_xattr_entry *),
- kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *,
- const struct posix_acl_xattr_entry *))
+struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
+ const void *value, size_t size)
{
const struct posix_acl_xattr_header *header = value;
const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
@@ -874,12 +803,14 @@ static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns,
break;
case ACL_USER:
- acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry);
+ acl_e->e_uid = make_kuid(userns,
+ le32_to_cpu(entry->e_id));
if (!uid_valid(acl_e->e_uid))
goto fail;
break;
case ACL_GROUP:
- acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry);
+ acl_e->e_gid = make_kgid(userns,
+ le32_to_cpu(entry->e_id));
if (!gid_valid(acl_e->e_gid))
goto fail;
break;
@@ -894,181 +825,6 @@ fail:
posix_acl_release(acl);
return ERR_PTR(-EINVAL);
}
-
-/**
- * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and
- * filesystem idmapping
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_USER entry in POSIX ACL uapi format
- *
- * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id
- * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through
- * KUIDT_INIT() and then map it according to the idmapped mount. The resulting
- * kuid_t is the value which the filesystem can map up into a raw backing store
- * id in the filesystem's idmapping.
- *
- * This is used in vfs_set_acl_prepare() to generate the proper VFS
- * representation of POSIX ACLs with ACL_USER entries during setxattr().
- *
- * Return: A kuid in @fs_userns for the uid stored in @e.
- */
-static inline kuid_t
-vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id));
- return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid));
-}
-
-/**
- * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and
- * filesystem idmapping
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_GROUP entry in POSIX ACL uapi format
- *
- * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id
- * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through
- * KGIDT_INIT() and then map it according to the idmapped mount. The resulting
- * kgid_t is the value which the filesystem can map up into a raw backing store
- * id in the filesystem's idmapping.
- *
- * This is used in vfs_set_acl_prepare() to generate the proper VFS
- * representation of POSIX ACLs with ACL_GROUP entries during setxattr().
- *
- * Return: A kgid in @fs_userns for the gid stored in @e.
- */
-static inline kgid_t
-vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id));
- return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid));
-}
-
-/**
- * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking
- * mount and filesystem idmappings into account
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- * @value: the uapi representation of POSIX ACLs
- * @size: the size of @void
- *
- * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be
- * mapped according to the relevant mount- and filesystem idmapping. It is
- * important that the ACL_{GROUP,USER} entries in struct posix_acl will be
- * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem
- * idmapping. This is crucial since the resulting struct posix_acl might be
- * cached filesystem wide. The vfs_set_acl_prepare() function will take care to
- * perform all necessary idmappings.
- *
- * Note, that since basically forever the {g,u}id values encoded as
- * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain
- * values that have been mapped according to the caller's idmapping. In other
- * words, POSIX ACLs passed in uapi format as @value during setxattr() contain
- * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have
- * been stored as k{g,u}id_t.
- *
- * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by
- * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t
- * through from_vfs{g,u}id() to account for any idmapped mounts. The
- * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the
- * correct k{g,u}id_t.
- *
- * The filesystem will then receive the POSIX ACLs ready to be cached
- * filesystem wide and ready to be written to the backing store taking the
- * filesystem's idmapping into account.
- *
- * Return: Allocated struct posix_acl on success, NULL for a valid header but
- * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
- */
-struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const void *value, size_t size)
-{
- return make_posix_acl(mnt_userns, fs_userns, value, size,
- vfs_set_acl_prepare_kuid,
- vfs_set_acl_prepare_kgid);
-}
-EXPORT_SYMBOL(vfs_set_acl_prepare);
-
-/**
- * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping
- * @mnt_userns: unused
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_USER entry in POSIX ACL uapi format
- *
- * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping.
- * This is used in posix_acl_from_xattr() to generate the proper VFS
- * representation of POSIX ACLs with ACL_USER entries.
- *
- * Return: A kuid in @fs_userns for the uid stored in @e.
- */
-static inline kuid_t
-posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- return make_kuid(fs_userns, le32_to_cpu(e->e_id));
-}
-
-/**
- * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping
- * @mnt_userns: unused
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_GROUP entry in POSIX ACL uapi format
- *
- * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping.
- * This is used in posix_acl_from_xattr() to generate the proper VFS
- * representation of POSIX ACLs with ACL_GROUP entries.
- *
- * Return: A kgid in @fs_userns for the gid stored in @e.
- */
-static inline kgid_t
-posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- return make_kgid(fs_userns, le32_to_cpu(e->e_id));
-}
-
-/**
- * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
- * @fs_userns: the filesystem's idmapping
- * @value: the uapi representation of POSIX ACLs
- * @size: the size of @void
- *
- * Filesystems that store POSIX ACLs in the unaltered uapi format should use
- * posix_acl_from_xattr() when reading them from the backing store and
- * converting them into the struct posix_acl VFS format. The helper is
- * specifically intended to be called from the ->get_acl() inode operation.
- *
- * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
- * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The
- * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the
- * correct k{g,u}id_t. The returned struct posix_acl can be cached.
- *
- * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
- * If it did it calling is from the ->get_acl() inode operation would return
- * POSIX ACLs mapped according to an idmapped mount which would mean that the
- * value couldn't be cached for the filesystem. Idmapped mounts are taken into
- * account on the fly during permission checking or right at the VFS -
- * userspace boundary before reporting them to the user.
- *
- * Return: Allocated struct posix_acl on success, NULL for a valid header but
- * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
- */
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *fs_userns,
- const void *value, size_t size)
-{
- return make_posix_acl(&init_user_ns, fs_userns, value, size,
- posix_acl_from_xattr_kuid,
- posix_acl_from_xattr_kgid);
-}
EXPORT_SYMBOL (posix_acl_from_xattr);
/*
@@ -1113,35 +869,76 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
}
EXPORT_SYMBOL (posix_acl_to_xattr);
-static int
-posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *unused, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- struct posix_acl *acl;
- int error;
+/**
+ * vfs_posix_acl_to_xattr - convert from kernel to userspace representation
+ * @idmap: idmap of the mount
+ * @inode: inode the posix acls are set on
+ * @acl: the posix acls as represented by the vfs
+ * @buffer: the buffer into which to convert @acl
+ * @size: size of @buffer
+ *
+ * This converts @acl from the VFS representation in the filesystem idmapping
+ * to the uapi form reportable to userspace. And mount and caller idmappings
+ * are handled appropriately.
+ *
+ * Return: On success, the size of the stored uapi posix acls, on error a
+ * negative errno.
+ */
+static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
+ struct inode *inode,
+ const struct posix_acl *acl, void *buffer,
+ size_t size)
- if (!IS_POSIXACL(inode))
- return -EOPNOTSUPP;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
+{
+ struct posix_acl_xattr_header *ext_acl = buffer;
+ struct posix_acl_xattr_entry *ext_entry;
+ struct user_namespace *fs_userns, *caller_userns;
+ struct user_namespace *mnt_userns;
+ ssize_t real_size, n;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
- acl = get_acl(inode, handler->flags);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
+ real_size = posix_acl_xattr_size(acl->a_count);
+ if (!buffer)
+ return real_size;
+ if (real_size > size)
+ return -ERANGE;
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- posix_acl_release(acl);
+ ext_entry = (void *)(ext_acl + 1);
+ ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- return error;
+ fs_userns = i_user_ns(inode);
+ caller_userns = current_user_ns();
+ mnt_userns = mnt_idmap_owner(idmap);
+ for (n=0; n < acl->a_count; n++, ext_entry++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch(acl_e->e_tag) {
+ case ACL_USER:
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid);
+ ext_entry->e_id = cpu_to_le32(from_kuid(
+ caller_userns, vfsuid_into_kuid(vfsuid)));
+ break;
+ case ACL_GROUP:
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid);
+ ext_entry->e_id = cpu_to_le32(from_kgid(
+ caller_userns, vfsgid_into_kgid(vfsgid)));
+ break;
+ default:
+ ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
+ }
+ return real_size;
}
int
-set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
+set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
int type, struct posix_acl *acl)
{
+ struct inode *inode = d_inode(dentry);
+
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
@@ -1157,40 +954,10 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- return inode->i_op->set_acl(mnt_userns, inode, acl, type);
+ return inode->i_op->set_acl(mnt_userns, dentry, acl, type);
}
EXPORT_SYMBOL(set_posix_acl);
-static int
-posix_acl_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *unused, struct inode *inode,
- const char *name, const void *value, size_t size,
- int flags)
-{
- struct posix_acl *acl = NULL;
- int ret;
-
- if (value) {
- /*
- * By the time we end up here the {g,u}ids stored in
- * ACL_{GROUP,USER} have already been mapped according to the
- * caller's idmapping. The vfs_set_acl_prepare() helper will
- * recover them and take idmapped mounts into account. The
- * filesystem will receive the POSIX ACLs in the correct
- * format ready to be cached or written to the backing store
- * taking the filesystem idmapping into account.
- */
- acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
- value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- ret = set_posix_acl(mnt_userns, inode, handler->flags, acl);
- posix_acl_release(acl);
- return ret;
-}
-
static bool
posix_acl_xattr_list(struct dentry *dentry)
{
@@ -1201,8 +968,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
- .get = posix_acl_xattr_get,
- .set = posix_acl_xattr_set,
};
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
@@ -1210,15 +975,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
- .get = posix_acl_xattr_get,
- .set = posix_acl_xattr_set,
};
EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
-int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
+ struct inode *inode = d_inode(dentry);
if (type == ACL_TYPE_ACCESS) {
error = posix_acl_update_mode(mnt_userns, inode,
@@ -1252,3 +1016,252 @@ int simple_acl_create(struct inode *dir, struct inode *inode)
posix_acl_release(acl);
return 0;
}
+
+static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ struct posix_acl *acl)
+{
+ for (int n = 0; n < acl->a_count; n++) {
+ struct posix_acl_entry *acl_e = &acl->a_entries[n];
+
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns,
+ VFSUIDT_INIT(acl_e->e_uid));
+ break;
+ case ACL_GROUP:
+ acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns,
+ VFSGIDT_INIT(acl_e->e_gid));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * vfs_set_acl - set posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to set the posix acls
+ * @acl_name: the name of the posix acl
+ * @kacl: the posix acls in the appropriate VFS format
+ *
+ * This function sets @kacl. The caller must all posix_acl_release() on @kacl
+ * afterwards.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const char *acl_name, struct posix_acl *kacl)
+{
+ int acl_type;
+ int error;
+ struct inode *inode = d_inode(dentry);
+ struct inode *delegated_inode = NULL;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return -EINVAL;
+
+ if (kacl) {
+ /*
+ * If we're on an idmapped mount translate from mount specific
+ * vfs{g,u}id_t into global filesystem k{g,u}id_t.
+ * Afterwards we can cache the POSIX ACLs filesystem wide and -
+ * if this is a filesystem with a backing store - ultimately
+ * translate them to backing store values.
+ */
+ error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl);
+ if (error)
+ return error;
+ }
+
+retry_deleg:
+ inode_lock(inode);
+
+ /*
+ * We only care about restrictions the inode struct itself places upon
+ * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */
+ error = may_write_xattr(mnt_userns, inode);
+ if (error)
+ goto out_inode_unlock;
+
+ error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl);
+ if (error)
+ goto out_inode_unlock;
+
+ error = try_break_deleg(inode, &delegated_inode);
+ if (error)
+ goto out_inode_unlock;
+
+ if (inode->i_opflags & IOP_XATTR)
+ error = set_posix_acl(mnt_userns, dentry, acl_type, kacl);
+ else if (unlikely(is_bad_inode(inode)))
+ error = -EIO;
+ else
+ error = -EOPNOTSUPP;
+ if (!error) {
+ fsnotify_xattr(dentry);
+ evm_inode_post_set_acl(dentry, acl_name, kacl);
+ }
+
+out_inode_unlock:
+ inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_set_acl);
+
+/**
+ * vfs_get_acl - get posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function retrieves @kacl from the filesystem. The caller must all
+ * posix_acl_release() on @kacl.
+ *
+ * Return: On success POSIX ACLs in VFS format, on error negative errno.
+ */
+struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *acl_name)
+{
+ struct inode *inode = d_inode(dentry);
+ struct posix_acl *acl;
+ int acl_type, error;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * The VFS has no restrictions on reading POSIX ACLs so calling
+ * something like xattr_permission() isn't needed. Only LSMs get a say.
+ */
+ error = security_inode_get_acl(mnt_userns, dentry, acl_name);
+ if (error)
+ return ERR_PTR(error);
+
+ if (!IS_POSIXACL(inode))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (S_ISLNK(inode->i_mode))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ acl = __get_acl(mnt_userns, dentry, inode, acl_type);
+ if (IS_ERR(acl))
+ return acl;
+ if (!acl)
+ return ERR_PTR(-ENODATA);
+
+ return acl;
+}
+EXPORT_SYMBOL_GPL(vfs_get_acl);
+
+/**
+ * vfs_remove_acl - remove posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function removes posix acls.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const char *acl_name)
+{
+ int acl_type;
+ int error;
+ struct inode *inode = d_inode(dentry);
+ struct inode *delegated_inode = NULL;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return -EINVAL;
+
+retry_deleg:
+ inode_lock(inode);
+
+ /*
+ * We only care about restrictions the inode struct itself places upon
+ * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */
+ error = may_write_xattr(mnt_userns, inode);
+ if (error)
+ goto out_inode_unlock;
+
+ error = security_inode_remove_acl(mnt_userns, dentry, acl_name);
+ if (error)
+ goto out_inode_unlock;
+
+ error = try_break_deleg(inode, &delegated_inode);
+ if (error)
+ goto out_inode_unlock;
+
+ if (inode->i_opflags & IOP_XATTR)
+ error = set_posix_acl(mnt_userns, dentry, acl_type, NULL);
+ else if (unlikely(is_bad_inode(inode)))
+ error = -EIO;
+ else
+ error = -EOPNOTSUPP;
+ if (!error) {
+ fsnotify_xattr(dentry);
+ evm_inode_post_remove_acl(mnt_userns, dentry, acl_name);
+ }
+
+out_inode_unlock:
+ inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_remove_acl);
+
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, const void *kvalue, size_t size)
+{
+ int error;
+ struct posix_acl *acl = NULL;
+
+ if (size) {
+ /*
+ * Note that posix_acl_from_xattr() uses GFP_NOFS when it
+ * probably doesn't need to here.
+ */
+ acl = posix_acl_from_xattr(current_user_ns(), kvalue, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+
+ error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl);
+ posix_acl_release(acl);
+ return error;
+}
+
+ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, void *kvalue, size_t size)
+{
+ ssize_t error;
+ struct posix_acl *acl;
+
+ acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry),
+ acl, kvalue, size);
+ posix_acl_release(acl);
+ return error;
+}
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index fa762c5fbcb2..91fe1597af7b 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
static int cmdline_proc_show(struct seq_file *m, void *v)
{
@@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v)
static int __init proc_cmdline_init(void)
{
- proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ pde->size = saved_command_line_len + 1;
return 0;
}
fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index dfe6ce3505ce..e0758fe7936d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v)
if (con->device) {
const struct tty_driver *driver;
int index;
+
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
+ console_lock();
driver = con->device(con, &index);
+ console_unlock();
+
if (driver) {
dev = MKDEV(driver->major, driver->minor_start);
dev += index;
@@ -63,7 +72,12 @@ static void *c_start(struct seq_file *m, loff_t *pos)
struct console *con;
loff_t off = 0;
- console_lock();
+ /*
+ * Hold the console_list_lock to guarantee safe traversal of the
+ * console list. SRCU cannot be used because there is no
+ * place to store the SRCU cookie.
+ */
+ console_list_lock();
for_each_console(con)
if (off++ == *pos)
break;
@@ -74,13 +88,14 @@ static void *c_start(struct seq_file *m, loff_t *pos)
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
struct console *con = v;
+
++*pos;
- return con->next;
+ return hlist_entry_safe(con->node.next, struct console, node);
}
static void c_stop(struct seq_file *m, void *v)
{
- console_unlock();
+ console_list_unlock();
}
static const struct seq_operations consoles_op = {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 913bef0d2a36..fc46d6fe080c 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -7,6 +7,7 @@
#include <linux/namei.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
+#include <linux/bitmap.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -279,6 +280,30 @@ out:
return 0;
}
+static int proc_readfd_count(struct inode *inode, loff_t *count)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ *count = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
static int proc_readfd(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
@@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns,
return rv;
}
+static int proc_fd_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ int rv = 0;
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ /* If it's a directory, put the number of open fds there */
+ if (S_ISDIR(inode->i_mode)) {
+ rv = proc_readfd_count(inode, &stat->size);
+ if (rv < 0)
+ return rv;
+ }
+
+ return rv;
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index f2aa86c421f2..09a81e4b1273 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos, false);
}
@@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos,
cc_platform_has(CC_ATTR_MEM_ENCRYPT));
@@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
offset = (loff_t) index << PAGE_SHIFT;
kvec.iov_base = page_address(page);
kvec.iov_len = PAGE_SIZE;
- iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE);
rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
@@ -1567,6 +1567,7 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
+ elfcorehdr_free(elfcorehdr_addr);
pr_warn("Kdump: vmcore not initialized\n");
return rc;
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0c034ea39954..cbc0b468c1ab 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -89,6 +89,11 @@ static char *compress =
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
+/* How much of the kernel log to snapshot */
+unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
+module_param(kmsg_bytes, ulong, 0444);
+MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
+
/* Compression parameters */
static struct crypto_comp *tfm;
@@ -100,9 +105,6 @@ struct pstore_zbackend {
static char *big_oops_buf;
static size_t big_oops_buf_sz;
-/* How much of the console log to snapshot */
-unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
-
void pstore_set_kmsg_bytes(int bytes)
{
kmsg_bytes = bytes;
@@ -391,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
const char *why;
unsigned int part = 1;
unsigned long flags = 0;
+ int saved_ret = 0;
int ret;
why = kmsg_dump_reason_str(reason);
@@ -461,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper,
if (ret == 0 && reason == KMSG_DUMP_OOPS) {
pstore_new_entry = 1;
pstore_timer_kick();
+ } else {
+ /* Preserve only the first non-zero returned value. */
+ if (!saved_ret)
+ saved_ret = ret;
}
total += record.size;
part++;
}
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+ if (saved_ret) {
+ pr_err_once("backend (%s) writing error (%d)\n", psinfo->name,
+ saved_ret);
+ }
}
static struct kmsg_dumper pstore_dumper = {
@@ -562,8 +574,9 @@ out:
int pstore_register(struct pstore_info *psi)
{
if (backend && strcmp(backend, psi->name)) {
- pr_warn("ignoring unexpected backend '%s'\n", psi->name);
- return -EPERM;
+ pr_warn("backend '%s' already in use: ignoring '%s'\n",
+ backend, psi->name);
+ return -EBUSY;
}
/* Sanity check flags. */
@@ -662,6 +675,8 @@ void pstore_unregister(struct pstore_info *psi)
psinfo = NULL;
kfree(backend);
backend = NULL;
+
+ pr_info("Unregistered %s as persistent store backend\n", psi->name);
mutex_unlock(&psinfo_lock);
}
EXPORT_SYMBOL_GPL(pstore_unregister);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fefe3d391d3a..9a5052431fd3 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -18,10 +18,11 @@
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/compiler.h>
-#include <linux/pstore_ram.h>
#include <linux/of.h>
#include <linux/of_address.h>
+
#include "internal.h"
+#include "ram_internal.h"
#define RAMOOPS_KERNMSG_HDR "===="
#define MIN_MEM_SIZE 4096UL
@@ -451,20 +452,28 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
+ /* Free pmsg PRZ */
+ persistent_ram_free(&cxt->mprz);
+
+ /* Free console PRZ */
+ persistent_ram_free(&cxt->cprz);
+
/* Free dump PRZs */
if (cxt->dprzs) {
for (i = 0; i < cxt->max_dump_cnt; i++)
- persistent_ram_free(cxt->dprzs[i]);
+ persistent_ram_free(&cxt->dprzs[i]);
kfree(cxt->dprzs);
+ cxt->dprzs = NULL;
cxt->max_dump_cnt = 0;
}
/* Free ftrace PRZs */
if (cxt->fprzs) {
for (i = 0; i < cxt->max_ftrace_cnt; i++)
- persistent_ram_free(cxt->fprzs[i]);
+ persistent_ram_free(&cxt->fprzs[i]);
kfree(cxt->fprzs);
+ cxt->fprzs = NULL;
cxt->max_ftrace_cnt = 0;
}
}
@@ -548,9 +557,10 @@ static int ramoops_init_przs(const char *name,
while (i > 0) {
i--;
- persistent_ram_free(prz_ar[i]);
+ persistent_ram_free(&prz_ar[i]);
}
kfree(prz_ar);
+ prz_ar = NULL;
goto fail;
}
*paddr += zone_sz;
@@ -735,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev)
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -742,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev)
!pdata->ftrace_size && !pdata->pmsg_size)) {
pr_err("The memory size and the record/console size must be "
"non-zero\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -772,12 +784,17 @@ static int ramoops_probe(struct platform_device *pdev)
dump_mem_sz, cxt->record_size,
&cxt->max_dump_cnt, 0, 0);
if (err)
- goto fail_out;
+ goto fail_init;
err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr,
cxt->console_size, 0);
if (err)
- goto fail_init_cprz;
+ goto fail_init;
+
+ err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
+ cxt->pmsg_size, 0);
+ if (err)
+ goto fail_init;
cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
? nr_cpu_ids
@@ -788,12 +805,7 @@ static int ramoops_probe(struct platform_device *pdev)
(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
? PRZ_FLAG_NO_LOCK : 0);
if (err)
- goto fail_init_fprz;
-
- err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
- cxt->pmsg_size, 0);
- if (err)
- goto fail_init_mprz;
+ goto fail_init;
cxt->pstore.data = cxt;
/*
@@ -857,11 +869,7 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
- persistent_ram_free(cxt->mprz);
-fail_init_mprz:
-fail_init_fprz:
- persistent_ram_free(cxt->cprz);
-fail_init_cprz:
+fail_init:
ramoops_free_przs(cxt);
fail_out:
return err;
@@ -876,8 +884,6 @@ static int ramoops_remove(struct platform_device *pdev)
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
- persistent_ram_free(cxt->mprz);
- persistent_ram_free(cxt->cprz);
ramoops_free_przs(cxt);
return 0;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index a89e33719fcf..966191d3a5ba 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -13,13 +13,14 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/memblock.h>
-#include <linux/pstore_ram.h>
#include <linux/rslib.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <asm/page.h>
+#include "ram_internal.h"
+
/**
* struct persistent_ram_buffer - persistent circular RAM buffer
*
@@ -439,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
phys_addr_t addr = page_start + i * PAGE_SIZE;
pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
}
- vaddr = vmap(pages, page_count, VM_MAP, prot);
+ /*
+ * VM_IOREMAP used here to bypass this region during vread()
+ * and kmap_atomic() (i.e. kcore) to avoid __va() failures.
+ */
+ vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot);
kfree(pages);
/*
@@ -543,8 +548,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
return 0;
}
-void persistent_ram_free(struct persistent_ram_zone *prz)
+void persistent_ram_free(struct persistent_ram_zone **_prz)
{
+ struct persistent_ram_zone *prz;
+
+ if (!_prz)
+ return;
+
+ prz = *_prz;
if (!prz)
return;
@@ -568,6 +579,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
persistent_ram_free_old(prz);
kfree(prz->label);
kfree(prz);
+ *_prz = NULL;
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
@@ -604,6 +616,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
return prz;
err:
- persistent_ram_free(prz);
+ persistent_ram_free(&prz);
return ERR_PTR(ret);
}
diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h
new file mode 100644
index 000000000000..5f694698351f
--- /dev/null
+++ b/fs/pstore/ram_internal.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright (C) 2011 Kees Cook <keescook@chromium.org>
+ * Copyright (C) 2011 Google, Inc.
+ */
+
+#include <linux/pstore_ram.h>
+
+/*
+ * Choose whether access to the RAM zone requires locking or not. If a zone
+ * can be written to from different CPUs like with ftrace for example, then
+ * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required.
+ */
+#define PRZ_FLAG_NO_LOCK BIT(0)
+/*
+ * If a PRZ should only have a single-boot lifetime, this marks it as
+ * getting wiped after its contents get copied out after boot.
+ */
+#define PRZ_FLAG_ZAP_OLD BIT(1)
+
+/**
+ * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ)
+ * used as a pstore backend
+ *
+ * @paddr: physical address of the mapped RAM area
+ * @size: size of mapping
+ * @label: unique name of this PRZ
+ * @type: frontend type for this PRZ
+ * @flags: holds PRZ_FLAGS_* bits
+ *
+ * @buffer_lock:
+ * locks access to @buffer "size" bytes and "start" offset
+ * @buffer:
+ * pointer to actual RAM area managed by this PRZ
+ * @buffer_size:
+ * bytes in @buffer->data (not including any trailing ECC bytes)
+ *
+ * @par_buffer:
+ * pointer into @buffer->data containing ECC bytes for @buffer->data
+ * @par_header:
+ * pointer into @buffer->data containing ECC bytes for @buffer header
+ * (i.e. all fields up to @data)
+ * @rs_decoder:
+ * RSLIB instance for doing ECC calculations
+ * @corrected_bytes:
+ * ECC corrected bytes accounting since boot
+ * @bad_blocks:
+ * ECC uncorrectable bytes accounting since boot
+ * @ecc_info:
+ * ECC configuration details
+ *
+ * @old_log:
+ * saved copy of @buffer->data prior to most recent wipe
+ * @old_log_size:
+ * bytes contained in @old_log
+ *
+ */
+struct persistent_ram_zone {
+ phys_addr_t paddr;
+ size_t size;
+ void *vaddr;
+ char *label;
+ enum pstore_type_id type;
+ u32 flags;
+
+ raw_spinlock_t buffer_lock;
+ struct persistent_ram_buffer *buffer;
+ size_t buffer_size;
+
+ char *par_buffer;
+ char *par_header;
+ struct rs_control *rs_decoder;
+ int corrected_bytes;
+ int bad_blocks;
+ struct persistent_ram_ecc_info ecc_info;
+
+ char *old_log;
+ size_t old_log_size;
+};
+
+struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
+ u32 sig, struct persistent_ram_ecc_info *ecc_info,
+ unsigned int memtype, u32 flags, char *label);
+void persistent_ram_free(struct persistent_ram_zone **_prz);
+void persistent_ram_zap(struct persistent_ram_zone *prz);
+
+int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
+ unsigned int count);
+int persistent_ram_write_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int count);
+
+void persistent_ram_save_old(struct persistent_ram_zone *prz);
+size_t persistent_ram_old_size(struct persistent_ram_zone *prz);
+void *persistent_ram_old(struct persistent_ram_zone *prz);
+void persistent_ram_free_old(struct persistent_ram_zone *prz);
+ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
+ char *str, size_t len);
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 017d0d4ad329..2770746bb7aa 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt,
/* avoid destroying old data, allocate a new one */
len = zone->buffer_size + sizeof(*zone->buffer);
zone->oldbuf = zone->buffer;
- zone->buffer = kzalloc(len, GFP_KERNEL);
+ zone->buffer = kzalloc(len, GFP_ATOMIC);
if (!zone->buffer) {
zone->buffer = zone->oldbuf;
return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0427b44bfee5..f27faf5db554 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type)
struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
+ if (is_bad_inode(inode))
+ return -EUCLEAN;
if (!S_ISREG(inode->i_mode))
return -EACCES;
if (IS_RDONLY(inode))
diff --git a/fs/read_write.c b/fs/read_write.c
index 24b9668d6377..7a2ff6157eda 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, READ, buf, len);
+ iov_iter_ubuf(&iter, ITER_DEST, buf, len);
ret = call_read_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos ? *pos : 0;
- iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
ret = file->f_op->read_iter(&kiocb, &iter);
if (ret > 0) {
if (pos)
@@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len);
+ iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
ret = call_write_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -533,7 +533,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
.iov_len = min_t(size_t, count, MAX_RW_COUNT),
};
struct iov_iter iter;
- iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
return __kernel_write_iter(file, &iter, pos);
}
/*
@@ -911,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
struct iov_iter iter;
ssize_t ret;
- ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret >= 0) {
ret = do_iter_read(file, &iter, pos, flags);
kfree(iov);
@@ -928,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
struct iov_iter iter;
ssize_t ret;
- ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret >= 0) {
file_start_write(file);
ret = do_iter_write(file, &iter, pos, flags);
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index d9052b8ce6dd..29c503a06db4 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size)
#ifdef CONFIG_REISERFS_FS_POSIX_ACL
struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
-int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
-int reiserfs_acl_chmod(struct inode *inode);
+int reiserfs_acl_chmod(struct dentry *dentry);
int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
struct inode *dir, struct dentry *dentry,
struct inode *inode);
@@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir);
#define reiserfs_get_acl NULL
#define reiserfs_set_acl NULL
-static inline int reiserfs_acl_chmod(struct inode *inode)
+static inline int reiserfs_acl_chmod(struct dentry *dentry)
{
return 0;
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6e228bfbe7ef..467d13da198f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
.fileattr_get = reiserfs_fileattr_get,
.fileattr_set = reiserfs_fileattr_set,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index b9580a6515ee..c7d1fa526dea 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (!error && reiserfs_posixacl(inode->i_sb)) {
if (attr->ia_valid & ATTR_MODE)
- error = reiserfs_acl_chmod(inode);
+ error = reiserfs_acl_chmod(dentry);
}
out:
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 3d7a35d6a18b..4d428e8704bc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1659,7 +1659,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
.fileattr_get = reiserfs_fileattr_get,
.fileattr_set = reiserfs_fileattr_set,
@@ -1683,6 +1683,6 @@ const struct inode_operations reiserfs_special_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
};
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d6fcddc46f5b..93fe414fed18 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
int
-reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error, error2;
@@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
size_t jcreate_blocks;
int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
/*
@@ -371,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode)
if (IS_PRIVATE(inode))
return 0;
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (acl && !IS_ERR(acl)) {
int size = reiserfs_acl_size(acl->a_count);
@@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode)
/*
* Called under i_mutex
*/
-int reiserfs_acl_chmod(struct inode *inode)
+int reiserfs_acl_chmod(struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
+
if (IS_PRIVATE(inode))
return 0;
if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
!reiserfs_posixacl(inode->i_sb))
return 0;
- return posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
}
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 654912d06862..290743c8d226 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -429,7 +429,7 @@ static bool allow_file_dedupe(struct file *file)
return true;
if (file->f_mode & FMODE_WRITE)
return true;
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid()))
return true;
if (!inode_permission(mnt_userns, inode, MAY_WRITE))
return true;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 9456a2032224..f5fdaf3b1572 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
ssize_t ret;
init_sync_kiocb(&kiocb, file);
- iov_iter_init(&iter, READ, &iov, 1, size);
+ iov_iter_init(&iter, ITER_DEST, &iov, 1, size);
kiocb.ki_pos = *ppos;
ret = seq_read_iter(&kiocb, &iter);
diff --git a/fs/splice.c b/fs/splice.c
index 0878b852b355..5969b7a1d353 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
int ret;
- iov_iter_pipe(&to, READ, pipe, len);
+ iov_iter_pipe(&to, ITER_DEST, pipe, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
n++;
}
- iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
+ iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
@@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type)
if (!f.file)
return -EBADF;
if (f.file->f_mode & FMODE_WRITE) {
- *type = WRITE;
+ *type = ITER_SOURCE;
} else if (f.file->f_mode & FMODE_READ) {
- *type = READ;
+ *type = ITER_DEST;
} else {
fdput(f);
return -EBADF;
@@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
if (!iov_iter_count(&iter))
error = 0;
- else if (iov_iter_rw(&iter) == WRITE)
+ else if (type == ITER_SOURCE)
error = vmsplice_to_pipe(f.file, &iter, flags);
else
error = vmsplice_to_user(f.file, &iter, flags);
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 916e78fabcaa..60fc98bdf421 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -54,9 +54,35 @@ config SQUASHFS_FILE_DIRECT
endchoice
+config SQUASHFS_DECOMP_SINGLE
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_DECOMP_MULTI
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ bool "Select the parallel decompression mode during mount"
+ depends on SQUASHFS
+ default n
+ select SQUASHFS_DECOMP_SINGLE
+ select SQUASHFS_DECOMP_MULTI
+ select SQUASHFS_DECOMP_MULTI_PERCPU
+ select SQUASHFS_MOUNT_DECOMP_THREADS
+ help
+ Compile all parallel decompression modes and specify the
+ decompression mode by setting "threads=" during mount.
+ default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE
+
choice
- prompt "Decompressor parallelisation options"
+ prompt "Select decompression parallel mode at compile time"
depends on SQUASHFS
+ depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT
help
Squashfs now supports three parallelisation options for
decompression. Each one exhibits various trade-offs between
@@ -64,15 +90,17 @@ choice
If in doubt, select "Single threaded compression"
-config SQUASHFS_DECOMP_SINGLE
+config SQUASHFS_COMPILE_DECOMP_SINGLE
bool "Single threaded compression"
+ select SQUASHFS_DECOMP_SINGLE
help
Traditionally Squashfs has used single-threaded decompression.
Only one block (data or metadata) can be decompressed at any
one time. This limits CPU and memory usage to a minimum.
-config SQUASHFS_DECOMP_MULTI
+config SQUASHFS_COMPILE_DECOMP_MULTI
bool "Use multiple decompressors for parallel I/O"
+ select SQUASHFS_DECOMP_MULTI
help
By default Squashfs uses a single decompressor but it gives
poor performance on parallel I/O workloads when using multiple CPU
@@ -85,8 +113,9 @@ config SQUASHFS_DECOMP_MULTI
decompressors per core. It dynamically allocates decompressors
on a demand basis.
-config SQUASHFS_DECOMP_MULTI_PERCPU
+config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU
bool "Use percpu multiple decompressors for parallel I/O"
+ select SQUASHFS_DECOMP_MULTI_PERCPU
help
By default Squashfs uses a single decompressor but it gives
poor performance on parallel I/O workloads when using multiple CPU
@@ -95,9 +124,21 @@ config SQUASHFS_DECOMP_MULTI_PERCPU
This decompressor implementation uses a maximum of one
decompressor per core. It uses percpu variables to ensure
decompression is load-balanced across the cores.
-
endchoice
+config SQUASHFS_MOUNT_DECOMP_THREADS
+ bool "Add the mount parameter 'threads=' for squashfs"
+ depends on SQUASHFS
+ depends on SQUASHFS_DECOMP_MULTI
+ default n
+ help
+ Use threads= to set the decompression parallel mode and the number of threads.
+ If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y
+ threads=<single|multi|percpu|1|2|3|...>
+ else
+ threads=<2|3|...>
+ The upper limit is num_online_cpus() * 2.
+
config SQUASHFS_XATTR
bool "Squashfs XATTR support"
depends on SQUASHFS
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 833aca92301f..bed3bb8b27fa 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
res = -EIO;
goto out_free_bio;
}
- res = squashfs_decompress(msblk, bio, offset, length, output);
+ res = msblk->thread_ops->decompress(msblk, bio, offset, length, output);
} else {
res = copy_bio_to_actor(bio, output, offset, length);
}
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index d57bef91ab08..8893cb9b4198 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
if (IS_ERR(comp_opts))
return comp_opts;
- stream = squashfs_decompressor_create(msblk, comp_opts);
+ stream = msblk->thread_ops->create(msblk, comp_opts);
if (IS_ERR(stream))
kfree(comp_opts);
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index db9f12a3ea05..416c53eedbd1 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -29,12 +29,11 @@
#define MAX_DECOMPRESSOR (num_online_cpus() * 2)
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return MAX_DECOMPRESSOR;
}
-
struct squashfs_stream {
void *comp_opts;
struct list_head strm_list;
@@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm,
wake_up(&stream->wait);
}
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -103,7 +102,7 @@ out:
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream *stream = msblk->stream;
if (stream) {
@@ -145,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
* If there is no available decomp and already full,
* let's wait for releasing decomp from other users.
*/
- if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+ if (stream->avail_decomp >= msblk->max_thread_num)
goto wait;
/* Let's allocate new decomp */
@@ -161,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
}
stream->avail_decomp++;
- WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+ WARN_ON(stream->avail_decomp > msblk->max_thread_num);
mutex_unlock(&stream->mutex);
break;
@@ -180,7 +179,7 @@ wait:
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
@@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
msblk->decompressor->name);
return res;
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index b881b9283b7f..1dfadf76ed9a 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -25,7 +25,7 @@ struct squashfs_stream {
local_lock_t lock;
};
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -59,7 +59,7 @@ out:
return ERR_PTR(err);
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream __percpu *percpu =
(struct squashfs_stream __percpu *) msblk->stream;
@@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length, struct squashfs_page_actor *output)
{
struct squashfs_stream *stream;
+ struct squashfs_stream __percpu *percpu =
+ (struct squashfs_stream __percpu *) msblk->stream;
int res;
- local_lock(&msblk->stream->lock);
- stream = this_cpu_ptr(msblk->stream);
+ local_lock(&percpu->lock);
+ stream = this_cpu_ptr(percpu);
res = msblk->decompressor->decompress(msblk, stream->stream, bio,
offset, length, output);
- local_unlock(&msblk->stream->lock);
+ local_unlock(&percpu->lock);
if (res < 0)
ERROR("%s decompression failed, data probably corrupt\n",
@@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
return res;
}
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return num_possible_cpus();
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
index 4eb3d083d45e..6f161887710b 100644
--- a/fs/squashfs/decompressor_single.c
+++ b/fs/squashfs/decompressor_single.c
@@ -24,7 +24,7 @@ struct squashfs_stream {
struct mutex mutex;
};
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -49,7 +49,7 @@ out:
return ERR_PTR(err);
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream *stream = msblk->stream;
@@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
@@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
return res;
}
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return 1;
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 9783e01c8100..a6164fdf9435 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
/* decompressor_xxx.c */
-extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
-extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
-extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *,
- int, int, struct squashfs_page_actor *);
-extern int squashfs_max_decompressors(void);
+
+struct squashfs_decompressor_thread_ops {
+ void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts);
+ void (*destroy)(struct squashfs_sb_info *msblk);
+ int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio,
+ int offset, int length, struct squashfs_page_actor *output);
+ int (*max_decompressors)(void);
+};
+
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single;
+#endif
+#ifdef CONFIG_SQUASHFS_DECOMP_MULTI
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi;
+#endif
+#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu;
+#endif
/* export.c */
extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 1e90c2575f9b..659082e9e51d 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -53,7 +53,7 @@ struct squashfs_sb_info {
__le64 *xattr_id_table;
struct mutex meta_index_mutex;
struct meta_index *meta_index;
- struct squashfs_stream *stream;
+ void *stream;
__le64 *inode_lookup_table;
u64 inode_table;
u64 directory_table;
@@ -66,5 +66,7 @@ struct squashfs_sb_info {
int xattr_ids;
unsigned int ids;
bool panic_on_errors;
+ const struct squashfs_decompressor_thread_ops *thread_ops;
+ int max_thread_num;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 32565dafa7f3..e090fae48e68 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -47,10 +47,13 @@ enum Opt_errors {
enum squashfs_param {
Opt_errors,
+ Opt_threads,
};
struct squashfs_mount_opts {
enum Opt_errors errors;
+ const struct squashfs_decompressor_thread_ops *thread_ops;
+ int thread_num;
};
static const struct constant_table squashfs_param_errors[] = {
@@ -61,9 +64,66 @@ static const struct constant_table squashfs_param_errors[] = {
static const struct fs_parameter_spec squashfs_fs_parameters[] = {
fsparam_enum("errors", Opt_errors, squashfs_param_errors),
+ fsparam_string("threads", Opt_threads),
{}
};
+
+static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts)
+{
+#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ if (strcmp(str, "single") == 0) {
+ opts->thread_ops = &squashfs_decompressor_single;
+ return 0;
+ }
+ if (strcmp(str, "multi") == 0) {
+ opts->thread_ops = &squashfs_decompressor_multi;
+ return 0;
+ }
+ if (strcmp(str, "percpu") == 0) {
+ opts->thread_ops = &squashfs_decompressor_percpu;
+ return 0;
+ }
+#endif
+ return -EINVAL;
+}
+
+static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts)
+{
+#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS
+ int ret;
+ unsigned long num;
+
+ ret = kstrtoul(str, 0, &num);
+ if (ret != 0)
+ return -EINVAL;
+ if (num > 1) {
+ opts->thread_ops = &squashfs_decompressor_multi;
+ if (num > opts->thread_ops->max_decompressors())
+ return -EINVAL;
+ opts->thread_num = (int)num;
+ return 0;
+ }
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+ if (num == 1) {
+ opts->thread_ops = &squashfs_decompressor_single;
+ opts->thread_num = 1;
+ return 0;
+ }
+#endif
+#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */
+ return -EINVAL;
+}
+
+static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts)
+{
+ int ret = squashfs_parse_param_threads_str(str, opts);
+
+ if (ret == 0)
+ return ret;
+ return squashfs_parse_param_threads_num(str, opts);
+}
+
static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct squashfs_mount_opts *opts = fc->fs_private;
@@ -78,6 +138,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para
case Opt_errors:
opts->errors = result.uint_32;
break;
+ case Opt_threads:
+ if (squashfs_parse_param_threads(param->string, opts) != 0)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -133,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
}
msblk = sb->s_fs_info;
+ msblk->thread_ops = opts->thread_ops;
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
@@ -168,6 +233,12 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto failed_mount;
}
+ if (opts->thread_num == 0) {
+ msblk->max_thread_num = msblk->thread_ops->max_decompressors();
+ } else {
+ msblk->max_thread_num = opts->thread_num;
+ }
+
/* Check the MAJOR & MINOR versions and lookup compression type */
msblk->decompressor = supported_squashfs_filesystem(
fc,
@@ -252,7 +323,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Allocate read_page block */
msblk->read_page = squashfs_cache_init("data",
- squashfs_max_decompressors(), msblk->block_size);
+ msblk->max_thread_num, msblk->block_size);
if (msblk->read_page == NULL) {
errorf(fc, "Failed to allocate read_page block");
goto failed_mount;
@@ -383,7 +454,7 @@ failed_mount:
squashfs_cache_delete(msblk->block_cache);
squashfs_cache_delete(msblk->fragment_cache);
squashfs_cache_delete(msblk->read_page);
- squashfs_decompressor_destroy(msblk);
+ msblk->thread_ops->destroy(msblk);
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
@@ -435,6 +506,19 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root)
else
seq_puts(s, ",errors=continue");
+#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ if (msblk->thread_ops == &squashfs_decompressor_single) {
+ seq_puts(s, ",threads=single");
+ return 0;
+ }
+ if (msblk->thread_ops == &squashfs_decompressor_percpu) {
+ seq_puts(s, ",threads=percpu");
+ return 0;
+ }
+#endif
+#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS
+ seq_printf(s, ",threads=%d", msblk->max_thread_num);
+#endif
return 0;
}
@@ -446,6 +530,16 @@ static int squashfs_init_fs_context(struct fs_context *fc)
if (!opts)
return -ENOMEM;
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+ opts->thread_ops = &squashfs_decompressor_single;
+#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI)
+ opts->thread_ops = &squashfs_decompressor_multi;
+#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU)
+ opts->thread_ops = &squashfs_decompressor_percpu;
+#else
+#error "fail: unknown squashfs decompression thread mode?"
+#endif
+ opts->thread_num = 0;
fc->fs_private = opts;
fc->ops = &squashfs_context_ops;
return 0;
@@ -478,7 +572,7 @@ static void squashfs_put_super(struct super_block *sb)
squashfs_cache_delete(sbi->block_cache);
squashfs_cache_delete(sbi->fragment_cache);
squashfs_cache_delete(sbi->read_page);
- squashfs_decompressor_destroy(sbi);
+ sbi->thread_ops->destroy(sbi);
kfree(sbi->id_table);
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
@@ -568,7 +662,7 @@ static struct file_system_type squashfs_fs_type = {
.init_fs_context = squashfs_init_fs_context,
.parameters = squashfs_fs_parameters,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("squashfs");
diff --git a/fs/stat.c b/fs/stat.c
index ef50573c72a2..d6cc74ca8486 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -44,12 +44,15 @@
void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode,
struct kstat *stat)
{
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
stat->dev = inode->i_sb->s_dev;
stat->ino = inode->i_ino;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = i_uid_into_mnt(mnt_userns, inode);
- stat->gid = i_gid_into_mnt(mnt_userns, inode);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
diff --git a/fs/super.c b/fs/super.c
index 8d39e4f11cfa..12c08cb20405 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1112,55 +1112,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc)
return 1;
}
-/**
- * vfs_get_super - Get a superblock with a search key set in s_fs_info.
- * @fc: The filesystem context holding the parameters
- * @keying: How to distinguish superblocks
- * @fill_super: Helper to initialise a new superblock
- *
- * Search for a superblock and create a new one if not found. The search
- * criterion is controlled by @keying. If the search fails, a new superblock
- * is created and @fill_super() is called to initialise it.
- *
- * @keying can take one of a number of values:
- *
- * (1) vfs_get_single_super - Only one superblock of this type may exist on the
- * system. This is typically used for special system filesystems.
- *
- * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
- * distinct keys (where the key is in s_fs_info). Searching for the same
- * key again will turn up the superblock for that key.
- *
- * (3) vfs_get_independent_super - Multiple superblocks may exist and are
- * unkeyed. Each call will get a new superblock.
- *
- * A permissions check is made by sget_fc() unless we're getting a superblock
- * for a kernel-internal mount or a submount.
- */
-int vfs_get_super(struct fs_context *fc,
- enum vfs_get_super_keying keying,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc))
+static int vfs_get_super(struct fs_context *fc, bool reconf,
+ int (*test)(struct super_block *, struct fs_context *),
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
{
- int (*test)(struct super_block *, struct fs_context *);
struct super_block *sb;
int err;
- switch (keying) {
- case vfs_get_single_super:
- case vfs_get_single_reconf_super:
- test = test_single_super;
- break;
- case vfs_get_keyed_super:
- test = test_keyed_super;
- break;
- case vfs_get_independent_super:
- test = NULL;
- break;
- default:
- BUG();
- }
-
sb = sget_fc(fc, test, set_anon_super_fc);
if (IS_ERR(sb))
return PTR_ERR(sb);
@@ -1174,7 +1133,7 @@ int vfs_get_super(struct fs_context *fc,
fc->root = dget(sb->s_root);
} else {
fc->root = dget(sb->s_root);
- if (keying == vfs_get_single_reconf_super) {
+ if (reconf) {
err = reconfigure_super(fc);
if (err < 0) {
dput(fc->root);
@@ -1190,13 +1149,12 @@ error:
deactivate_locked_super(sb);
return err;
}
-EXPORT_SYMBOL(vfs_get_super);
int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+ return vfs_get_super(fc, false, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
@@ -1204,7 +1162,7 @@ int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_single_super, fill_super);
+ return vfs_get_super(fc, false, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
@@ -1212,7 +1170,7 @@ int get_tree_single_reconf(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
+ return vfs_get_super(fc, true, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single_reconf);
@@ -1222,7 +1180,7 @@ int get_tree_keyed(struct fs_context *fc,
void *key)
{
fc->s_fs_info = key;
- return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+ return vfs_get_super(fc, false, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index d4ec9bb97de9..3b8567564e7e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
res += blocks;
direct = 1;
}
- return blocks;
+ return res;
}
int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 3f128b9fdfbb..9c9d3f0e36a4 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2467,7 +2467,7 @@ error_dump:
static inline int chance(unsigned int n, unsigned int out_of)
{
- return !!(prandom_u32_max(out_of) + 1 <= n);
+ return !!(get_random_u32_below(out_of) + 1 <= n);
}
@@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
if (chance(1, 2)) {
d->pc_delay = 1;
/* Fail within 1 minute */
- delay = prandom_u32_max(60000);
+ delay = get_random_u32_below(60000);
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
ubifs_warn(c, "failing after %lums", delay);
} else {
d->pc_delay = 2;
- delay = prandom_u32_max(10000);
+ delay = get_random_u32_below(10000);
/* Fail within 10000 operations */
d->pc_cnt_max = delay;
ubifs_warn(c, "failing after %lu calls", delay);
@@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
unsigned int from, to, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- from = prandom_u32_max(len);
+ from = get_random_u32_below(len);
/* Corruption span max to end of write unit */
to = min(len, ALIGN(from + 1, c->max_write_size));
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index cfbc31f709f4..c4d079328b92 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)
if (!dbg_is_chk_gen(c))
return 0;
- if (prandom_u32_max(4))
+ if (get_random_u32_below(4))
return 0;
for (i = 0; i < c->lsave_cnt; i++)
c->lsave[i] = c->main_first;
list_for_each_entry(lprops, &c->empty_list, list)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->freeable_list, list)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->frdi_idx_list, list)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_FREE - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
return 1;
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 01362ad5f804..a55e04822d16 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
c->ilebs[c->ileb_cnt++] = lnum;
dbg_cmt("LEB %d", lnum);
}
- if (dbg_is_chk_index(c) && !prandom_u32_max(8))
+ if (dbg_is_chk_index(c) && !get_random_u32_below(8))
return -ENOSPC;
return 0;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index dce6ae9ae306..1d7c2a812fc1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int udf_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, udf_get_block, wbc);
-}
-
static int udf_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = udf_read_folio,
.readahead = udf_readahead,
- .writepage = udf_writepage,
.writepages = udf_writepages,
.write_begin = udf_write_begin,
.write_end = generic_write_end,
.direct_IO = udf_direct_IO,
.bmap = udf_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
/*
@@ -439,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block,
iinfo->i_next_alloc_goal++;
}
+ /*
+ * Block beyond EOF and prealloc extents? Just discard preallocation
+ * as it is not useful and complicates things.
+ */
+ if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents)
+ udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
phys = inode_getblk(inode, block, &err, &new);
if (!phys)
@@ -488,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode,
uint32_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
struct super_block *sb = inode->i_sb;
- struct kernel_lb_addr prealloc_loc = {};
- uint32_t prealloc_len = 0;
struct udf_inode_info *iinfo;
int err;
@@ -510,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode,
~(sb->s_blocksize - 1);
}
- /* Last extent are just preallocated blocks? */
- if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
- EXT_NOT_RECORDED_ALLOCATED) {
- /* Save the extent so that we can reattach it to the end */
- prealloc_loc = last_ext->extLocation;
- prealloc_len = last_ext->extLength;
- /* Mark the extent as a hole */
- last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
- last_ext->extLocation.logicalBlockNum = 0;
- last_ext->extLocation.partitionReferenceNum = 0;
- }
-
/* Can we merge with the previous extent? */
if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
EXT_NOT_RECORDED_NOT_ALLOCATED) {
@@ -550,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode,
* more extents, we may need to enter possible following
* empty indirect extent.
*/
- if (new_block_bytes || prealloc_len)
+ if (new_block_bytes)
udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
}
@@ -584,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode,
}
out:
- /* Do we have some preallocated blocks saved? */
- if (prealloc_len) {
- err = udf_add_aext(inode, last_pos, &prealloc_loc,
- prealloc_len, 1);
- if (err)
- return err;
- last_ext->extLocation = prealloc_loc;
- last_ext->extLength = prealloc_len;
- count++;
- }
-
/* last_pos should point to the last written extent... */
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
last_pos->offset -= sizeof(struct short_ad);
@@ -610,13 +585,17 @@ out:
static void udf_do_extend_final_block(struct inode *inode,
struct extent_position *last_pos,
struct kernel_long_ad *last_ext,
- uint32_t final_block_len)
+ uint32_t new_elen)
{
- struct super_block *sb = inode->i_sb;
uint32_t added_bytes;
- added_bytes = final_block_len -
- (last_ext->extLength & (sb->s_blocksize - 1));
+ /*
+ * Extent already large enough? It may be already rounded up to block
+ * size...
+ */
+ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK))
+ return;
+ added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen;
last_ext->extLength += added_bytes;
UDF_I(inode)->i_lenExtents += added_bytes;
@@ -633,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
int8_t etype;
struct super_block *sb = inode->i_sb;
sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
- unsigned long partial_final_block;
+ loff_t new_elen;
int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
struct kernel_long_ad extent;
int err = 0;
- int within_final_block;
+ bool within_last_ext;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -647,8 +626,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
else
BUG();
+ /*
+ * When creating hole in file, just don't bother with preserving
+ * preallocation. It likely won't be very useful anyway.
+ */
+ udf_discard_prealloc(inode);
+
etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
- within_final_block = (etype != -1);
+ within_last_ext = (etype != -1);
+ /* We don't expect extents past EOF... */
+ WARN_ON_ONCE(within_last_ext &&
+ elen > ((loff_t)offset + 1) << inode->i_blkbits);
if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
(epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
@@ -664,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
extent.extLength |= etype << 30;
}
- partial_final_block = newsize & (sb->s_blocksize - 1);
+ new_elen = ((loff_t)offset << inode->i_blkbits) |
+ (newsize & (sb->s_blocksize - 1));
/* File has extent covering the new size (could happen when extending
* inside a block)?
*/
- if (within_final_block) {
+ if (within_last_ext) {
/* Extending file within the last file block */
- udf_do_extend_final_block(inode, &epos, &extent,
- partial_final_block);
+ udf_do_extend_final_block(inode, &epos, &extent, new_elen);
} else {
- loff_t add = ((loff_t)offset << sb->s_blocksize_bits) |
- partial_final_block;
- err = udf_do_extend_file(inode, &epos, &extent, add);
+ err = udf_do_extend_file(inode, &epos, &extent, new_elen);
}
if (err < 0)
@@ -777,10 +763,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goto out_free;
}
- /* Are we beyond EOF? */
+ /* Are we beyond EOF and preallocated extent? */
if (etype == -1) {
int ret;
loff_t hole_len;
+
isBeyondEOF = true;
if (count) {
if (c)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ae7bc13a5298..7c95c549dd64 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
return -EINVAL;
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
- if (IS_ERR(ofi)) {
- retval = PTR_ERR(ofi);
+ if (!ofi || IS_ERR(ofi)) {
+ if (IS_ERR(ofi))
+ retval = PTR_ERR(ofi);
goto end_rename;
}
@@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
brelse(ofibh.sbh);
tloc = lelb_to_cpu(ocfi.icb.extLocation);
- if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
- != old_inode->i_ino)
+ if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino)
goto end_rename;
nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4042d9739fb7..06eda8177b5f 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct udf_inode_info *ei = (struct udf_inode_info *)foo;
+ struct udf_inode_info *ei = foo;
ei->i_data = NULL;
inode_init_once(&ei->vfs_inode);
@@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb,
struct kernel_lb_addr *fileset,
struct kernel_lb_addr *root)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
uint16_t ident;
int ret;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 532cda99644e..036ebd892b85 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -120,60 +120,42 @@ void udf_truncate_tail_extent(struct inode *inode)
void udf_discard_prealloc(struct inode *inode)
{
- struct extent_position epos = { NULL, 0, {0, 0} };
+ struct extent_position epos = {};
+ struct extent_position prev_epos = {};
struct kernel_lb_addr eloc;
uint32_t elen;
uint64_t lbcount = 0;
int8_t etype = -1, netype;
- int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
+ int bsize = 1 << inode->i_blkbits;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ||
- inode->i_size == iinfo->i_lenExtents)
+ ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize))
return;
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(struct short_ad);
- else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(struct long_ad);
- else
- adsize = 0;
-
epos.block = iinfo->i_location;
/* Find the last extent in the file */
- while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
- etype = netype;
+ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) {
+ brelse(prev_epos.bh);
+ prev_epos = epos;
+ if (prev_epos.bh)
+ get_bh(prev_epos.bh);
+
+ etype = udf_next_aext(inode, &epos, &eloc, &elen, 1);
lbcount += elen;
}
if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
- epos.offset -= adsize;
lbcount -= elen;
- extent_trunc(inode, &epos, &eloc, etype, elen, 0);
- if (!epos.bh) {
- iinfo->i_lenAlloc =
- epos.offset -
- udf_file_entry_alloc_offset(inode);
- mark_inode_dirty(inode);
- } else {
- struct allocExtDesc *aed =
- (struct allocExtDesc *)(epos.bh->b_data);
- aed->lengthAllocDescs =
- cpu_to_le32(epos.offset -
- sizeof(struct allocExtDesc));
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos.bh->b_data, epos.offset);
- else
- udf_update_tag(epos.bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos.bh, inode);
- }
+ udf_delete_aext(inode, prev_epos);
+ udf_free_blocks(inode->i_sb, inode, &eloc, 0,
+ DIV_ROUND_UP(elen, 1 << inode->i_blkbits));
}
/* This inode entry is in-memory only and thus we don't have to mark
* the inode dirty */
iinfo->i_lenExtents = lbcount;
brelse(epos.bh);
+ brelse(prev_epos.bh);
}
static void udf_update_alloc_ext_desc(struct inode *inode,
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 4fa620543d30..291b56dd011e 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -6,7 +6,11 @@
#include <linux/bitops.h>
#include <linux/magic.h>
-#define UDF_MAX_READ_VERSION 0x0250
+/*
+ * Even UDF 2.6 media should have version <= 0x250 but apparently there are
+ * some broken filesystems with version set to 0x260. Accommodate those.
+ */
+#define UDF_MAX_READ_VERSION 0x0260
#define UDF_MAX_WRITE_VERSION 0x0201
#define UDF_FLAG_USE_EXTENDED_FE 0
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index dbe1ce5b450a..c7fcb855e068 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -32,6 +32,11 @@ struct fsverity_hash_alg {
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
mempool_t req_pool; /* mempool with a preallocated hash request */
+ /*
+ * The HASH_ALGO_* constant for this algorithm. This is different from
+ * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme.
+ */
+ enum hash_algo algo_id;
};
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 71d0fccb6d4c..6f8170cf4ae7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
.name = "sha256",
.digest_size = SHA256_DIGEST_SIZE,
.block_size = SHA256_BLOCK_SIZE,
+ .algo_id = HASH_ALGO_SHA256,
},
[FS_VERITY_HASH_ALG_SHA512] = {
.name = "sha512",
.digest_size = SHA512_DIGEST_SIZE,
.block_size = SHA512_BLOCK_SIZE,
+ .algo_id = HASH_ALGO_SHA512,
},
};
@@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void)
*/
BUG_ON(!is_power_of_2(alg->digest_size));
BUG_ON(!is_power_of_2(alg->block_size));
+
+ /* Verify that there is a valid mapping to HASH_ALGO_*. */
+ BUG_ON(alg->algo_id == 0);
+ BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]);
}
}
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index e99c00350c28..5c79ea1b2468 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
* @alg: (out) pointer to the hash algorithm enumeration
*
* Return the file hash algorithm and digest of an fsverity protected file.
- * Assumption: before calling fsverity_get_digest(), the file must have been
- * opened.
+ * Assumption: before calling this, the file must have been opened.
*
* Return: 0 on success, -errno on failure
*/
@@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode,
{
const struct fsverity_info *vi;
const struct fsverity_hash_alg *hash_alg;
- int i;
vi = fsverity_get_info(inode);
if (!vi)
return -ENODATA; /* not a verity file */
hash_alg = vi->tree_params.hash_alg;
- memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE);
-
- /* convert the verity hash algorithm name to a hash_algo_name enum */
- i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name);
- if (i < 0)
- return -EINVAL;
- *alg = i;
-
- if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg]))
- return -EINVAL;
memcpy(digest, vi->file_digest, hash_alg->digest_size);
-
- pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg],
- hash_digest_size[*alg], digest);
-
+ *alg = hash_alg->algo_id;
return 0;
}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index bde8c9b7d25f..961ba248021f 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -200,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
* @bio: the bio to verify
*
* Verify a set of pages that have just been read from a verity file. The pages
- * must be pagecache pages that are still locked and not yet uptodate. Pages
- * that fail verification are set to the Error state. Verification is skipped
- * for pages already in the Error state, e.g. due to fscrypt decryption failure.
+ * must be pagecache pages that are still locked and not yet uptodate. If a
+ * page fails verification, then bio->bi_status is set to an error status.
*
* This is a helper function for use by the ->readahead() method of filesystems
* that issue bios to read data directly into the page cache. Filesystems that
@@ -244,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio)
unsigned long level0_ra_pages =
min(max_ra_pages, params->level0_blocks - level0_index);
- if (!PageError(page) &&
- !verify_page(inode, vi, req, page, level0_ra_pages))
- SetPageError(page);
+ if (!verify_page(inode, vi, req, page, level0_ra_pages)) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
}
fsverity_free_hash_request(params->hash_alg, req);
diff --git a/fs/xattr.c b/fs/xattr.c
index 61107b6bbed2..86668d2ce268 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name)
return ERR_PTR(-EOPNOTSUPP);
}
+/**
+ * may_write_xattr - check whether inode allows writing xattr
+ * @mnt_userns: User namespace of the mount the inode was found from
+ * @inode: the inode on which to set an xattr
+ *
+ * Check whether the inode allows writing xattrs. Specifically, we can never
+ * set or remove an extended attribute on a read-only filesystem or on an
+ * immutable / append-only inode.
+ *
+ * We also need to ensure that the inode has a mapping in the mount to
+ * not risk writing back invalid i_{g,u}id values.
+ *
+ * Return: On success zero is returned. On error a negative errno is returned.
+ */
+int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode)
+{
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+ if (IS_APPEND(inode))
+ return -EPERM;
+ if (HAS_UNMAPPED_ID(mnt_userns, inode))
+ return -EPERM;
+ return 0;
+}
+
/*
* Check permissions for extended attribute access. This is a bit complicated
* because different namespaces have very different rules.
@@ -88,20 +113,12 @@ static int
xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
const char *name, int mask)
{
- /*
- * We can never set or remove an extended attribute on a read-only
- * filesystem or on an immutable / append-only inode.
- */
if (mask & MAY_WRITE) {
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- /*
- * Updating an xattr will likely cause i_uid and i_gid
- * to be writen back improperly if their true value is
- * unknown to the vfs.
- */
- if (HAS_UNMAPPED_ID(mnt_userns, inode))
- return -EPERM;
+ int ret;
+
+ ret = may_write_xattr(mnt_userns, inode);
+ if (ret)
+ return ret;
}
/*
@@ -172,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
{
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -282,12 +302,6 @@ out:
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
-static inline bool is_posix_acl_xattr(const char *name)
-{
- return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0);
-}
-
int
vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const char *name, const void *value, size_t size, int flags)
@@ -399,6 +413,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
{
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -437,10 +454,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return ret;
}
nolsm:
- error = __vfs_getxattr(dentry, inode, name, value, size);
- if (error > 0 && is_posix_acl_xattr(name))
- posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size);
- return error;
+ return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
@@ -471,6 +485,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct inode *inode = d_inode(dentry);
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -580,23 +597,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
return error;
}
-static void setxattr_convert(struct user_namespace *mnt_userns,
- struct dentry *d, struct xattr_ctx *ctx)
-{
- if (ctx->size && is_posix_acl_xattr(ctx->kname->name))
- posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
-}
-
-int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx)
{
- setxattr_convert(mnt_userns, dentry, ctx);
- return vfs_setxattr(mnt_userns, dentry, ctx->kname->name,
+ if (is_posix_acl_xattr(ctx->kname->name))
+ return do_set_acl(idmap, dentry, ctx->kname->name,
+ ctx->kvalue, ctx->size);
+
+ return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name,
ctx->kvalue, ctx->size, ctx->flags);
}
static long
-setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+setxattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name, const void __user *value, size_t size,
int flags)
{
@@ -614,7 +627,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error)
return error;
- error = do_setxattr(mnt_userns, d, &ctx);
+ error = do_setxattr(idmap, d, &ctx);
kvfree(ctx.kvalue);
return error;
@@ -633,7 +646,7 @@ retry:
return error;
error = mnt_want_write(path.mnt);
if (!error) {
- error = setxattr(mnt_user_ns(path.mnt), path.dentry, name,
+ error = setxattr(mnt_idmap(path.mnt), path.dentry, name,
value, size, flags);
mnt_drop_write(path.mnt);
}
@@ -670,7 +683,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = setxattr(file_mnt_user_ns(f.file),
+ error = setxattr(file_mnt_idmap(f.file),
f.file->f_path.dentry, name,
value, size, flags);
mnt_drop_write_file(f.file);
@@ -683,7 +696,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
* Extended attribute GET operations
*/
ssize_t
-do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
struct xattr_ctx *ctx)
{
ssize_t error;
@@ -697,10 +710,12 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
return -ENOMEM;
}
- error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
+ if (is_posix_acl_xattr(ctx->kname->name))
+ error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
+ else
+ error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname,
+ ctx->kvalue, ctx->size);
if (error > 0) {
- if (is_posix_acl_xattr(kname))
- posix_acl_fix_xattr_to_user(ctx->kvalue, error);
if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
@@ -713,7 +728,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
}
static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+getxattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name, void __user *value, size_t size)
{
ssize_t error;
@@ -732,7 +747,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error < 0)
return error;
- error = do_getxattr(mnt_userns, d, &ctx);
+ error = do_getxattr(idmap, d, &ctx);
kvfree(ctx.kvalue);
return error;
@@ -748,7 +763,7 @@ retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
- error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
+ error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -778,7 +793,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
if (!f.file)
return error;
audit_file(f.file);
- error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
+ error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
name, value, size);
fdput(f);
return error;
@@ -863,7 +878,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
* Extended attribute REMOVE operations
*/
static long
-removexattr(struct user_namespace *mnt_userns, struct dentry *d,
+removexattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name)
{
int error;
@@ -875,7 +890,10 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error < 0)
return error;
- return vfs_removexattr(mnt_userns, d, kname);
+ if (is_posix_acl_xattr(kname))
+ return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname);
+
+ return vfs_removexattr(mnt_idmap_owner(idmap), d, kname);
}
static int path_removexattr(const char __user *pathname,
@@ -889,7 +907,7 @@ retry:
return error;
error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(mnt_user_ns(path.mnt), path.dentry, name);
+ error = removexattr(mnt_idmap(path.mnt), path.dentry, name);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -922,7 +940,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = removexattr(file_mnt_user_ns(f.file),
+ error = removexattr(file_mnt_idmap(f.file),
f.file->f_path.dentry, name);
mnt_drop_write_file(f.file);
}
@@ -1140,7 +1158,7 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size,
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
char *buffer, size_t size)
{
- bool trusted = capable(CAP_SYS_ADMIN);
+ bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
struct simple_xattr *xattr;
ssize_t remaining_size = size;
int err = 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index de79f5d07f65..989cf341779b 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1516,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock(
#ifdef DEBUG
/* Randomly don't execute the first algorithm. */
- if (prandom_u32_max(2))
+ if (get_random_u32_below(2))
return 0;
#endif
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 94db50eb706a..5118dedf9267 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc(
/* randomly do sparse inode allocations */
if (xfs_has_sparseinodes(tp->t_mountp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks)
- do_sparse = prandom_u32_max(2);
+ do_sparse = get_random_u32_below(2);
#endif
/*
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b744c62052b6..a05f44eb8178 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -242,12 +242,13 @@ xfs_acl_set_mode(
}
int
-xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
umode_t mode;
bool set_mode = false;
int error = 0;
+ struct inode *inode = d_inode(dentry);
if (!acl)
goto set_acl;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 263404d0bfda..dcd176149c7a 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -11,7 +11,7 @@ struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu);
-extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c6b2aabd6f18..822e6a0e9d1a 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -279,7 +279,7 @@ xfs_errortag_test(
ASSERT(error_tag < XFS_ERRTAG_MAX);
randfactor = mp->m_errortag[error_tag];
- if (!randfactor || prandom_u32_max(randfactor))
+ if (!randfactor || get_random_u32_below(randfactor))
return false;
xfs_warn_ratelimited(mp,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2e10e1c66ad6..712238305bc3 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -651,6 +651,7 @@ xfs_vn_change_ok(
static int
xfs_setattr_nonsize(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -757,7 +758,7 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if (mask & ATTR_MODE) {
- error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
if (error)
return error;
}
@@ -779,6 +780,7 @@ out_dqrele:
STATIC int
xfs_setattr_size(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -810,7 +812,7 @@ xfs_setattr_size(
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
/*
@@ -987,7 +989,7 @@ xfs_vn_setattr_size(
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(mnt_userns, ip, iattr);
+ return xfs_setattr_size(mnt_userns, dentry, ip, iattr);
}
STATIC int
@@ -1019,7 +1021,7 @@ xfs_vn_setattr(
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (!error)
- error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
return error;
@@ -1101,7 +1103,7 @@ xfs_vn_tmpfile(
}
static const struct inode_operations xfs_inode_operations = {
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1128,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1155,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,