Diffstat (limited to 'fs')
276 files changed, 4970 insertions, 3986 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 4dac4a0dc5f4..c397c51f80d9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -17,34 +17,64 @@ #include "v9fs_vfs.h" #include "fid.h" -static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) +static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name) { ssize_t size; void *value = NULL; struct posix_acl *acl = NULL; size = v9fs_fid_xattr_get(fid, name, NULL, 0); - if (size > 0) { - value = kzalloc(size, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - size = v9fs_fid_xattr_get(fid, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - goto err_out; - } - } else if (size == -ENODATA || size == 0 || - size == -ENOSYS || size == -EOPNOTSUPP) { - acl = NULL; - } else - acl = ERR_PTR(-EIO); - -err_out: + if (size < 0) + return ERR_PTR(size); + if (size == 0) + return ERR_PTR(-ENODATA); + + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + + size = v9fs_fid_xattr_get(fid, name, value, size); + if (size < 0) + acl = ERR_PTR(size); + else if (size == 0) + acl = ERR_PTR(-ENODATA); + else + acl = posix_acl_from_xattr(&init_user_ns, value, size); kfree(value); return acl; } +static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name) +{ + struct p9_fid *fid; + struct posix_acl *acl = NULL; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return ERR_CAST(fid); + + acl = v9fs_fid_get_acl(fid, name); + p9_fid_put(fid); + return acl; +} + +static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name) +{ + int retval; + struct posix_acl *acl = NULL; + + acl = v9fs_fid_get_acl(fid, name); + if (!IS_ERR(acl)) + return acl; + + retval = PTR_ERR(acl); + if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP) + return NULL; + + /* map everything else to -EIO */ + return ERR_PTR(-EIO); +} + int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { int retval = 0; @@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; @@ -109,6 +139,112 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) } +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + struct v9fs_session_info *v9ses; + + v9ses = v9fs_dentry2v9ses(dentry); + /* We allow set/get/list of acl when access=client is not specified. */ + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) + return v9fs_acl_get(dentry, posix_acl_xattr_name(type)); + return v9fs_get_cached_acl(d_inode(dentry), type); +} + +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int retval; + size_t size = 0; + void *value = NULL; + const char *acl_name; + struct v9fs_session_info *v9ses; + struct inode *inode = d_inode(dentry); + + if (acl) { + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); + if (retval) + goto err_out; + + size = posix_acl_xattr_size(acl->a_count); + + value = kzalloc(size, GFP_NOFS); + if (!value) { + retval = -ENOMEM; + goto err_out; + } + + retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (retval < 0) + goto err_out; + } + + /* + * set the attribute on the remote. Without even looking at the + * xattr value. 
We leave it to the server to validate + */ + acl_name = posix_acl_xattr_name(type); + v9ses = v9fs_dentry2v9ses(dentry); + if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + goto err_out; + } + + if (S_ISLNK(inode->i_mode)) { + retval = -EOPNOTSUPP; + goto err_out; + } + + if (!inode_owner_or_capable(&init_user_ns, inode)) { + retval = -EPERM; + goto err_out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (acl) { + struct iattr iattr = {}; + struct posix_acl *acl_mode = acl; + + retval = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, + &acl_mode); + if (retval) + goto err_out; + if (!acl_mode) { + /* + * ACL can be represented by the mode bits. + * So don't update ACL below. + */ + kfree(value); + value = NULL; + size = 0; + } + iattr.ia_valid = ATTR_MODE; + /* + * FIXME should we update ctime ? + * What is the following setxattr update the mode ? + */ + v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + retval = acl ? -EINVAL : 0; + goto err_out; + } + break; + } + + retval = v9fs_xattr_set(dentry, acl_name, value, size, 0); + if (!retval) + set_cached_acl(inode, type, acl); + +err_out: + kfree(value); + return retval; +} + static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) { int retval; @@ -207,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, *modep = mode; return 0; } - -static int v9fs_xattr_get_acl(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct v9fs_session_info *v9ses; - struct posix_acl *acl; - int error; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * We allow set/get/list of acl when access=client is not specified - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_get(dentry, handler->name, buffer, size); - - acl = v9fs_get_cached_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - int retval; - struct posix_acl *acl; - struct v9fs_session_info *v9ses; - - v9ses = v9fs_dentry2v9ses(dentry); - /* - * set the attribute on the remote. Without even looking at the - * xattr value. We leave it to the server to validate - */ - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_set(dentry, handler->name, value, size, - flags); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (value) { - /* update the cached acl value */ - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); - if (retval) - goto err_out; - } - } else - acl = NULL; - - switch (handler->flags) { - case ACL_TYPE_ACCESS: - if (acl) { - struct iattr iattr = { 0 }; - struct posix_acl *old_acl = acl; - - retval = posix_acl_update_mode(&init_user_ns, inode, - &iattr.ia_mode, &acl); - if (retval) - goto err_out; - if (!acl) { - /* - * ACL can be represented - * by the mode bits. 
So don't - * update ACL. - */ - posix_acl_release(old_acl); - value = NULL; - size = 0; - } - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? - */ - v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - retval = acl ? -EINVAL : 0; - goto err_out; - } - break; - default: - BUG(); - } - retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); - if (!retval) - set_cached_acl(inode, handler->flags, acl); -err_out: - posix_acl_release(acl); - return retval; -} - -const struct xattr_handler v9fs_xattr_acl_access_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; - -const struct xattr_handler v9fs_xattr_acl_default_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = v9fs_xattr_get_acl, - .set = v9fs_xattr_set_acl, -}; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index ce5175d463dd..4c60a2bce5de 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -8,8 +8,12 @@ #ifdef CONFIG_9P_FS_POSIX_ACL int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, +struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); +int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, struct posix_acl *dacl, struct posix_acl *acl); @@ -17,7 +21,9 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else +#define v9fs_iop_get_inode_acl NULL #define v9fs_iop_get_acl NULL +#define v9fs_iop_set_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) { return 0; diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 47b9a1122f34..a19891015f19 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -40,7 +40,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); + iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); total = p9_client_read(fid, pos, &to, &err); @@ -172,7 +172,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) len = min_t(loff_t, i_size - start, len); - iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len); + iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 000fbaae9b18..3bb95adc9619 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -109,7 +109,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) struct iov_iter to; int n; - iov_iter_kvec(&to, READ, &kvec, 1, buflen); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5cfa4b4f070f..03c1743c4aff 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ 
-983,14 +983,18 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, + .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, + .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, + .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1f9298a4bd42..b6984311e00a 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/uio.h> +#include <linux/posix_acl_xattr.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -24,7 +25,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); + iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -109,7 +110,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); + iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); @@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = { const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, -#ifdef CONFIG_9P_FS_POSIX_ACL - &v9fs_xattr_acl_access_handler, - &v9fs_xattr_acl_default_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_9P_FS_SECURITY &v9fs_xattr_security_handler, diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index 3e11fc3331eb..b5636e544c8a 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -11,8 +11,6 @@ #include <net/9p/client.h> extern const struct xattr_handler *v9fs_xattr_handlers[]; -extern const struct xattr_handler v9fs_xattr_acl_access_handler; -extern const struct xattr_handler v9fs_xattr_acl_default_handler; ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 0a090d614e76..7dcd59693a0c 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -298,7 +298,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (call->count2 != call->count && call->count2 != 0) return afs_protocol_error(call, afs_eproto_cb_count); call->iter = &call->def_iter; - iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); + iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4); call->unmarshall++; fallthrough; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 230c2d19116d..104df2964225 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; diff --git a/fs/afs/file.c 
b/fs/afs/file.c index d1cfb235c4b9..2eeab57df133 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -324,7 +324,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); @@ -346,7 +346,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) fsreq->len = folio_size(folio); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages, + iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, fsreq->pos, fsreq->len); ret = afs_fetch_data(fsreq->vnode, fsreq); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 723d162078a3..9ba7b68375c9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1301,7 +1301,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; - iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); + iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) @@ -1319,7 +1319,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call) static inline void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; - iov_iter_discard(&call->def_iter, READ, size); + iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index eccc3cd0cb70..c62939e5ea1f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -359,7 +359,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->write_iter ? 
MSG_MORE : 0); @@ -400,7 +400,7 @@ error_do_abort: RX_USER_ABORT, ret, "KSD"); } else { len = 0; - iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); @@ -485,7 +485,7 @@ static void afs_deliver_to_call(struct afs_call *call) ) { if (state == AFS_CALL_SV_AWAIT_ACK) { len = 0; - iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); + iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, &len, false, &remote_abort, @@ -822,7 +822,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -862,7 +862,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/write.c b/fs/afs/write.c index 9ebdd36eaf2f..08fd456dde67 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -609,7 +609,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, */ afs_write_to_cache(vnode, start, len, i_size, caching); - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); @@ -1000,7 +1000,7 @@ int afs_launder_folio(struct folio *folio) bv[0].bv_page = &folio->page; bv[0].bv_offset = f; bv[0].bv_len = t - f; - iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); + iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); @@ -1552,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, if (unlikely(!file->f_op->read_iter)) return -EINVAL; - ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); @@ -1580,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, if (unlikely(!file->f_op->write_iter)) return -EINVAL; - ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); + ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter); if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); diff --git a/fs/attr.c b/fs/attr.c index 1552a5f23d6b..b45f30e516fa 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,6 +18,70 @@ #include <linux/evm.h> #include <linux/ima.h> +#include "internal.h" + +/** + * setattr_should_drop_sgid - determine whether the setgid bit needs to be + * removed + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * + * This function determines whether the setgid bit needs to be removed. + * We retain backwards compatibility and require setgid bit to be removed + * unconditionally if S_IXGRP is set. 
Otherwise we have the exact same + * requirements as setattr_prepare() and setattr_copy(). + * + * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode) +{ + umode_t mode = inode->i_mode; + + if (!(mode & S_ISGID)) + return 0; + if (mode & S_IXGRP) + return ATTR_KILL_SGID; + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) + return ATTR_KILL_SGID; + return 0; +} + +/** + * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to + * be dropped + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * + * This function determines whether the set{g,u}id bits need to be removed. + * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the + * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both + * set{g,u}id bits need to be removed the corresponding mask of both flags is + * returned. + * + * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits + * to remove, 0 otherwise. + */ +int setattr_should_drop_suidgid(struct user_namespace *mnt_userns, + struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + kill |= setattr_should_drop_sgid(mnt_userns, inode); + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(setattr_should_drop_suidgid); + /** * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from @@ -140,8 +204,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! 
*/ - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } @@ -251,9 +314,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_or_capable(mnt_userns, inode, + i_gid_into_vfsgid(mnt_userns, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } @@ -375,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, } } if (ia_valid & ATTR_KILL_SGID) { - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9d1cde8066cf..92737166203f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -154,7 +154,7 @@ static int bad_inode_tmpfile(struct user_namespace *mnt_userns, } static int bad_inode_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; @@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops = .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, - .get_acl = bad_inode_get_acl, + .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6a11025e5850..de63572a9404 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, } while (0) #ifdef ARCH_DLINFO - /* + /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in @@ -456,13 +456,13 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) * * Loads ELF program headers from the binary file elf_file, which has the ELF * header pointed to by elf_ex, into a newly allocated array. The caller is - * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + * responsible for freeing the allocated data. Returns NULL upon failure. */ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; - int retval, err = -1; + int retval = -1; unsigned int size; /* @@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, /* Read in the program headers */ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); - if (retval < 0) { - err = retval; - goto out; - } - /* Success! */ - err = 0; out: - if (err) { + if (retval) { kfree(elf_phdata); elf_phdata = NULL; } @@ -1020,7 +1014,7 @@ out_free_interp: executable_stack); if (retval < 0) goto out_free_dentry; - + elf_bss = 0; elf_brk = 0; @@ -1043,7 +1037,7 @@ out_free_interp: if (unlikely (elf_brk > elf_bss)) { unsigned long nbyte; - + /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. 
*/ @@ -1166,7 +1160,7 @@ out_free_interp: error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { - retval = IS_ERR((void *)error) ? + retval = IS_ERR_VALUE(error) ? PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } @@ -1251,7 +1245,7 @@ out_free_interp: interpreter, load_bias, interp_elf_phdata, &arch_state); - if (!IS_ERR((void *)elf_entry)) { + if (!IS_ERR_VALUE(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment @@ -1260,7 +1254,7 @@ out_free_interp: elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { - retval = IS_ERR((void *)elf_entry) ? + retval = IS_ERR_VALUE(elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; } @@ -1521,7 +1515,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_align = 0; } -static void fill_note(struct memelfnote *note, const char *name, int type, +static void fill_note(struct memelfnote *note, const char *name, int type, unsigned int sz, void *data) { note->name = name; @@ -1724,7 +1718,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm return 0; } -#ifdef CORE_DUMP_USE_REGSET #include <linux/regset.h> struct elf_thread_core_info { @@ -1745,6 +1738,7 @@ struct elf_note_info { int thread_notes; }; +#ifdef CORE_DUMP_USE_REGSET /* * When a regset has a writeback hook, we call it on each thread before * dumping user memory. On register window machines, this makes sure the @@ -1824,34 +1818,58 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, return 1; } +#else +static int fill_thread_core_info(struct elf_thread_core_info *t, + const struct user_regset_view *view, + long signr, struct elf_note_info *info) +{ + struct task_struct *p = t->task; + elf_fpregset_t *fpu; + + fill_prstatus(&t->prstatus.common, p, signr); + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + &(t->prstatus)); + info->size += notesize(&t->notes[0]); + + fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL); + if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) { + kfree(fpu); + return 1; + } + + t->prstatus.pr_fpvalid = 1; + fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + info->size += notesize(&t->notes[1]); + + return 1; +} +#endif static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, struct coredump_params *cprm) { struct task_struct *dump_task = current; - const struct user_regset_view *view = task_user_regset_view(dump_task); + const struct user_regset_view *view; struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; - unsigned int i; - - info->size = 0; - info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - if (psinfo == NULL) { - info->psinfo.data = NULL; /* So we don't free this wrongly */ + if (!psinfo) return 0; - } - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); +#ifdef CORE_DUMP_USE_REGSET + view = task_user_regset_view(dump_task); + /* * Figure out how many notes we're going to need for each thread. 
*/ info->thread_notes = 0; - for (i = 0; i < view->n; ++i) + for (int i = 0; i < view->n; ++i) if (view->regsets[i].core_note_type != 0) ++info->thread_notes; @@ -1870,11 +1888,23 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, */ fill_elf_header(elf, phdrs, view->e_machine, view->e_flags); +#else + view = NULL; + info->thread_notes = 2; + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); +#endif /* * Allocate a structure for each thread. */ - for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) { + info->thread = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!info->thread)) + return 0; + + info->thread->task = dump_task; + for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) { t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); @@ -1882,17 +1912,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; t->task = ct->task; - if (ct->task == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + t->next = info->thread->next; + info->thread->next = t; } /* @@ -1920,11 +1941,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 1; } -static size_t get_note_info_size(struct elf_note_info *info) -{ - return info->size; -} - /* * Write all the notes for each thread. When writing the first thread, the * process-wide notes are interleaved after the first thread-specific note. @@ -1979,197 +1995,6 @@ static void free_note_info(struct elf_note_info *info) kvfree(info->files.data); } -#else - -/* Here is the structure in which status of each thread is captured. */ -struct elf_thread_status -{ - struct list_head list; - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - elf_fpregset_t fpu; /* NT_PRFPREG */ - struct task_struct *thread; - struct memelfnote notes[3]; - int num_notes; -}; - -/* - * In order to add the specific thread information for the elf file format, - * we need to keep a linked list of every threads pr_status and then create - * a single section for them in the final core file. 
- */ -static int elf_dump_thread_status(long signr, struct elf_thread_status *t) -{ - int sz = 0; - struct task_struct *p = t->thread; - t->num_notes = 0; - - fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), - &(t->prstatus)); - t->num_notes++; - sz += notesize(&t->notes[0]); - - if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, - &t->fpu))) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), - &(t->fpu)); - t->num_notes++; - sz += notesize(&t->notes[1]); - } - return sz; -} - -struct elf_note_info { - struct memelfnote *notes; - struct memelfnote *notes_files; - struct elf_prstatus *prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */ - struct list_head thread_list; - elf_fpregset_t *fpu; - user_siginfo_t csigdata; - int thread_status_size; - int numnote; -}; - -static int elf_note_info_init(struct elf_note_info *info) -{ - memset(info, 0, sizeof(*info)); - INIT_LIST_HEAD(&info->thread_list); - - /* Allocate space for ELF notes */ - info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL); - if (!info->notes) - return 0; - info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); - if (!info->psinfo) - return 0; - info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); - if (!info->prstatus) - return 0; - info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); - if (!info->fpu) - return 0; - return 1; -} - -static int fill_note_info(struct elfhdr *elf, int phdrs, - struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct core_thread *ct; - struct elf_thread_status *ets; - - if (!elf_note_info_init(info)) - return 0; - - for (ct = current->signal->core_state->dumper.next; - ct; ct = ct->next) { - ets = kzalloc(sizeof(*ets), GFP_KERNEL); - if (!ets) - return 0; - - ets->thread = ct->task; - list_add(&ets->list, &info->thread_list); - } - - list_for_each_entry(ets, &info->thread_list, list) { - int sz; - - sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); - info->thread_status_size += sz; - } - /* now collect the dump for the current */ - memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); - - /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); - - /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. - */ - - fill_note(info->notes + 0, "CORE", NT_PRSTATUS, - sizeof(*info->prstatus), info->prstatus); - fill_psinfo(info->psinfo, current->group_leader, current->mm); - fill_note(info->notes + 1, "CORE", NT_PRPSINFO, - sizeof(*info->psinfo), info->psinfo); - - fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); - fill_auxv_note(info->notes + 3, current->mm); - info->numnote = 4; - - if (fill_files_note(info->notes + info->numnote, cprm) == 0) { - info->notes_files = info->notes + info->numnote; - info->numnote++; - } - - /* Try to dump the FPU. 
*/ - info->prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); - if (info->prstatus->pr_fpvalid) - fill_note(info->notes + info->numnote++, - "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); - return 1; -} - -static size_t get_note_info_size(struct elf_note_info *info) -{ - int sz = 0; - int i; - - for (i = 0; i < info->numnote; i++) - sz += notesize(info->notes + i); - - sz += info->thread_status_size; - - return sz; -} - -static int write_note_info(struct elf_note_info *info, - struct coredump_params *cprm) -{ - struct elf_thread_status *ets; - int i; - - for (i = 0; i < info->numnote; i++) - if (!writenote(info->notes + i, cprm)) - return 0; - - /* write out the thread status notes section */ - list_for_each_entry(ets, &info->thread_list, list) { - for (i = 0; i < ets->num_notes; i++) - if (!writenote(&ets->notes[i], cprm)) - return 0; - } - - return 1; -} - -static void free_note_info(struct elf_note_info *info) -{ - while (!list_empty(&info->thread_list)) { - struct list_head *tmp = info->thread_list.next; - list_del(tmp); - kfree(list_entry(tmp, struct elf_thread_status, list)); - } - - /* Free data possibly allocated by fill_files_note(): */ - if (info->notes_files) - kvfree(info->notes_files->data); - - kfree(info->prstatus); - kfree(info->psinfo); - kfree(info->notes); - kfree(info->fpu); -} - -#endif - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2233,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm) /* Write notes phdr entry */ { - size_t sz = get_note_info_size(&info); + size_t sz = info.size; /* For cell spufs */ sz += elf_coredump_extra_notes_size(); @@ -2295,7 +2120,7 @@ static int elf_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!write_note_info(&info, cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d0c8797828..096e3520a0b1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) current->mm->start_stack = current->mm->start_brk + stack_size; #endif - if (create_elf_fdpic_tables(bprm, current->mm, - &exec_params, &interp_params) < 0) + retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params, + &interp_params); + if (retval < 0) goto error; kdebug("- start_code %lx", current->mm->start_code); @@ -1603,7 +1604,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!writenote(thread_list->notes, cprm)) goto end_coredump; if (!writenote(&psinfo_note, cprm)) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..bb202ad369d5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -44,10 +44,10 @@ static LIST_HEAD(entries); static int enabled = 1; enum {Enabled, Magic}; -#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) -#define MISC_FMT_OPEN_BINARY (1 << 30) -#define MISC_FMT_CREDENTIALS (1 << 29) -#define MISC_FMT_OPEN_FILE (1 << 28) +#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) +#define MISC_FMT_OPEN_BINARY (1UL << 30) +#define MISC_FMT_CREDENTIALS (1UL << 29) +#define MISC_FMT_OPEN_FILE (1UL << 28) typedef struct { struct list_head list; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 100bae33c677..3da1779e8b79 100644 --- 
a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -110,10 +110,11 @@ out: return ret; } -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret; + struct inode *inode = d_inode(dentry); umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 45197b4f73bf..39bd36e6eeb7 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -6,7 +6,7 @@ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 905ea19df125..8bcad9940154 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5307,7 +5307,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } return err; @@ -11360,7 +11360,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, @@ -11413,7 +11413,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, @@ -11424,7 +11424,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, - .get_acl = btrfs_get_acl, + .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4fd6b61b06a4..7e348bd2ccde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4393,7 +4393,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; @@ -4492,7 +4492,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (args.len > args.unencoded_len - args.unencoded_offset) goto out_acct; - ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), + ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4bed0839b640..57d8c72737e1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -764,11 +764,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *ordered; if (start + len < start) { - orig_end = INT_LIMIT(loff_t); + orig_end = OFFSET_MAX; } else { 
orig_end = start + len - 1; - if (orig_end > INT_LIMIT(loff_t)) - orig_end = INT_LIMIT(loff_t); + if (orig_end > OFFSET_MAX) + orig_end = OFFSET_MAX; } /* start IO across the range first to instantiate any delalloc diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 000a28f46e59..175a25fcade8 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres, term_func, term_func_priv); } -/* - * Prepare a read operation, shortening it to a cached/uncached - * boundary as appropriate. - */ -static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) +static inline enum netfs_io_source +cachefiles_do_prepare_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t netfs_ino) { enum cachefiles_prepare_read_trace why; - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct cachefiles_object *object; + struct cachefiles_object *object = NULL; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; + size_t len = *_len; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; int rc; - _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + _enter("%zx @%llx/%llx", len, start, i_size); - if (subreq->start >= i_size) { + if (start >= i_size) { ret = NETFS_FILL_WITH_ZEROES; why = cachefiles_trace_read_after_eof; goto out_no_object; } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); why = cachefiles_trace_read_no_data; - if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) + if (!test_bit(NETFS_SREQ_ONDEMAND, _flags)) goto out_no_object; } @@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest * retry: off = cachefiles_inject_read_error(); if (off == 0) - off = vfs_llseek(file, subreq->start, SEEK_DATA); + off = vfs_llseek(file, start, SEEK_DATA); if (off < 0 && off >= (loff_t)-MAX_ERRNO) { if (off == (loff_t)-ENXIO) { why = cachefiles_trace_read_seek_nxio; @@ -449,21 +446,22 @@ retry: goto out; } - if (off >= subreq->start + subreq->len) { + if (off >= start + len) { why = cachefiles_trace_read_found_hole; goto download_and_store; } - if (off > subreq->start) { + if (off > start) { off = round_up(off, cache->bsize); - subreq->len = off - subreq->start; + len = off - start; + *_len = len; why = cachefiles_trace_read_found_part; goto download_and_store; } to = cachefiles_inject_read_error(); if (to == 0) - to = vfs_llseek(file, subreq->start, SEEK_HOLE); + to = vfs_llseek(file, start, SEEK_HOLE); if (to < 0 && to >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), to, cachefiles_trace_seek_error); @@ -471,12 +469,13 @@ retry: goto out; } - if (to < subreq->start + subreq->len) { - if (subreq->start + subreq->len >= i_size) + if (to < start + len) { + if (start + len >= i_size) to = round_up(to, cache->bsize); else to = round_down(to, cache->bsize); - subreq->len = to - subreq->start; + len = to - start; + *_len = len; } why = cachefiles_trace_read_have_data; @@ -484,12 +483,11 @@ retry: goto out; download_and_store: - __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - if 
(test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) { - rc = cachefiles_ondemand_read(object, subreq->start, - subreq->len); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags); + if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) { + rc = cachefiles_ondemand_read(object, start, len); if (!rc) { - __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags); + __clear_bit(NETFS_SREQ_ONDEMAND, _flags); goto retry; } ret = NETFS_INVALID_READ; @@ -497,11 +495,35 @@ download_and_store: out: cachefiles_end_secure(cache, saved_cred); out_no_object: - trace_cachefiles_prep_read(subreq, ret, why, ino); + trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino); return ret; } /* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + return cachefiles_do_prepare_read(&subreq->rreq->cache_resources, + subreq->start, &subreq->len, i_size, + &subreq->flags, subreq->rreq->inode->i_ino); +} + +/* + * Prepare an on-demand read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_io_source +cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, + loff_t start, size_t *_len, loff_t i_size, + unsigned long *_flags, ino_t ino) +{ + return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino); +} + +/* * Prepare for a write to occur. */ int __cachefiles_prepare_write(struct cachefiles_object *object, @@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { .write = cachefiles_write, .prepare_read = cachefiles_prepare_read, .prepare_write = cachefiles_prepare_write, + .prepare_ondemand_read = cachefiles_prepare_ondemand_read, .query_occupancy = cachefiles_query_occupancy, }; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index f4fc8e0b847c..c7e8dd5b58d4 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -85,13 +85,14 @@ retry: return acl; } -int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; + struct inode *inode = d_inode(dentry); struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dcf701b05cc1..61f47debec5a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); if (err == 0) err = -EFAULT; @@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e7e2ebac330d..6c7026cc8988 100644 --- 
a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = { .getattr = ceph_getattr, .setattr = ceph_setattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 04fd34557de8..6f9580defb2b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1161,7 +1161,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, len); iov_iter_advance(&i, rc); iov_iter_zero(zlen, &i); @@ -1400,7 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, READ, bvecs, num_pages, len); + iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bad9eeb6a1a5..f23c5a6edc6f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = { .setattr = ceph_setattr, .getattr = ceph_getattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32_max(nsplits); + i = get_random_u32_below(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode); return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 3e2843e86e27..f3b461c708a8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -364,7 +364,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *fcntl_count = 0; *flock_count = 0; - ctx = inode->i_flctx; + ctx = locks_inode_context(inode); if (ctx) { spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_posix, fl_list) @@ -418,7 +418,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); int err = 0; int seen_fcntl = 0; int seen_flock = 0; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 3fbabc98e1f7..7dac21ee6ce7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32_max(n); + n = get_random_u32_below(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 40630e6f691c..50e57a1fa32f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1117,7 +1117,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl 
*acl, int type); + struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fa480d62f313..c647f0d56518 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -13,6 +13,9 @@ #include <linux/string.h> #include <linux/keyctl.h> #include <linux/key-type.h> +#include <uapi/linux/posix_acl.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include <keys/user-type.h> #include "cifspdu.h" #include "cifsglob.h" @@ -20,6 +23,8 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "fs_context.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { @@ -1668,3 +1673,137 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, kfree(pntsd); return rc; } + +struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + struct posix_acl *acl = NULL; + ssize_t rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return ERR_CAST(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + acl = ERR_CAST(full_path); + goto out; + } + + /* return alt name if available as pseudo attr */ + switch (type) { + case ACL_TYPE_ACCESS: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + + if (rc < 0) { + if (rc == -EINVAL) + acl = ERR_PTR(-EOPNOTSUPP); + else + acl = ERR_PTR(rc); + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return acl; +#else + return ERR_PTR(-EOPNOTSUPP); +#endif +} + +int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ +#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) + int rc = -EOPNOTSUPP; + unsigned int xid; + struct super_block *sb = dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct tcon_link *tlink; + struct cifs_tcon *pTcon; + const char *full_path; + void *page; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + pTcon = tlink_tcon(tlink); + + xid = get_xid(); + page = alloc_dentry_path(); + + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto out; + } + + if (!acl) + goto out; + + /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ + + /* if proc/fs/cifs/streamstoxattr is set then + search server for EAs or streams to + returns as xattrs */ + if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) { + cifs_dbg(FYI, "size of EA value too large\n"); + rc = -EOPNOTSUPP; + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + if (sb->s_flags 
& SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_ACCESS, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + + case ACL_TYPE_DEFAULT: + if (sb->s_flags & SB_POSIXACL) + rc = cifs_do_set_acl(xid, pTcon, full_path, acl, + ACL_TYPE_DEFAULT, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + break; + } + +out: + free_dentry_path(page); + free_xid(xid); + cifs_put_tlink(tlink); + return rc; +#else + return -EOPNOTSUPP; +#endif +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 712a43161448..040267ed8a64 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1133,6 +1133,8 @@ const struct inode_operations cifs_dir_inode_ops = { .symlink = cifs_symlink, .mknod = cifs_mknod, .listxattr = cifs_listxattr, + .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const struct inode_operations cifs_file_inode_ops = { @@ -1141,6 +1143,8 @@ const struct inode_operations cifs_file_inode_ops = { .permission = cifs_permission, .listxattr = cifs_listxattr, .fiemap = cifs_fiemap, + .get_acl = cifs_get_acl, + .set_acl = cifs_set_acl, }; const char *cifs_get_link(struct dentry *dentry, struct inode *inode, diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabb..f50f96e4ec30 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -224,6 +224,10 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *, u32); extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *, u32); +extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type); +extern int cifs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, int type); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); @@ -537,14 +541,14 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, struct cifs_ntsd *, __u32, int); -extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap_special_chars); +extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, + struct posix_acl **acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); +extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, + const struct posix_acl *acl, const int acl_type, + const struct nls_table *nls_codepage, int remap); extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); #endif /* CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1724066c1536..23f10e0d6e7e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2914,32 +2914,57 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX -/*Convert an Access Control Entry from wire format to local POSIX 
xattr format*/ -static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, - struct cifs_posix_ace *cifs_ace) +#ifdef CONFIG_FS_POSIX_ACL +/** + * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format + * @ace: POSIX ACL entry to store converted ACL into + * @cifs_ace: ACL in cifs format + * + * Convert an Access Control Entry from wire format to local POSIX xattr + * format. + * + * Note that the @cifs_uid member is used to store both {g,u}id_t. + */ +static void cifs_init_posix_acl(struct posix_acl_entry *ace, + struct cifs_posix_ace *cifs_ace) { /* u8 cifs fields do not need le conversion */ - ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); - ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); - ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ + ace->e_perm = cifs_ace->cifs_e_perm; + ace->e_tag = cifs_ace->cifs_e_tag; + switch (ace->e_tag) { + case ACL_USER: + ace->e_uid = make_kuid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + case ACL_GROUP: + ace->e_gid = make_kgid(&init_user_ns, + le64_to_cpu(cifs_ace->cifs_uid)); + break; + } return; } -/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */ -static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, - const int acl_type, const int size_of_data_area) +/** + * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format + * @acl: ACLs returned in POSIX ACL format + * @src: ACLs in cifs format + * @acl_type: type of POSIX ACL requested + * @size_of_data_area: size of SMB we got + * + * This function converts ACLs from cifs format to POSIX ACL format. + * If @acl is NULL then the size of the buffer required to store POSIX ACLs in + * their uapi format is returned. + */ +static int cifs_to_posix_acl(struct posix_acl **acl, char *src, + const int acl_type, const int size_of_data_area) { int size = 0; - int i; __u16 count; struct cifs_posix_ace *pACE; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - struct posix_acl_xattr_header *local_acl = (void *)trgt; + struct posix_acl *kacl = NULL; + struct posix_acl_entry *pa, *pe; if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) return -EOPNOTSUPP; @@ -2959,7 +2984,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, count = le16_to_cpu(cifs_acl->access_entry_count); size = sizeof(struct cifs_posix_acl); size += sizeof(struct cifs_posix_ace) * count; -/* skip past access ACEs to get to default ACEs */ + /* skip past access ACEs to get to default ACEs */ pACE = &cifs_acl->ace_array[count]; count = le16_to_cpu(cifs_acl->default_entry_count); size += sizeof(struct cifs_posix_ace) * count; @@ -2971,62 +2996,75 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, return -EINVAL; } - size = posix_acl_xattr_size(count); - if ((buflen == 0) || (local_acl == NULL)) { - /* used to query ACL EA size */ - } else if (size > buflen) { - return -ERANGE; - } else /* buffer big enough */ { - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); - - local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - for (i = 0; i < count ; i++) { - cifs_convert_ace(&ace[i], pACE); - pACE++; - } + /* Allocate number of POSIX ACLs to store in VFS format. 
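
[Editor's sketch] The size/count walk in cifs_to_posix_acl() above relies on the wire layout: a header carrying two counts, then the access ACEs, then the default ACEs back to back, which is why the code steps to &cifs_acl->ace_array[count] to reach the defaults. A standalone model of that walk; struct and field names are illustrative stand-ins, not the kernel's:

#include <stdint.h>

struct wire_ace {			/* models struct cifs_posix_ace */
	uint8_t tag;
	uint8_t perm;
	uint64_t id;
};

struct wire_acl {			/* models struct cifs_posix_acl */
	uint16_t version;
	uint16_t access_count;
	uint16_t default_count;
	struct wire_ace ace_array[];	/* access ACEs first, then defaults */
};

/* Skip past the access ACEs to reach the default ACEs, as the hunk
 * above does with &cifs_acl->ace_array[count]. */
static const struct wire_ace *first_default_ace(const struct wire_acl *acl)
{
	return &acl->ace_array[acl->access_count];
}
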
*/ + kacl = posix_acl_alloc(count, GFP_NOFS); + if (!kacl) + return -ENOMEM; + + FOREACH_ACL_ENTRY(pa, kacl, pe) { + cifs_init_posix_acl(pa, pACE); + pACE++; } - return size; + + *acl = kacl; + return 0; } -static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const struct posix_acl_xattr_entry *local_ace) +/** + * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format + * @cifs_ace: the cifs ACL entry to store into + * @local_ace: the POSIX ACL entry to convert + */ +static void cifs_init_ace(struct cifs_posix_ace *cifs_ace, + const struct posix_acl_entry *local_ace) { - cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm); - cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag); - /* BB is there a better way to handle the large uid? */ - if (local_ace->e_id == cpu_to_le32(-1)) { - /* Probably no need to le convert -1 on any arch but can not hurt */ + cifs_ace->cifs_e_perm = local_ace->e_perm; + cifs_ace->cifs_e_tag = local_ace->e_tag; + + switch (local_ace->e_tag) { + case ACL_USER: + cifs_ace->cifs_uid = + cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid)); + break; + case ACL_GROUP: + cifs_ace->cifs_uid = + cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid)); + break; + default: cifs_ace->cifs_uid = cpu_to_le64(-1); - } else - cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); -/* - cifs_dbg(FYI, "perm %d tag %d id %d\n", - ace->e_perm, ace->e_tag, ace->e_id); -*/ + } } -/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */ -static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, - const int buflen, const int acl_type) +/** + * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format + * @parm_data: ACLs in cifs format to convert to + * @acl: ACLs in POSIX ACL format to convert from + * @acl_type: the type of POSIX ACLs stored in @acl + * + * Return: the number of cifs ACL entries after conversion + */ +static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl, + const int acl_type) { __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - struct posix_acl_xattr_header *local_acl = (void *)pACL; - struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); + const struct posix_acl_entry *pa, *pe; int count; - int i; + int i = 0; - if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL)) + if ((acl == NULL) || (cifs_acl == NULL)) return 0; - count = posix_acl_xattr_count((size_t)buflen); - cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", - count, buflen, le32_to_cpu(local_acl->a_version)); - if (le32_to_cpu(local_acl->a_version) != 2) { - cifs_dbg(FYI, "unknown POSIX ACL version %d\n", - le32_to_cpu(local_acl->a_version)); - return 0; - } + count = acl->a_count; + cifs_dbg(FYI, "setting acl with %d entries\n", count); + + /* + * Note that the uapi POSIX ACL version is verified by the VFS and is + * independent of the cifs ACL version. Changing the POSIX ACL version + * is a uapi change and if it's changed we will pass down the POSIX ACL + * version in struct posix_acl from the VFS. For now there's really + * only one that all filesystems know how to deal with. 
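
[Editor's sketch] The tag switches in cifs_init_posix_acl() and cifs_init_ace() above are the whole id-mapping story: only ACL_USER and ACL_GROUP entries carry an id, stored in the single little-endian cifs_uid field, and every other tag goes out on the wire as -1. A userspace model of both directions; the tag values and plain integer casts stand in for the kernel's make_kuid()/from_kuid() helpers:

#include <stdint.h>

enum { TAG_USER = 0x02, TAG_GROUP = 0x08 };	/* illustrative tag values */

struct wire_ace { uint8_t tag, perm; uint64_t id; };
struct acl_entry { uint8_t tag, perm; uint32_t uid, gid; };

static void wire_to_entry(struct acl_entry *e, const struct wire_ace *w)
{
	e->tag = w->tag;
	e->perm = w->perm;
	if (w->tag == TAG_USER)
		e->uid = (uint32_t)w->id;	/* make_kuid() in the kernel */
	else if (w->tag == TAG_GROUP)
		e->gid = (uint32_t)w->id;	/* make_kgid() in the kernel */
}

static void entry_to_wire(struct wire_ace *w, const struct acl_entry *e)
{
	w->tag = e->tag;
	w->perm = e->perm;
	if (e->tag == TAG_USER)
		w->id = e->uid;			/* from_kuid() in the kernel */
	else if (e->tag == TAG_GROUP)
		w->id = e->gid;			/* from_kgid() in the kernel */
	else
		w->id = (uint64_t)-1;		/* owner/mask/other: no id */
}
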
+ */ cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); @@ -3038,8 +3076,9 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; } - for (i = 0; i < count; i++) - convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); + FOREACH_ACL_ENTRY(pa, acl, pe) { + cifs_init_ace(&cifs_acl->ace_array[i++], pa); + } if (rc == 0) { rc = (__u16)(count * sizeof(struct cifs_posix_ace)); rc += sizeof(struct cifs_posix_acl); @@ -3048,11 +3087,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return rc; } -int -CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *acl_inf, const int buflen, const int acl_type, - const struct nls_table *nls_codepage, int remap) +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) { /* SMB_QUERY_POSIX_ACL */ TRANSACTION2_QPI_REQ *pSMB = NULL; @@ -3124,23 +3162,26 @@ queryAclRetry: else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); __u16 count = le16_to_cpu(pSMBr->t2.DataCount); - rc = cifs_copy_posix_acl(acl_inf, + rc = cifs_to_posix_acl(acl, (char *)&pSMBr->hdr.Protocol+data_offset, - buflen, acl_type, count); + acl_type, count); } } cifs_buf_release(pSMB); + /* + * The else branch after SendReceive() doesn't return EAGAIN so if we + * allocated @acl in cifs_to_posix_acl() we are guaranteed to return + * here and don't leak POSIX ACLs. + */ if (rc == -EAGAIN) goto queryAclRetry; return rc; } -int -CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *fileName, - const char *local_acl, const int buflen, - const int acl_type, - const struct nls_table *nls_codepage, int remap) +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) { struct smb_com_transaction2_spi_req *pSMB = NULL; struct smb_com_transaction2_spi_rsp *pSMBr = NULL; @@ -3181,7 +3222,7 @@ setAclRetry: pSMB->ParameterOffset = cpu_to_le16(param_offset); /* convert to on the wire format for POSIX ACL */ - data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type); + data_count = posix_acl_to_cifs(parm_data, acl, acl_type); if (data_count == 0) { rc = -EOPNOTSUPP; @@ -3211,6 +3252,23 @@ setACLerrorExit: goto setAclRetry; return rc; } +#else +int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *searchName, struct posix_acl **acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} + +int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *fileName, const struct posix_acl *acl, + const int acl_type, const struct nls_table *nls_codepage, + int remap) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_FS_POSIX_ACL */ int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9db9527c61cf..e80252a83225 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -759,7 +759,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 
1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -774,7 +774,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) * and cifs_readv_from_socket sets msg_control and msg_controllen * so little to initialize in struct msghdr */ - iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -786,7 +786,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg = {}; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd9698209930..87b56b1ae117 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1413,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; @@ -3532,7 +3532,7 @@ static ssize_t __cifs_writev( ctx->iter = *from; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, from, WRITE); + rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; @@ -4276,7 +4276,7 @@ static ssize_t __cifs_readv( ctx->iter = *to; ctx->len = len; } else { - rc = setup_aio_ctx_iter(ctx, to, READ); + rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); if (rc) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a1751b956318..f6f3a6b75601 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f..32b3877b538a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4723,13 +4723,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and 
pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 575fa8f58342..3851d0aaa288 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto unmask; @@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); + iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE, + iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 998fa51f9b68..5f2fb2fd2e37 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } break; } - -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (!value) - goto out; - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, - value, (const int)size, - ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } out: @@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler, } break; } -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case XATTR_ACL_ACCESS: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_ACCESS, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; - - case XATTR_ACL_DEFAULT: -#ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & SB_POSIXACL) - rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, - value, size, ACL_TYPE_DEFAULT, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); -#endif /* CONFIG_CIFS_POSIX */ - break; -#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ } /* We could add an additional check for streams ie @@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = { .set = cifs_xattr_set, }; - -static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = XATTR_ACL_ACCESS, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - -static const struct xattr_handler cifs_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = XATTR_ACL_DEFAULT, - .get = cifs_xattr_get, - .set = cifs_xattr_set, -}; - const struct xattr_handler *cifs_xattr_handlers[] = { &cifs_user_xattr_handler, &cifs_os2_xattr_handler, @@ -549,7 +487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &smb3_ntsd_xattr_handler, /* alias for above since 
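
[Editor's note] The READ/WRITE to ITER_DEST/ITER_SOURCE substitutions running through connect.c, file.c, fscache.c, smb2ops.c and transport.c above belong to the tree-wide iov_iter direction rename: the constant now names what the iterator's buffers are (a destination that received data lands in, or a source that outgoing data is taken from) instead of reusing the syscall-flavoured READ/WRITE. A tiny standalone model of why the explicit direction reads better at call sites; names are illustrative:

#include <assert.h>

enum iter_dir { DIR_DEST, DIR_SOURCE };	/* models ITER_DEST / ITER_SOURCE */

struct iter_model { enum iter_dir dir; };

/* A receive path may only fill a destination iterator; with the old
 * READ/WRITE names this invariant read backwards at the call site. */
static void recv_into(struct iter_model *it)
{
	assert(it->dir == DIR_DEST);
	/* ... copy received bytes into the iterator's buffers ... */
}
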
avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler, &smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */ - &cifs_posix_acl_access_xattr_handler, - &cifs_posix_acl_default_xattr_handler, +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif NULL }; diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..a4c30bb900fe 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -525,7 +529,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, - .regs = signal_pt_regs(), .limit = rlimit(RLIMIT_CORE), /* * We must use the same mm->flags while dumping core to avoid @@ -534,6 +537,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); @@ -716,8 +720,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) * filesystem. */ mnt_userns = file_mnt_user_ns(cprm.file); - if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), - current_fsuid())) { + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", cn.corename); goto close_fail; @@ -853,7 +857,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) if (dump_interrupted()) return 0; pos = file->f_pos; - iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d5f68a0c5d15..316a778cec0f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -439,13 +439,7 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* - * Back-pointer to the super_block of the filesystem to which this - * master key has been added. Only valid if ->mk_active_refs > 0. - */ - struct super_block *mk_sb; - - /* - * Link in ->mk_sb->s_master_keys->key_hashtable. + * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; @@ -456,7 +450,7 @@ struct fscrypt_master_key { /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring - * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g. + * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. 
* @@ -569,7 +563,8 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) void fscrypt_put_master_key(struct fscrypt_master_key *mk); -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 2a24b1f0ae68..78dd2ff306bd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk) { - struct super_block *sb = mk->mk_sb; - struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) @@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) * destroying any subkeys embedded in it. */ - spin_lock(&keyring->lock); + spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); - spin_unlock(&keyring->lock); + spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_secret is not present and @@ -243,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb) WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); } } kfree_sensitive(keyring); @@ -424,7 +423,6 @@ static int add_new_master_key(struct super_block *sb, if (!mk) return -ENOMEM; - mk->mk_sb = sb; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; @@ -1068,7 +1066,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index f7407071a952..94757ccd3056 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = { .security_strength = 16, .ivsize = 16, }, + [FSCRYPT_MODE_SM4_XTS] = { + .friendly_name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .security_strength = 16, + .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, + }, + [FSCRYPT_MODE_SM4_CTS] = { + .friendly_name = "SM4-CTS-CBC", + .cipher_str = "cts(cbc(sm4))", + .keysize = 16, + .security_strength = 16, + .ivsize = 16, + }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", @@ -509,7 +524,7 @@ static void put_crypt_info(struct fscrypt_info *ci) spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 46757c3052ef..893661b52376 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct 
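
[Editor's sketch] For reference, the two SM4 entries added to fscrypt_modes[] above pair a double-width XTS key for file contents with a single-width CTS-CBC key for filenames. A standalone restatement of just those rows; the struct shape mirrors the hunk but this is a summary, not the kernel table:

#include <stdint.h>

struct mode_row {
	const char *friendly_name;
	const char *cipher_str;
	uint8_t keysize;		/* bytes */
	uint8_t security_strength;	/* bytes */
	uint8_t ivsize;			/* bytes */
};

static const struct mode_row sm4_rows[] = {
	/* XTS carries two 128-bit keys, hence keysize 32 for 16 bytes of
	 * security strength; the filenames mode uses one 128-bit key. */
	{ "SM4-XTS",	 "xts(sm4)",	  32, 16, 16 },
	{ "SM4-CTS-CBC", "cts(cbc(sm4))", 16, 16, 16 },
};
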
super_block *sb) return sb->s_cop->get_dummy_policy(sb); } +/* + * Return %true if the given combination of encryption modes is supported for v1 + * (and later) encryption policies. + * + * Do *not* add anything new here, since v1 encryption policies are deprecated. + * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. + */ static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && @@ -83,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) return true; + + if (contents_mode == FSCRYPT_MODE_SM4_XTS && + filenames_mode == FSCRYPT_MODE_SM4_CTS) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ddb3fc258df9..b54f470e0d03 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index d60a8d8f109d..26fef9945cc9 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -12,55 +12,67 @@ #include <trace/events/dlm.h> #include "dlm_internal.h" +#include "memory.h" #include "lock.h" #include "user.h" #include "ast.h" -static uint64_t dlm_cb_seq; -static DEFINE_SPINLOCK(dlm_cb_seq_spin); +void dlm_release_callback(struct kref *ref) +{ + struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref); + + dlm_free_cb(cb); +} + +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct 
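
[Editor's sketch] The debugfs_attr_write_xsigned() split above exists because atomic_t attributes are signed: the plain simple_attr_write() parses input as an unsigned number and so refuses values like "-1", which DEFINE_DEBUGFS_ATTRIBUTE_SIGNED now accepts for the fops_atomic_t files. A standalone model of the difference, with kstrto*-style behaviour approximated using strtoll():

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

/* Model: the unsigned variant refuses a leading '-', as the kernel's
 * unsigned string-to-int parsers do; the signed variant accepts it. */
static int parse_attr(const char *s, long long *val, bool is_signed)
{
	char *end;

	if (!is_signed && s[0] == '-')
		return -EINVAL;
	errno = 0;
	*val = strtoll(s, &end, 0);
	if (errno || end == s || *end != '\0')
		return -EINVAL;
	return 0;
}

In this model parse_attr("-1", &v, true) succeeds while the is_signed == false path fails, which is the write behaviour the signed debugfs helpers restore.
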
dlm_callback *to) +{ + if (*from) + kref_put(&(*from)->ref, dlm_release_callback); + + if (to) + kref_get(&to->ref); + + *from = to; +} -static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb) +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) { - int i; - - log_print("last_bast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_bast.seq, - lkb->lkb_last_bast.flags, - lkb->lkb_last_bast.mode, - lkb->lkb_last_bast.sb_status, - lkb->lkb_last_bast.sb_flags); - - log_print("last_cast %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.flags, - lkb->lkb_last_cast.mode, - lkb->lkb_last_cast.sb_status, - lkb->lkb_last_cast.sb_flags); - - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - log_print("cb %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, - (unsigned long long)lkb->lkb_callbacks[i].seq, - lkb->lkb_callbacks[i].flags, - lkb->lkb_callbacks[i].mode, - lkb->lkb_callbacks[i].sb_status, - lkb->lkb_callbacks[i].sb_flags); + struct dlm_callback *cb, *safe; + + list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { + list_del(&cb->list); + kref_put(&cb->ref, dlm_release_callback); } + + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + + /* invalidate */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + lkb->lkb_last_bast_mode = -1; } -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t prev_seq; + int rv = DLM_ENQUEUE_CALLBACK_SUCCESS; + struct dlm_callback *cb; int prev_mode; - int i, rv; - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (lkb->lkb_callbacks[i].seq) - continue; + if (flags & DLM_CB_BAST) { + /* if cb is a bast, it should be skipped if the blocking mode is + * compatible with the last granted mode + */ + if (lkb->lkb_last_cast) { + if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) { + log_debug(ls, "skip %x bast mode %d for cast mode %d", + lkb->lkb_id, mode, + lkb->lkb_last_cast->mode); + goto out; + } + } /* * Suppress some redundant basts here, do more on removal. @@ -68,148 +80,95 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, * is a bast for the same mode or a more restrictive mode. 
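
[Editor's sketch] dlm_callback_set_last_ptr() above is a small reference-swap helper: drop the reference held through *from, take one on to, then retarget the pointer. A standalone model with a plain counter in place of struct kref; note the model takes the new reference before the put, which additionally stays safe if *from == to (in this diff the callers only ever pass a freshly allocated cb or NULL):

#include <stdlib.h>

struct ref_obj {
	int refs;	/* stands in for struct kref */
};

static void obj_get(struct ref_obj *o)
{
	o->refs++;
}

static void obj_put(struct ref_obj *o)
{
	if (o && --o->refs == 0)
		free(o);
}

/* Model of dlm_callback_set_last_ptr(): retarget a "last seen" pointer
 * while keeping the reference counts balanced. */
static void set_last_ptr(struct ref_obj **from, struct ref_obj *to)
{
	if (to)
		obj_get(to);
	if (*from)
		obj_put(*from);
	*from = to;
}
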
* (the additional > PR check is needed for PR/CW inversion) */ - - if ((i > 0) && (flags & DLM_CB_BAST) && - (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) { - - prev_seq = lkb->lkb_callbacks[i-1].seq; - prev_mode = lkb->lkb_callbacks[i-1].mode; + if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) { + prev_mode = lkb->lkb_last_cb->mode; if ((prev_mode == mode) || (prev_mode > mode && prev_mode > DLM_LOCK_PR)) { - - log_debug(ls, "skip %x add bast %llu mode %d " - "for bast %llu mode %d", - lkb->lkb_id, - (unsigned long long)seq, - mode, - (unsigned long long)prev_seq, - prev_mode); - rv = 0; + log_debug(ls, "skip %x add bast mode %d for bast mode %d", + lkb->lkb_id, mode, prev_mode); goto out; } } - - lkb->lkb_callbacks[i].seq = seq; - lkb->lkb_callbacks[i].flags = flags; - lkb->lkb_callbacks[i].mode = mode; - lkb->lkb_callbacks[i].sb_status = status; - lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF); - rv = 0; - break; } - if (i == DLM_CALLBACKS_SIZE) { - log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x", - lkb->lkb_id, (unsigned long long)seq, - flags, mode, status, sbflags); - dlm_dump_lkb_callbacks(lkb); - rv = -1; + cb = dlm_allocate_cb(); + if (!cb) { + rv = DLM_ENQUEUE_CALLBACK_FAILURE; goto out; } - out: - return rv; -} - -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid) -{ - int i, rv; - - *resid = 0; - - if (!lkb->lkb_callbacks[0].seq) { - rv = -ENOENT; - goto out; - } - - /* oldest undelivered cb is callbacks[0] */ - - memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback)); - /* shift others down */ - - for (i = 1; i < DLM_CALLBACKS_SIZE; i++) { - if (!lkb->lkb_callbacks[i].seq) - break; - memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i], - sizeof(struct dlm_callback)); - memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback)); - (*resid)++; + cb->flags = flags; + cb->mode = mode; + cb->sb_status = status; + cb->sb_flags = (sbflags & 0x000000FF); + kref_init(&cb->ref); + if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) { + lkb->lkb_flags |= DLM_IFL_CB_PENDING; + rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; } + list_add_tail(&cb->list, &lkb->lkb_callbacks); - /* if cb is a bast, it should be skipped if the blocking mode is - compatible with the last granted mode */ - - if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) { - if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) { - cb->flags |= DLM_CB_SKIP; - - log_debug(ls, "skip %x bast %llu mode %d " - "for cast %llu mode %d", - lkb->lkb_id, - (unsigned long long)cb->seq, - cb->mode, - (unsigned long long)lkb->lkb_last_cast.seq, - lkb->lkb_last_cast.mode); - rv = 0; - goto out; - } - } + if (flags & DLM_CB_CAST) + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb); - if (cb->flags & DLM_CB_CAST) { - memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_cast_time = ktime_get(); - } + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb); - if (cb->flags & DLM_CB_BAST) { - memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback)); - lkb->lkb_last_bast_time = ktime_get(); - } - rv = 0; out: return rv; } +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb) +{ + /* oldest undelivered cb is the first entry on the callbacks list */ + *cb = list_first_entry_or_null(&lkb->lkb_callbacks, + struct dlm_callback, list); + if (!*cb) + return DLM_DEQUEUE_CALLBACK_EMPTY; + + /* remove it from the callbacks list */ + 
list_del(&(*cb)->list); + if (list_empty(&lkb->lkb_callbacks)) + return DLM_DEQUEUE_CALLBACK_LAST; + + return DLM_DEQUEUE_CALLBACK_SUCCESS; +} + void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - uint64_t new_seq, prev_seq; int rv; - spin_lock(&dlm_cb_seq_spin); - new_seq = ++dlm_cb_seq; - if (!dlm_cb_seq) - new_seq = ++dlm_cb_seq; - spin_unlock(&dlm_cb_seq_spin); - if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq); + dlm_user_add_ast(lkb, flags, mode, status, sbflags); return; } - mutex_lock(&lkb->lkb_cb_mutex); - prev_seq = lkb->lkb_callbacks[0].seq; - - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq); - if (rv < 0) - goto out; - - if (!prev_seq) { + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); } else { queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); + break; + case DLM_ENQUEUE_CALLBACK_FAILURE: + WARN_ON_ONCE(1); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; } - out: - mutex_unlock(&lkb->lkb_cb_mutex); + spin_unlock(&lkb->lkb_cb_lock); } void dlm_callback_work(struct work_struct *work) @@ -218,53 +177,46 @@ void dlm_callback_work(struct work_struct *work) struct dlm_ls *ls = lkb->lkb_resource->res_ls; void (*castfn) (void *astparam); void (*bastfn) (void *astparam, int mode); - struct dlm_callback callbacks[DLM_CALLBACKS_SIZE]; - int i, rv, resid; - - memset(&callbacks, 0, sizeof(callbacks)); + struct dlm_callback *cb; + int rv; - mutex_lock(&lkb->lkb_cb_mutex); - if (!lkb->lkb_callbacks[0].seq) { - /* no callback work exists, shouldn't happen */ - log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + spin_unlock(&lkb->lkb_cb_lock); - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid); - if (rv < 0) - break; - } + if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + goto out; - if (resid) { - /* cbs remain, loop should have removed all, shouldn't happen */ - log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id, - resid); - dlm_print_lkb(lkb); - dlm_dump_lkb_callbacks(lkb); - } - mutex_unlock(&lkb->lkb_cb_mutex); + for (;;) { + castfn = lkb->lkb_astfn; + bastfn = lkb->lkb_bastfn; + + if (cb->flags & DLM_CB_BAST) { + trace_dlm_bast(ls, lkb, cb->mode); + lkb->lkb_last_bast_time = ktime_get(); + lkb->lkb_last_bast_mode = cb->mode; + bastfn(lkb->lkb_astparam, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; + trace_dlm_ast(ls, lkb); + lkb->lkb_last_cast_time = ktime_get(); + castfn(lkb->lkb_astparam); + } - castfn = lkb->lkb_astfn; - bastfn = lkb->lkb_bastfn; + kref_put(&cb->ref, dlm_release_callback); - for (i = 0; i < DLM_CALLBACKS_SIZE; i++) { - if (!callbacks[i].seq) + spin_lock(&lkb->lkb_cb_lock); + rv = dlm_dequeue_lkb_callback(lkb, &cb); + if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + 
spin_unlock(&lkb->lkb_cb_lock); break; - if (callbacks[i].flags & DLM_CB_SKIP) { - continue; - } else if (callbacks[i].flags & DLM_CB_BAST) { - trace_dlm_bast(ls, lkb, callbacks[i].mode); - bastfn(lkb->lkb_astparam, callbacks[i].mode); - } else if (callbacks[i].flags & DLM_CB_CAST) { - lkb->lkb_lksb->sb_status = callbacks[i].sb_status; - lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; - trace_dlm_ast(ls, lkb); - castfn(lkb->lkb_astparam); } + spin_unlock(&lkb->lkb_cb_lock); } +out: /* undo kref_get from dlm_add_callback, may cause lkb to be freed */ dlm_put_lkb(lkb); } @@ -289,9 +241,9 @@ void dlm_callback_stop(struct dlm_ls *ls) void dlm_callback_suspend(struct dlm_ls *ls) { if (ls->ls_callback_wq) { - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); set_bit(LSFL_CB_DELAY, &ls->ls_flags); - mutex_unlock(&ls->ls_cb_mutex); + spin_unlock(&ls->ls_cb_lock); flush_workqueue(ls->ls_callback_wq); } @@ -308,10 +260,8 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; - clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - more: - mutex_lock(&ls->ls_cb_mutex); + spin_lock(&ls->ls_cb_lock); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); @@ -320,7 +270,9 @@ more: break; } empty = list_empty(&ls->ls_cb_delay); - mutex_unlock(&ls->ls_cb_mutex); + if (empty) + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + spin_unlock(&ls->ls_cb_lock); sum += count; if (!empty) { diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index e5e05fcc5813..880b11882495 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -11,13 +11,22 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); -int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_callback *cb, int *resid); +#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1 +#define DLM_ENQUEUE_CALLBACK_SUCCESS 0 +#define DLM_ENQUEUE_CALLBACK_FAILURE -1 +int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, + int status, uint32_t sbflags); +#define DLM_DEQUEUE_CALLBACK_EMPTY 2 +#define DLM_DEQUEUE_CALLBACK_LAST 1 +#define DLM_DEQUEUE_CALLBACK_SUCCESS 0 +int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb); void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags); +void dlm_callback_set_last_ptr(struct dlm_callback **from, + struct dlm_callback *to); +void dlm_release_callback(struct kref *ref); +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); void dlm_callback_work(struct work_struct *work); int dlm_callback_start(struct dlm_ls *ls); void dlm_callback_stop(struct dlm_ls *ls); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index ac8b62106ce0..20b60709eccf 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) return -EINVAL; } - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; @@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x) if (!x) return -EINVAL; - if (dlm_allow_conn) + if (dlm_lowcomms_is_running()) return -EBUSY; return 0; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fb04ebbafb5..8a0e1b1f74ad 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -246,7 +246,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - 
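
[Editor's sketch] The rewritten dlm_callback_work() above is the classic single-consumer drain: pop one entry under the spinlock, deliver the callback with the lock dropped, and clear DLM_IFL_CB_PENDING in the same critical section that observes the list empty, so a concurrent enqueue either sees the flag set and skips rescheduling or sees it cleared and requeues the work. A standalone pthread model of that shape:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct cb_node {
	struct cb_node *next;
	void (*fn)(struct cb_node *);
};

struct cb_queue {
	pthread_mutex_t lock;
	struct cb_node *head;
	bool pending;	/* models DLM_IFL_CB_PENDING */
};

/* Model of dlm_callback_work(): never hold the lock across user code,
 * and re-arm the queue atomically with seeing it empty. */
static void drain(struct cb_queue *q)
{
	struct cb_node *n;

	for (;;) {
		pthread_mutex_lock(&q->lock);
		n = q->head;
		if (!n) {
			q->pending = false;	/* next enqueue reschedules */
			pthread_mutex_unlock(&q->lock);
			return;
		}
		q->head = n->next;
		pthread_mutex_unlock(&q->lock);
		n->fn(n);	/* cast/bast delivery happens here */
	}
}
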
lkb->lkb_last_bast.mode, + lkb->lkb_last_bast_mode, rsb_lookup, lkb->lkb_wait_type, lkb->lkb_lvbseq, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index e34c3d2639a5..ab1a55337a6e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -211,6 +211,7 @@ struct dlm_args { #endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +#define DLM_IFL_CB_PENDING 0x04000000 /* least significant 2 bytes are message changed, they are full transmitted * but at receive side only the 2 bytes LSB will be set. * @@ -222,18 +223,17 @@ struct dlm_args { #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 -#define DLM_CALLBACKS_SIZE 6 - #define DLM_CB_CAST 0x00000001 #define DLM_CB_BAST 0x00000002 -#define DLM_CB_SKIP 0x00000004 struct dlm_callback { - uint64_t seq; uint32_t flags; /* DLM_CBF_ */ int sb_status; /* copy to lksb status */ uint8_t sb_flags; /* copy to lksb flags */ int8_t mode; /* rq mode of bast, gr mode of cast */ + + struct list_head list; + struct kref ref; }; struct dlm_lkb { @@ -268,12 +268,13 @@ struct dlm_lkb { unsigned long lkb_timeout_cs; #endif - struct mutex lkb_cb_mutex; + spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ - struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; - struct dlm_callback lkb_last_cast; - struct dlm_callback lkb_last_bast; + struct list_head lkb_callbacks; + struct dlm_callback *lkb_last_cast; + struct dlm_callback *lkb_last_cb; + int lkb_last_bast_mode; ktime_t lkb_last_cast_time; /* for debugging */ ktime_t lkb_last_bast_time; /* for debugging */ @@ -591,11 +592,7 @@ struct dlm_ls { int ls_new_rsb_count; struct list_head ls_new_rsb; /* new rsb structs */ - spinlock_t ls_remove_spin; - wait_queue_head_t ls_remove_wait; - char ls_remove_name[DLM_RESNAME_MAXLEN+1]; char *ls_remove_names[DLM_REMOVE_NAMES_MAX]; - int ls_remove_len; int ls_remove_lens[DLM_REMOVE_NAMES_MAX]; struct list_head ls_nodes; /* current nodes in ls */ @@ -631,7 +628,7 @@ struct dlm_ls { /* recovery related */ - struct mutex ls_cb_mutex; + spinlock_t ls_cb_lock; struct list_head ls_cb_delay; /* save for queue_work later */ struct timer_list ls_timer; struct task_struct *ls_recoverd_task; @@ -670,7 +667,7 @@ struct dlm_ls { void *ls_ops_arg; int ls_namelen; - char ls_name[1]; + char ls_name[DLM_LOCKSPACE_LEN + 1]; }; /* diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 94a72ede5764..e1adfa5aed05 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1209,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, if (!lkb) return -ENOMEM; + lkb->lkb_last_bast_mode = -1; lkb->lkb_nodeid = -1; lkb->lkb_grmode = DLM_LOCK_IV; kref_init(&lkb->lkb_ref); @@ -1218,7 +1219,8 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, INIT_LIST_HEAD(&lkb->lkb_time_list); #endif INIT_LIST_HEAD(&lkb->lkb_cb_list); - mutex_init(&lkb->lkb_cb_mutex); + INIT_LIST_HEAD(&lkb->lkb_callbacks); + spin_lock_init(&lkb->lkb_cb_lock); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); idr_preload(GFP_NOFS); @@ -1587,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -/* If there's an rsb for the same resource being removed, ensure - * that the remove message is sent before the new lookup message. 
- */ - -#define DLM_WAIT_PENDING_COND(ls, r) \ - (ls->ls_remove_len && \ - !rsb_cmp(r, ls->ls_remove_name, \ - ls->ls_remove_len)) - -static void wait_pending_remove(struct dlm_rsb *r) -{ - struct dlm_ls *ls = r->res_ls; - restart: - spin_lock(&ls->ls_remove_spin); - if (DLM_WAIT_PENDING_COND(ls, r)) { - log_debug(ls, "delay lookup for remove dir %d %s", - r->res_dir_nodeid, r->res_name); - spin_unlock(&ls->ls_remove_spin); - wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r)); - goto restart; - } - spin_unlock(&ls->ls_remove_spin); -} - -/* - * ls_remove_spin protects ls_remove_name and ls_remove_len which are - * read by other threads in wait_pending_remove. ls_remove_names - * and ls_remove_lens are only used by the scan thread, so they do - * not need protection. - */ - static void shrink_bucket(struct dlm_ls *ls, int b) { struct rb_node *n, *next; @@ -1699,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b) * list and sending the removal. Keeping this gap small is * important to keep us (the master node) from being out of sync * with the remote dir node for very long. - * - * From the time the rsb is removed from toss until just after - * send_remove, the rsb name is saved in ls_remove_name. A new - * lookup checks this to ensure that a new lookup message for the - * same resource name is not sent just before the remove message. */ for (i = 0; i < remote_count; i++) { @@ -1750,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) } rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); - - /* block lookup of same name until we've sent remove */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); - send_remove(r); - - /* allow lookup of name again */ - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); + spin_unlock(&ls->ls_rsbtbl[b].lock); dlm_free_rsb(r); } @@ -2716,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 0; } - wait_pending_remove(r); - r->res_first_lkid = lkb->lkb_id; send_lookup(r, lkb); return 1; @@ -3552,7 +3502,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace, static int _create_message(struct dlm_ls *ls, int mb_len, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { struct dlm_message *ms; struct dlm_mhandle *mh; @@ -3562,7 +3513,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len, pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb); if (!mh) return -ENOBUFS; @@ -3584,7 +3535,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len, static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, int to_nodeid, int mstype, struct dlm_message **ms_ret, - struct dlm_mhandle **mh_ret) + struct dlm_mhandle **mh_ret, + gfp_t allocation) { int mb_len = sizeof(struct dlm_message); @@ -3605,15 +3557,16 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, } return _create_message(r->res_ls, mb_len, to_nodeid, mstype, - ms_ret, mh_ret); + ms_ret, mh_ret, allocation); } /* further lowcomms enhancements or alternate implementations may make the return value from this 
function useful at some point */ -static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms, + const void *name, int namelen) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, name, namelen); return 0; } @@ -3673,13 +3626,13 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) if (error) return error; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto fail; send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3734,7 +3687,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh); + error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3742,7 +3696,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb) ms->m_result = 0; - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3755,7 +3709,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) to_nodeid = lkb->lkb_nodeid; - error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh, + GFP_NOFS); if (error) goto out; @@ -3763,7 +3718,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode) ms->m_bastmode = cpu_to_le32(mode); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3780,13 +3735,14 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) if (error) return error; - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh, + GFP_NOFS); if (error) goto fail; send_args(r, lkb, ms); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); if (error) goto fail; return 0; @@ -3804,14 +3760,15 @@ static int send_remove(struct dlm_rsb *r) to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh); + error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, + GFP_ATOMIC); if (error) goto out; memcpy(ms->m_extra, r->res_name, r->res_length); ms->m_hash = cpu_to_le32(r->res_hash); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3825,7 +3782,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, to_nodeid = lkb->lkb_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); + error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS); if (error) goto out; @@ -3833,7 +3790,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); - error = send_message(mh, ms); + error = send_message(mh, ms, r->res_name, r->res_length); out: return error; } @@ -3866,7 +3823,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); - error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh); + error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh, + 
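
[Editor's sketch] The new gfp_t argument threaded through _create_message()/create_message() in these hunks lets each sender pick its allocation context: most message types keep the sleeping GFP_NOFS, while send_remove() passes GFP_ATOMIC so its message can be built from a context that must not sleep. A standalone model of threading such a caller-stated constraint through one constructor; the enum and names are illustrative:

#include <stdlib.h>
#include <string.h>

enum alloc_ctx { CTX_MAY_SLEEP, CTX_NO_SLEEP };	/* models GFP_NOFS/GFP_ATOMIC */

struct message {
	int type;
	char extra[64];
};

/* One constructor; the caller states its context instead of the
 * constructor guessing. In the kernel the gfp_t changes allocator
 * behaviour; the userspace stand-in only carries the constraint. */
static struct message *create_msg(int type, enum alloc_ctx ctx)
{
	struct message *m = malloc(sizeof(*m));

	(void)ctx;
	if (m) {
		memset(m, 0, sizeof(*m));
		m->type = type;
	}
	return m;
}
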
GFP_NOFS); if (error) goto out; @@ -3874,7 +3832,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, ms->m_result = cpu_to_le32(to_dlm_errno(rv)); ms->m_nodeid = cpu_to_le32(ret_nodeid); - error = send_message(mh, ms); + error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in)); out: return error; } @@ -4044,66 +4002,6 @@ out: return error; } -static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) -{ - char name[DLM_RESNAME_MAXLEN + 1]; - struct dlm_message *ms; - struct dlm_mhandle *mh; - struct dlm_rsb *r; - uint32_t hash, b; - int rv, dir_nodeid; - - memset(name, 0, sizeof(name)); - memcpy(name, ms_name, len); - - hash = jhash(name, len, 0); - b = hash & (ls->ls_rsbtbl_size - 1); - - dir_nodeid = dlm_hash2nodeid(ls, hash); - - log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); - - spin_lock(&ls->ls_rsbtbl[b].lock); - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on keep %s", name); - return; - } - - rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); - if (!rv) { - spin_unlock(&ls->ls_rsbtbl[b].lock); - log_error(ls, "repeat_remove on toss %s", name); - return; - } - - /* use ls->remove_name2 to avoid conflict with shrink? */ - - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = len; - memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - spin_unlock(&ls->ls_rsbtbl[b].lock); - - rv = _create_message(ls, sizeof(struct dlm_message) + len, - dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); - if (rv) - goto out; - - memcpy(ms->m_extra, name, len); - ms->m_hash = cpu_to_le32(hash); - - send_message(mh, ms); - -out: - spin_lock(&ls->ls_remove_spin); - ls->ls_remove_len = 0; - memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); - spin_unlock(&ls->ls_remove_spin); - wake_up(&ls->ls_remove_wait); -} - static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -4173,25 +4071,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) ENOTBLK request failures when the lookup reply designating us as master is delayed. */ - /* We could repeatedly return -EBADR here if our send_remove() is - delayed in being sent/arriving/being processed on the dir node. - Another node would repeatedly lookup up the master, and the dir - node would continue returning our nodeid until our send_remove - took effect. - - We send another remove message in case our previous send_remove - was lost/ignored/missed somehow. 
*/ - if (error != -ENOTBLK) { log_limit(ls, "receive_request %x from %d %d", le32_to_cpu(ms->m_lkid), from_nodeid, error); } - if (namelen && error == -EBADR) { - send_repeat_remove(ls, ms->m_extra, namelen); - msleep(1000); - } - setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); return error; @@ -6294,8 +6178,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -6336,8 +6219,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_lock(&proc->asts_spin); list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) { - memset(&lkb->lkb_callbacks, 0, - sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE); + dlm_purge_lkb_callbacks(lkb); list_del_init(&lkb->lkb_cb_list); dlm_put_lkb(lkb); } @@ -6368,13 +6250,13 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid) int error; error = _create_message(ls, sizeof(struct dlm_message), nodeid, - DLM_MSG_PURGE, &ms, &mh); + DLM_MSG_PURGE, &ms, &mh, GFP_NOFS); if (error) return error; ms->m_nodeid = cpu_to_le32(nodeid); ms->m_pid = cpu_to_le32(pid); - return send_message(mh, ms); + return send_message(mh, ms, NULL, 0); } int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index bae050df7abf..d0b4e2181a5f 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -17,7 +17,6 @@ #include "recoverd.h" #include "dir.h" #include "midcomms.h" -#include "lowcomms.h" #include "config.h" #include "memory.h" #include "lock.h" @@ -391,7 +390,7 @@ static int threads_start(void) /* Thread for sending/receiving messages for all lockspace's */ error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm lowcomms %d", error); + log_print("cannot start dlm midcomms %d", error); goto scand_fail; } @@ -473,7 +472,7 @@ static int new_lockspace(const char *name, const char *cluster, error = -ENOMEM; - ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + ls = kzalloc(sizeof(*ls), GFP_NOFS); if (!ls) goto out; memcpy(ls->ls_name, name, namelen); @@ -524,9 +523,6 @@ static int new_lockspace(const char *name, const char *cluster, spin_lock_init(&ls->ls_rsbtbl[i].lock); } - spin_lock_init(&ls->ls_remove_spin); - init_waitqueue_head(&ls->ls_remove_wait); - for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1, GFP_KERNEL); @@ -567,7 +563,7 @@ static int new_lockspace(const char *name, const char *cluster, init_completion(&ls->ls_recovery_done); ls->ls_recovery_result = -1; - mutex_init(&ls->ls_cb_mutex); + spin_lock_init(&ls->ls_cb_lock); INIT_LIST_HEAD(&ls->ls_cb_delay); ls->ls_recoverd_task = NULL; @@ -726,7 +722,7 @@ static int __dlm_new_lockspace(const char *name, const char *cluster, if (!ls_count) { dlm_scand_stop(); dlm_midcomms_shutdown(); - dlm_lowcomms_stop(); + dlm_midcomms_stop(); } out: mutex_unlock(&ls_lock); @@ -929,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force) if (!error) ls_count--; if (!ls_count) - dlm_lowcomms_stop(); + dlm_midcomms_stop(); mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 59f64c596233..8b80ca0cd65f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,41 +63,49 @@ #define NEEDED_RMEM (4*1024*1024) -/* 
Number of messages to send before rescheduling */ -#define MAX_SEND_MSG_COUNT 25 -#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ - struct mutex sock_mutex; + /* this semaphore is used to allow parallel recv/send in read + * lock mode. When we release a sock we need to held the write lock. + * + * However this is locking code and not nice. When we remove the + * othercon handling we can look into other mechanism to synchronize + * io handling to call sock_release() at the right time. + */ + struct rw_semaphore sock_lock; unsigned long flags; -#define CF_READ_PENDING 1 -#define CF_WRITE_PENDING 2 -#define CF_INIT_PENDING 4 +#define CF_APP_LIMITED 0 +#define CF_RECV_PENDING 1 +#define CF_SEND_PENDING 2 +#define CF_RECV_INTR 3 +#define CF_IO_STOP 4 #define CF_IS_OTHERCON 5 -#define CF_CLOSE 6 -#define CF_APP_LIMITED 7 -#define CF_CLOSING 8 -#define CF_SHUTDOWN 9 -#define CF_CONNECTED 10 -#define CF_RECONNECT 11 -#define CF_DELAY_CONNECT 12 -#define CF_EOF 13 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; - atomic_t writequeue_cnt; int retries; -#define MAX_CONNECT_RETRIES 3 struct hlist_node list; + /* due some connect()/accept() races we currently have this cross over + * connection attempt second connection for one node. + * + * There is a solution to avoid the race by introducing a connect + * rule as e.g. our_nodeid > nodeid_to_connect who is allowed to + * connect. Otherside can connect but will only be considered that + * the other side wants to have a reconnect. + * + * However changing to this behaviour will break backwards compatible. + * In a DLM protocol major version upgrade we should remove this! 
+ */ struct connection *othercon; - struct connection *sendcon; - struct work_struct rwork; /* Receive workqueue */ - struct work_struct swork; /* Send workqueue */ - wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ - unsigned char *rx_buf; - int rx_buflen; + struct work_struct rwork; /* receive worker */ + struct work_struct swork; /* send worker */ + unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE]; int rx_leftover; + int mark; + int addr_count; + int curr_addr_index; + struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT]; + spinlock_t addrs_lock; struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -136,13 +144,12 @@ struct dlm_msg { struct kref ref; }; -struct dlm_node_addr { - struct list_head list; +struct processqueue_entry { + unsigned char *buf; int nodeid; - int mark; - int addr_count; - int curr_addr_index; - struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; + int buflen; + + struct list_head list; }; struct dlm_proto_ops { @@ -157,10 +164,6 @@ struct dlm_proto_ops { int (*listen_validate)(void); void (*listen_sockopts)(struct socket *sock); int (*listen_bind)(struct socket *sock); - /* What to do to shutdown */ - void (*shutdown_action)(struct connection *con); - /* What to do to eof check */ - bool (*eof_condition)(struct connection *con); }; static struct listen_sock_callbacks { @@ -170,17 +173,13 @@ static struct listen_sock_callbacks { void (*sk_write_space)(struct sock *); } listen_sock; -static LIST_HEAD(dlm_node_addrs); -static DEFINE_SPINLOCK(dlm_node_addrs_spin); - static struct listen_connection listen_con; -static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; +static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; -int dlm_allow_conn; /* Work queues */ -static struct workqueue_struct *recv_workqueue; -static struct workqueue_struct *send_workqueue; +static struct workqueue_struct *io_workqueue; +static struct workqueue_struct *process_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); @@ -188,8 +187,45 @@ DEFINE_STATIC_SRCU(connections_srcu); static const struct dlm_proto_ops *dlm_proto_ops; +#define DLM_IO_SUCCESS 0 +#define DLM_IO_END 1 +#define DLM_IO_EOF 2 +#define DLM_IO_RESCHED 3 + static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void process_dlm_messages(struct work_struct *work); + +static DECLARE_WORK(process_work, process_dlm_messages); +static DEFINE_SPINLOCK(processqueue_lock); +static bool process_dlm_messages_pending; +static LIST_HEAD(processqueue); + +bool dlm_lowcomms_is_running(void) +{ + return !!listen_con.sock; +} + +static void lowcomms_queue_swork(struct connection *con) +{ + assert_spin_locked(&con->writequeue_lock); + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_bit(CF_APP_LIMITED, &con->flags) && + !test_and_set_bit(CF_SEND_PENDING, &con->flags)) + queue_work(io_workqueue, &con->swork); +} + +static void lowcomms_queue_rwork(struct connection *con) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk)); +#endif + + if (!test_bit(CF_IO_STOP, &con->flags) && + !test_and_set_bit(CF_RECV_PENDING, &con->flags)) + queue_work(io_workqueue, &con->rwork); +} static void writequeue_entry_ctor(void *data) { @@ -214,15 +250,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con) { struct writequeue_entry *e; - if (list_empty(&con->writequeue)) - return NULL; - - 
e = list_first_entry(&con->writequeue, struct writequeue_entry, - list); + e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry, + list); /* if len is zero nothing is to send, if there are users filling * buffers we wait until the users are done so we can send more. */ - if (e->users || e->len == 0) + if (!e || e->users || e->len == 0) return NULL; return e; @@ -240,28 +273,15 @@ static struct connection *__find_con(int nodeid, int r) return NULL; } -static bool tcp_eof_condition(struct connection *con) -{ - return atomic_read(&con->writequeue_cnt); -} - -static int dlm_con_init(struct connection *con, int nodeid) +static void dlm_con_init(struct connection *con, int nodeid) { - con->rx_buflen = dlm_config.ci_buffer_size; - con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); - if (!con->rx_buf) - return -ENOMEM; - con->nodeid = nodeid; - mutex_init(&con->sock_mutex); + init_rwsem(&con->sock_lock); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); - atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); - init_waitqueue_head(&con->shutdown_wait); - - return 0; + spin_lock_init(&con->addrs_lock); } /* @@ -271,7 +291,7 @@ static int dlm_con_init(struct connection *con, int nodeid) static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; - int r, ret; + int r; r = nodeid_hash(nodeid); con = __find_con(nodeid, r); @@ -282,11 +302,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) if (!con) return NULL; - ret = dlm_con_init(con, nodeid); - if (ret) { - kfree(con); - return NULL; - } + dlm_con_init(con, nodeid); spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can @@ -298,7 +314,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); - kfree(con->rx_buf); kfree(con); return tmp; } @@ -309,29 +324,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return con; } -/* Loop round all connections */ -static void foreach_conn(void (*conn_func)(struct connection *c)) -{ - int i; - struct connection *con; - - for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], list) - conn_func(con); - } -} - -static struct dlm_node_addr *find_node_addr(int nodeid) -{ - struct dlm_node_addr *na; - - list_for_each_entry(na, &dlm_node_addrs, list) { - if (na->nodeid == nodeid) - return na; - } - return NULL; -} - static int addr_compare(const struct sockaddr_storage *x, const struct sockaddr_storage *y) { @@ -365,40 +357,47 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, unsigned int *mark) { struct sockaddr_storage sas; - struct dlm_node_addr *na; + struct connection *con; + int idx; if (!dlm_local_count) return -1; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na && na->addr_count) { - memcpy(&sas, na->addr[na->curr_addr_index], - sizeof(struct sockaddr_storage)); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } - if (try_new_addr) { - na->curr_addr_index++; - if (na->curr_addr_index == na->addr_count) - na->curr_addr_index = 0; - } + spin_lock(&con->addrs_lock); + if (!con->addr_count) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; } - spin_unlock(&dlm_node_addrs_spin); - if 
(!na) - return -EEXIST; + memcpy(&sas, &con->addr[con->curr_addr_index], + sizeof(struct sockaddr_storage)); - if (!na->addr_count) - return -ENOENT; + if (try_new_addr) { + con->curr_addr_index++; + if (con->curr_addr_index == con->addr_count) + con->curr_addr_index = 0; + } - *mark = na->mark; + *mark = con->mark; + spin_unlock(&con->addrs_lock); if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); - if (!sa_out) + if (!sa_out) { + srcu_read_unlock(&connections_srcu, idx); return 0; + } - if (dlm_local_addr[0]->ss_family == AF_INET) { + if (dlm_local_addr[0].ss_family == AF_INET) { struct sockaddr_in *in4 = (struct sockaddr_in *) &sas; struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out; ret4->sin_addr.s_addr = in4->sin_addr.s_addr; @@ -408,43 +407,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, ret6->sin6_addr = in6->sin6_addr; } + srcu_read_unlock(&connections_srcu, idx); return 0; } static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, unsigned int *mark) { - struct dlm_node_addr *na; - int rv = -EEXIST; - int addr_i; - - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry(na, &dlm_node_addrs, list) { - if (!na->addr_count) - continue; - - for (addr_i = 0; addr_i < na->addr_count; addr_i++) { - if (addr_compare(na->addr[addr_i], addr)) { - *nodeid = na->nodeid; - *mark = na->mark; - rv = 0; - goto unlock; + struct connection *con; + int i, idx, addr_i; + + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + WARN_ON_ONCE(!con->addr_count); + + spin_lock(&con->addrs_lock); + for (addr_i = 0; addr_i < con->addr_count; addr_i++) { + if (addr_compare(&con->addr[addr_i], addr)) { + *nodeid = con->nodeid; + *mark = con->mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); + return 0; + } } + spin_unlock(&con->addrs_lock); } } -unlock: - spin_unlock(&dlm_node_addrs_spin); - return rv; + srcu_read_unlock(&connections_srcu, idx); + + return -ENOENT; } -/* caller need to held dlm_node_addrs_spin lock */ -static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, - const struct sockaddr_storage *addr) +static bool dlm_lowcomms_con_has_addr(const struct connection *con, + const struct sockaddr_storage *addr) { int i; - for (i = 0; i < na->addr_count; i++) { - if (addr_compare(na->addr[i], addr)) + for (i = 0; i < con->addr_count; i++) { + if (addr_compare(&con->addr[i], addr)) return true; } @@ -453,118 +455,82 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) { - struct sockaddr_storage *new_addr; - struct dlm_node_addr *new_node, *na; - bool ret; - - new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS); - if (!new_node) - return -ENOMEM; + struct connection *con; + bool ret, idx; - new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS); - if (!new_addr) { - kfree(new_node); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, GFP_NOFS); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; } - memcpy(new_addr, addr, len); - - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - new_node->nodeid = nodeid; - new_node->addr[0] = new_addr; - new_node->addr_count = 1; - new_node->mark = dlm_config.ci_mark; - list_add(&new_node->list, &dlm_node_addrs); - spin_unlock(&dlm_node_addrs_spin); + spin_lock(&con->addrs_lock); + if 
(!con->addr_count) { + memcpy(&con->addr[0], addr, sizeof(*addr)); + con->addr_count = 1; + con->mark = dlm_config.ci_mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } - ret = dlm_lowcomms_na_has_addr(na, addr); + ret = dlm_lowcomms_con_has_addr(con, addr); if (ret) { - spin_unlock(&dlm_node_addrs_spin); - kfree(new_addr); - kfree(new_node); + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return -EEXIST; } - if (na->addr_count >= DLM_MAX_ADDR_COUNT) { - spin_unlock(&dlm_node_addrs_spin); - kfree(new_addr); - kfree(new_node); + if (con->addr_count >= DLM_MAX_ADDR_COUNT) { + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return -ENOSPC; } - na->addr[na->addr_count++] = new_addr; - spin_unlock(&dlm_node_addrs_spin); - kfree(new_node); + memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr)); + srcu_read_unlock(&connections_srcu, idx); + spin_unlock(&con->addrs_lock); return 0; } /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { - struct connection *con; - - con = sock2con(sk); - if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) - queue_work(recv_workqueue, &con->rwork); -} - -static void lowcomms_listen_data_ready(struct sock *sk) -{ - if (!dlm_allow_conn) - return; + struct connection *con = sock2con(sk); - queue_work(recv_workqueue, &listen_con.rwork); + set_bit(CF_RECV_INTR, &con->flags); + lowcomms_queue_rwork(con); } static void lowcomms_write_space(struct sock *sk) { - struct connection *con; - - con = sock2con(sk); - if (!con) - return; - - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("connected to node %d", con->nodeid); - queue_work(send_workqueue, &con->swork); - return; - } + struct connection *con = sock2con(sk); clear_bit(SOCK_NOSPACE, &con->sock->flags); + spin_lock_bh(&con->writequeue_lock); if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } - queue_work(send_workqueue, &con->swork); -} - -static inline void lowcomms_connect_sock(struct connection *con) -{ - if (test_bit(CF_CLOSE, &con->flags)) - return; - queue_work(send_workqueue, &con->swork); - cond_resched(); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); } static void lowcomms_state_change(struct sock *sk) { /* SCTP layer is not calling sk_data_ready when the connection - * is done, so we catch the signal through here. Also, it - * doesn't switch socket state when entering shutdown, so we - * skip the write in that case. + * is done, so we catch the signal through here. 
*/ - if (sk->sk_shutdown) { - if (sk->sk_shutdown == RCV_SHUTDOWN) - lowcomms_data_ready(sk); - } else if (sk->sk_state == TCP_ESTABLISHED) { - lowcomms_write_space(sk); - } + if (sk->sk_shutdown == RCV_SHUTDOWN) + lowcomms_data_ready(sk); +} + +static void lowcomms_listen_data_ready(struct sock *sk) +{ + queue_work(io_workqueue, &listen_con.rwork); } int dlm_lowcomms_connect_node(int nodeid) @@ -576,47 +542,49 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, GFP_NOFS); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); - return -ENOMEM; + return -ENOENT; } - lowcomms_connect_sock(con); + down_read(&con->sock_lock); + if (!con->sock) { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + up_read(&con->sock_lock); srcu_read_unlock(&connections_srcu, idx); + cond_resched(); return 0; } int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) { - struct dlm_node_addr *na; + struct connection *con; + int idx; - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (!na) { - spin_unlock(&dlm_node_addrs_spin); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOENT; } - na->mark = mark; - spin_unlock(&dlm_node_addrs_spin); - + spin_lock(&con->addrs_lock); + con->mark = mark; + spin_unlock(&con->addrs_lock); + srcu_read_unlock(&connections_srcu, idx); return 0; } static void lowcomms_error_report(struct sock *sk) { - struct connection *con; - void (*orig_report)(struct sock *) = NULL; + struct connection *con = sock2con(sk); struct inet_sock *inet; - con = sock2con(sk); - if (con == NULL) - goto out; - - orig_report = listen_sock.sk_error_report; - inet = inet_sk(sk); switch (sk->sk_family) { case AF_INET: @@ -642,66 +610,25 @@ static void lowcomms_error_report(struct sock *sk) "invalid socket family %d set, " "sk_err=%d/%d\n", dlm_our_nodeid(), sk->sk_family, sk->sk_err, sk->sk_err_soft); - goto out; - } - - /* below sendcon only handling */ - if (test_bit(CF_IS_OTHERCON, &con->flags)) - con = con->sendcon; - - switch (sk->sk_err) { - case ECONNREFUSED: - set_bit(CF_DELAY_CONNECT, &con->flags); - break; - default: break; } - if (!test_and_set_bit(CF_RECONNECT, &con->flags)) - queue_work(send_workqueue, &con->swork); + dlm_midcomms_unack_msg_resend(con->nodeid); -out: - if (orig_report) - orig_report(sk); + listen_sock.sk_error_report(sk); } -/* Note: sk_callback_lock must be locked before calling this function. 
*/ -static void save_listen_callbacks(struct socket *sock) +static void restore_callbacks(struct sock *sk) { - struct sock *sk = sock->sk; - - listen_sock.sk_data_ready = sk->sk_data_ready; - listen_sock.sk_state_change = sk->sk_state_change; - listen_sock.sk_write_space = sk->sk_write_space; - listen_sock.sk_error_report = sk->sk_error_report; -} - -static void restore_callbacks(struct socket *sock) -{ - struct sock *sk = sock->sk; +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(!lockdep_sock_is_held(sk)); +#endif - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = listen_sock.sk_data_ready; sk->sk_state_change = listen_sock.sk_state_change; sk->sk_write_space = listen_sock.sk_write_space; sk->sk_error_report = listen_sock.sk_error_report; - release_sock(sk); -} - -static void add_listen_sock(struct socket *sock, struct listen_connection *con) -{ - struct sock *sk = sock->sk; - - lock_sock(sk); - save_listen_callbacks(sock); - con->sock = sock; - - sk->sk_user_data = con; - sk->sk_allocation = GFP_NOFS; - /* Install a data_ready callback */ - sk->sk_data_ready = lowcomms_listen_data_ready; - release_sock(sk); } /* Make a socket active */ @@ -713,10 +640,10 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; - sk->sk_state_change = lowcomms_state_change; + if (dlm_config.ci_protocol == DLM_PROTO_SCTP) + sk->sk_state_change = lowcomms_state_change; sk->sk_allocation = GFP_NOFS; sk->sk_error_report = lowcomms_error_report; release_sock(sk); @@ -727,7 +654,7 @@ static void add_sock(struct socket *sock, struct connection *con) static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, int *addr_len) { - saddr->ss_family = dlm_local_addr[0]->ss_family; + saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; in4_addr->sin_port = cpu_to_be16(port); @@ -773,43 +700,67 @@ static void free_entry(struct writequeue_entry *e) } list_del(&e->list); - atomic_dec(&e->con->writequeue_cnt); kref_put(&e->ref, dlm_page_release); } static void dlm_close_sock(struct socket **sock) { - if (*sock) { - restore_callbacks(*sock); - sock_release(*sock); - *sock = NULL; + lock_sock((*sock)->sk); + restore_callbacks((*sock)->sk); + release_sock((*sock)->sk); + + sock_release(*sock); + *sock = NULL; +} + +static void allow_connection_io(struct connection *con) +{ + if (con->othercon) + clear_bit(CF_IO_STOP, &con->othercon->flags); + clear_bit(CF_IO_STOP, &con->flags); +} + +static void stop_connection_io(struct connection *con) +{ + if (con->othercon) + stop_connection_io(con->othercon); + + down_write(&con->sock_lock); + if (con->sock) { + lock_sock(con->sock->sk); + restore_callbacks(con->sock->sk); + + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + } else { + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); } + up_write(&con->sock_lock); + + cancel_work_sync(&con->swork); + cancel_work_sync(&con->rwork); } /* Close a remote connection and tidy up */ -static void close_connection(struct connection *con, bool and_other, - bool tx, bool rx) +static void close_connection(struct connection *con, bool and_other) { - bool closing = test_and_set_bit(CF_CLOSING, &con->flags); struct 
writequeue_entry *e; - if (tx && !closing && cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } - if (rx && !closing && cancel_work_sync(&con->rwork)) { - log_print("canceled rwork for node %d", con->nodeid); - clear_bit(CF_READ_PENDING, &con->flags); + if (con->othercon && and_other) + close_connection(con->othercon, false); + + down_write(&con->sock_lock); + if (!con->sock) { + up_write(&con->sock_lock); + return; } - mutex_lock(&con->sock_mutex); dlm_close_sock(&con->sock); - if (con->othercon && and_other) { - /* Will only re-enter once. */ - close_connection(con->othercon, false, tx, rx); - } - /* if we send a writequeue entry only a half way, we drop the * whole entry because reconnection and that we not start of the * middle of a msg which will confuse the other end. @@ -821,200 +772,209 @@ static void close_connection(struct connection *con, bool and_other, * our policy is to start on a clean state when disconnects, we don't * know what's send/received on transport layer in this case. */ - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_first_entry(&con->writequeue, struct writequeue_entry, list); if (e->dirty) free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_APP_LIMITED, &con->flags); - clear_bit(CF_CONNECTED, &con->flags); - clear_bit(CF_DELAY_CONNECT, &con->flags); - clear_bit(CF_RECONNECT, &con->flags); - clear_bit(CF_EOF, &con->flags); - mutex_unlock(&con->sock_mutex); - clear_bit(CF_CLOSING, &con->flags); + clear_bit(CF_RECV_PENDING, &con->flags); + clear_bit(CF_SEND_PENDING, &con->flags); + up_write(&con->sock_lock); } -static void shutdown_connection(struct connection *con) +static struct processqueue_entry *new_processqueue_entry(int nodeid, + int buflen) { - int ret; - - flush_work(&con->swork); + struct processqueue_entry *pentry; - mutex_lock(&con->sock_mutex); - /* nothing to shutdown */ - if (!con->sock) { - mutex_unlock(&con->sock_mutex); - return; - } + pentry = kmalloc(sizeof(*pentry), GFP_NOFS); + if (!pentry) + return NULL; - set_bit(CF_SHUTDOWN, &con->flags); - ret = kernel_sock_shutdown(con->sock, SHUT_WR); - mutex_unlock(&con->sock_mutex); - if (ret) { - log_print("Connection %p failed to shutdown: %d will force close", - con, ret); - goto force_close; - } else { - ret = wait_event_timeout(con->shutdown_wait, - !test_bit(CF_SHUTDOWN, &con->flags), - DLM_SHUTDOWN_WAIT_TIMEOUT); - if (ret == 0) { - log_print("Connection %p shutdown timed out, will force close", - con); - goto force_close; - } + pentry->buf = kmalloc(buflen, GFP_NOFS); + if (!pentry->buf) { + kfree(pentry); + return NULL; } - return; + pentry->nodeid = nodeid; + return pentry; +} -force_close: - clear_bit(CF_SHUTDOWN, &con->flags); - close_connection(con, false, true, true); +static void free_processqueue_entry(struct processqueue_entry *pentry) +{ + kfree(pentry->buf); + kfree(pentry); } -static void dlm_tcp_shutdown(struct connection *con) +struct dlm_processed_nodes { + int nodeid; + + struct list_head list; +}; + +static void add_processed_node(int nodeid, struct list_head *processed_nodes) { - if (con->othercon) - shutdown_connection(con->othercon); - shutdown_connection(con); + struct dlm_processed_nodes *n; + + list_for_each_entry(n, processed_nodes, list) { + /* we already remembered this node */ + if (n->nodeid == nodeid) + return; + } 
+ + /* if it's fails in worst case we simple don't send an ack back. + * We try it next time. + */ + n = kmalloc(sizeof(*n), GFP_NOFS); + if (!n) + return; + + n->nodeid = nodeid; + list_add(&n->list, processed_nodes); } -static int con_realloc_receive_buf(struct connection *con, int newlen) +static void process_dlm_messages(struct work_struct *work) { - unsigned char *newbuf; + struct dlm_processed_nodes *n, *n_tmp; + struct processqueue_entry *pentry; + LIST_HEAD(processed_nodes); - newbuf = kmalloc(newlen, GFP_NOFS); - if (!newbuf) - return -ENOMEM; + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (WARN_ON_ONCE(!pentry)) { + spin_unlock(&processqueue_lock); + return; + } - /* copy any leftover from last receive */ - if (con->rx_leftover) - memmove(newbuf, con->rx_buf, con->rx_leftover); + list_del(&pentry->list); + spin_unlock(&processqueue_lock); - /* swap to new buffer space */ - kfree(con->rx_buf); - con->rx_buflen = newlen; - con->rx_buf = newbuf; + for (;;) { + dlm_process_incoming_buffer(pentry->nodeid, pentry->buf, + pentry->buflen); + add_processed_node(pentry->nodeid, &processed_nodes); + free_processqueue_entry(pentry); + + spin_lock(&processqueue_lock); + pentry = list_first_entry_or_null(&processqueue, + struct processqueue_entry, list); + if (!pentry) { + process_dlm_messages_pending = false; + spin_unlock(&processqueue_lock); + break; + } - return 0; + list_del(&pentry->list); + spin_unlock(&processqueue_lock); + } + + /* send ack back after we processed couple of messages */ + list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) { + list_del(&n->list); + dlm_midcomms_receive_done(n->nodeid); + kfree(n); + } } /* Data received from remote end */ -static int receive_from_sock(struct connection *con) +static int receive_from_sock(struct connection *con, int buflen) { + struct processqueue_entry *pentry; + int ret, buflen_real; struct msghdr msg; struct kvec iov; - int ret, buflen; - mutex_lock(&con->sock_mutex); + pentry = new_processqueue_entry(con->nodeid, buflen); + if (!pentry) + return DLM_IO_RESCHED; - if (con->sock == NULL) { - ret = -EAGAIN; - goto out_close; - } - - /* realloc if we get new buffer size to read out */ - buflen = dlm_config.ci_buffer_size; - if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { - ret = con_realloc_receive_buf(con, buflen); - if (ret < 0) - goto out_resched; - } + memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover); - for (;;) { - /* calculate new buffer parameter regarding last receive and - * possible leftover bytes - */ - iov.iov_base = con->rx_buf + con->rx_leftover; - iov.iov_len = con->rx_buflen - con->rx_leftover; - - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - trace_dlm_recv(con->nodeid, ret); - if (ret == -EAGAIN) - break; - else if (ret <= 0) - goto out_close; - - /* new buflen according readed bytes and leftover from last receive */ - buflen = ret + con->rx_leftover; - ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); - if (ret < 0) - goto out_close; - - /* calculate leftover bytes from process and put it into begin of - * the receive buffer, so next receive we have the full message - * at the start address of the receive buffer. 
- */ - con->rx_leftover = buflen - ret; - if (con->rx_leftover) { - memmove(con->rx_buf, con->rx_buf + ret, - con->rx_leftover); + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = pentry->buf + con->rx_leftover; + iov.iov_len = buflen - con->rx_leftover; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + clear_bit(CF_RECV_INTR, &con->flags); +again: + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + trace_dlm_recv(con->nodeid, ret); + if (ret == -EAGAIN) { + lock_sock(con->sock->sk); + if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) { + release_sock(con->sock->sk); + goto again; } + + clear_bit(CF_RECV_PENDING, &con->flags); + release_sock(con->sock->sk); + free_processqueue_entry(pentry); + return DLM_IO_END; + } else if (ret == 0) { + /* close will clear CF_RECV_PENDING */ + free_processqueue_entry(pentry); + return DLM_IO_EOF; + } else if (ret < 0) { + free_processqueue_entry(pentry); + return ret; } - dlm_midcomms_receive_done(con->nodeid); - mutex_unlock(&con->sock_mutex); - return 0; + /* new buflen according readed bytes and leftover from last receive */ + buflen_real = ret + con->rx_leftover; + ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf, + buflen_real); + if (ret < 0) { + free_processqueue_entry(pentry); + return ret; + } -out_resched: - if (!test_and_set_bit(CF_READ_PENDING, &con->flags)) - queue_work(recv_workqueue, &con->rwork); - mutex_unlock(&con->sock_mutex); - return -EAGAIN; - -out_close: - if (ret == 0) { - log_print("connection %p got EOF from %d", - con, con->nodeid); - - if (dlm_proto_ops->eof_condition && - dlm_proto_ops->eof_condition(con)) { - set_bit(CF_EOF, &con->flags); - mutex_unlock(&con->sock_mutex); - } else { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, true, false); + pentry->buflen = ret; - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. 
+ */ + con->rx_leftover = buflen_real - ret; + memmove(con->rx_leftover_buf, pentry->buf + ret, + con->rx_leftover); - /* signal to breaking receive worker */ - ret = -1; - } else { - mutex_unlock(&con->sock_mutex); + spin_lock(&processqueue_lock); + list_add_tail(&pentry->list, &processqueue); + if (!process_dlm_messages_pending) { + process_dlm_messages_pending = true; + queue_work(process_workqueue, &process_work); } - return ret; + spin_unlock(&processqueue_lock); + + return DLM_IO_SUCCESS; } /* Listening socket is busy, accept a connection */ -static int accept_from_sock(struct listen_connection *con) +static int accept_from_sock(void) { - int result; struct sockaddr_storage peeraddr; - struct socket *newsock; - int len, idx; - int nodeid; + int len, idx, result, nodeid; struct connection *newcon; - struct connection *addcon; + struct socket *newsock; unsigned int mark; - if (!con->sock) - return -ENOTCONN; - - result = kernel_accept(con->sock, &newsock, O_NONBLOCK); - if (result < 0) + result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK); + if (result == -EAGAIN) + return DLM_IO_END; + else if (result < 0) goto accept_err; /* Get the connected socket's peer */ @@ -1062,16 +1022,16 @@ static int accept_from_sock(struct listen_connection *con) * In this case we store the incoming one in "othercon" */ idx = srcu_read_lock(&connections_srcu); - newcon = nodeid2con(nodeid, GFP_NOFS); - if (!newcon) { + newcon = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!newcon)) { srcu_read_unlock(&connections_srcu, idx); - result = -ENOMEM; + result = -ENOENT; goto accept_err; } sock_set_mark(newsock->sk, mark); - mutex_lock(&newcon->sock_mutex); + down_write(&newcon->sock_lock); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -1079,63 +1039,50 @@ static int accept_from_sock(struct listen_connection *con) othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); - mutex_unlock(&newcon->sock_mutex); + up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } - result = dlm_con_init(othercon, nodeid); - if (result < 0) { - kfree(othercon); - mutex_unlock(&newcon->sock_mutex); - srcu_read_unlock(&connections_srcu, idx); - goto accept_err; - } - - lockdep_set_subclass(&othercon->sock_mutex, 1); - set_bit(CF_IS_OTHERCON, &othercon->flags); + dlm_con_init(othercon, nodeid); + lockdep_set_subclass(&othercon->sock_lock, 1); newcon->othercon = othercon; - othercon->sendcon = newcon; + set_bit(CF_IS_OTHERCON, &othercon->flags); } else { /* close other sock con if we have something new */ - close_connection(othercon, false, true, false); + close_connection(othercon, false); } - mutex_lock(&othercon->sock_mutex); + down_write(&othercon->sock_lock); add_sock(newsock, othercon); - addcon = othercon; - mutex_unlock(&othercon->sock_mutex); + + /* check if we receved something while adding */ + lock_sock(othercon->sock->sk); + lowcomms_queue_rwork(othercon); + release_sock(othercon->sock->sk); + up_write(&othercon->sock_lock); } else { /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. 
*/ add_sock(newsock, newcon); - addcon = newcon; - } - - set_bit(CF_CONNECTED, &addcon->flags); - mutex_unlock(&newcon->sock_mutex); - - /* - * Add it to the active queue in case we got data - * between processing the accept adding the socket - * to the read_sockets list - */ - if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) - queue_work(recv_workqueue, &addcon->rwork); + /* check if we receved something while adding */ + lock_sock(newcon->sock->sk); + lowcomms_queue_rwork(newcon); + release_sock(newcon->sock->sk); + } + up_write(&newcon->sock_lock); srcu_read_unlock(&connections_srcu, idx); - return 0; + return DLM_IO_SUCCESS; accept_err: if (newsock) sock_release(newsock); - if (result != -EAGAIN) - log_print("error accepting connection from node: %d", result); return result; } @@ -1167,7 +1114,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { - memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr)); + memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr)); make_sockaddr(&localaddr, port, &addr_len); if (!i) @@ -1187,7 +1134,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) /* Get local addresses */ static void init_local(void) { - struct sockaddr_storage sas, *addr; + struct sockaddr_storage sas; int i; dlm_local_count = 0; @@ -1195,21 +1142,10 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS); - if (!addr) - break; - dlm_local_addr[dlm_local_count++] = addr; + memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas)); } } -static void deinit_local(void) -{ - int i; - - for (i = 0; i < dlm_local_count; i++) - kfree(dlm_local_addr[i]); -} - static struct writequeue_entry *new_writequeue_entry(struct connection *con) { struct writequeue_entry *entry; @@ -1240,7 +1176,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, { struct writequeue_entry *e; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { @@ -1263,14 +1199,13 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; - atomic_inc(&con->writequeue_cnt); if (cb) cb(data); list_add_tail(&e->list, &con->writequeue); out: - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); return e; }; @@ -1319,13 +1254,13 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, len < sizeof(struct dlm_header)) { BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); - WARN_ON(1); + WARN_ON_ONCE(1); return NULL; } idx = srcu_read_lock(&connections_srcu); - con = nodeid2con(nodeid, allocation); - if (!con) { + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { srcu_read_unlock(&connections_srcu, idx); return NULL; } @@ -1350,7 +1285,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) struct connection *con = e->con; int users; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); kref_get(&msg->ref); list_add(&msg->list, &e->msgs); @@ -1359,13 +1294,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) goto out; e->len = DLM_WQ_LENGTH_BYTES(e); - spin_unlock(&con->writequeue_lock); - queue_work(send_workqueue, &con->swork); - return; + 
lowcomms_queue_swork(con); out: - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); return; } @@ -1387,7 +1320,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg) kref_put(&msg->ref, dlm_msg_release); } -/* does not held connections_srcu, usage workqueue only */ +/* does not held connections_srcu, usage lowcomms_error_report only */ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) { struct dlm_msg *msg_resend; @@ -1413,90 +1346,79 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) } /* Send a message */ -static void send_to_sock(struct connection *con) +static int send_to_sock(struct connection *con) { const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset, ret; - int count = 0; - mutex_lock(&con->sock_mutex); - if (con->sock == NULL) - goto out_connect; - - spin_lock(&con->writequeue_lock); - for (;;) { - e = con_next_wq(con); - if (!e) - break; + spin_lock_bh(&con->writequeue_lock); + e = con_next_wq(con); + if (!e) { + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + return DLM_IO_END; + } - len = e->len; - offset = e->offset; - BUG_ON(len == 0 && e->users == 0); - spin_unlock(&con->writequeue_lock); - - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); - trace_dlm_send(con->nodeid, ret); - if (ret == -EAGAIN || ret == 0) { - if (ret == -EAGAIN && - test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && - !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { - /* Notify TCP that we're limited by the - * application window size. - */ - set_bit(SOCK_NOSPACE, &con->sock->flags); - con->sock->sk->sk_write_pending++; - } - cond_resched(); - goto out; - } else if (ret < 0) - goto out; + len = e->len; + offset = e->offset; + WARN_ON_ONCE(len == 0 && e->users == 0); + spin_unlock_bh(&con->writequeue_lock); - /* Don't starve people filling buffers */ - if (++count >= MAX_SEND_MSG_COUNT) { - cond_resched(); - count = 0; + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + trace_dlm_send(con->nodeid, ret); + if (ret == -EAGAIN || ret == 0) { + lock_sock(con->sock->sk); + spin_lock_bh(&con->writequeue_lock); + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. 
+ */ + set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags); + con->sock->sk->sk_write_pending++; + + clear_bit(CF_SEND_PENDING, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); + + /* wait for write_space() event */ + return DLM_IO_END; } + spin_unlock_bh(&con->writequeue_lock); + release_sock(con->sock->sk); - spin_lock(&con->writequeue_lock); - writequeue_entry_complete(e, ret); - } - spin_unlock(&con->writequeue_lock); - - /* close if we got EOF */ - if (test_and_clear_bit(CF_EOF, &con->flags)) { - mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - - /* handling for tcp shutdown */ - clear_bit(CF_SHUTDOWN, &con->flags); - wake_up(&con->shutdown_wait); - } else { - mutex_unlock(&con->sock_mutex); + return DLM_IO_RESCHED; + } else if (ret < 0) { + return ret; } - return; - -out: - mutex_unlock(&con->sock_mutex); - return; + spin_lock_bh(&con->writequeue_lock); + writequeue_entry_complete(e, ret); + spin_unlock_bh(&con->writequeue_lock); -out_connect: - mutex_unlock(&con->sock_mutex); - queue_work(send_workqueue, &con->swork); - cond_resched(); + return DLM_IO_SUCCESS; } static void clean_one_writequeue(struct connection *con) { struct writequeue_entry *e, *safe; - spin_lock(&con->writequeue_lock); + spin_lock_bh(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { free_entry(e); } - spin_unlock(&con->writequeue_lock); + spin_unlock_bh(&con->writequeue_lock); +} + +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + WARN_ON_ONCE(!list_empty(&con->writequeue)); + WARN_ON_ONCE(con->sock); + kfree(con); } /* Called from recovery when it knows that a node has @@ -1504,286 +1426,311 @@ static void clean_one_writequeue(struct connection *con) int dlm_lowcomms_close(int nodeid) { struct connection *con; - struct dlm_node_addr *na; int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); - if (con) { - set_bit(CF_CLOSE, &con->flags); - close_connection(con, true, true, true); - clean_one_writequeue(con); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return -ENOENT; + } + + stop_connection_io(con); + log_print("io handling for node: %d stopped", nodeid); + close_connection(con, true); + + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + clean_one_writequeue(con); + call_srcu(&connections_srcu, &con->rcu, connection_release); + if (con->othercon) { + clean_one_writequeue(con->othercon); if (con->othercon) - clean_one_writequeue(con->othercon); + call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); } srcu_read_unlock(&connections_srcu, idx); - spin_lock(&dlm_node_addrs_spin); - na = find_node_addr(nodeid); - if (na) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); - } - spin_unlock(&dlm_node_addrs_spin); + /* for debugging we print when we are done to compare with other + * messages in between. 
This function need to be correctly synchronized + * with io handling + */ + log_print("closing connection to node %d done", nodeid); return 0; } -/* Receive workqueue function */ +/* Receive worker function */ static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); + int ret, buflen; + + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + return; + } + + buflen = READ_ONCE(dlm_config.ci_buffer_size); + do { + ret = receive_from_sock(con, buflen); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); - clear_bit(CF_READ_PENDING, &con->flags); - receive_from_sock(con); + switch (ret) { + case DLM_IO_END: + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_EOF: + close_connection(con, false); + /* CF_RECV_PENDING cleared */ + break; + case DLM_IO_RESCHED: + cond_resched(); + queue_work(io_workqueue, &con->rwork); + /* CF_RECV_PENDING not cleared */ + break; + default: + if (ret < 0) { + if (test_bit(CF_IS_OTHERCON, &con->flags)) { + close_connection(con, false); + } else { + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + } + + /* CF_RECV_PENDING cleared for othercon + * we trigger send queue if not already done + * and process_send_sockets will handle it + */ + break; + } + + WARN_ON_ONCE(1); + break; + } } static void process_listen_recv_socket(struct work_struct *work) { - accept_from_sock(&listen_con); + int ret; + + if (WARN_ON_ONCE(!listen_con.sock)) + return; + + do { + ret = accept_from_sock(); + } while (ret == DLM_IO_SUCCESS); + + if (ret < 0) + log_print("critical error accepting connection: %d", ret); } -static void dlm_connect(struct connection *con) +static int dlm_connect(struct connection *con) { struct sockaddr_storage addr; int result, addr_len; struct socket *sock; unsigned int mark; - /* Some odd races can cause double-connects, ignore them */ - if (con->retries++ > MAX_CONNECT_RETRIES) - return; - - if (con->sock) { - log_print("node %d already connected.", con->nodeid); - return; - } - memset(&addr, 0, sizeof(addr)); result = nodeid_to_addr(con->nodeid, &addr, NULL, dlm_proto_ops->try_new_addr, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); - return; + return result; } /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) - goto socket_err; + return result; sock_set_mark(sock->sk, mark); dlm_proto_ops->sockopts(sock); - add_sock(sock, con); - result = dlm_proto_ops->bind(sock); - if (result < 0) - goto add_sock_err; + if (result < 0) { + sock_release(sock); + return result; + } + + add_sock(sock, con); log_print_ratelimited("connecting to %d", con->nodeid); make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, addr_len); - if (result < 0) - goto add_sock_err; - - return; - -add_sock_err: - dlm_close_sock(&con->sock); + switch (result) { + case -EINPROGRESS: + /* not an error */ + fallthrough; + case 0: + break; + default: + if (result < 0) + dlm_close_sock(&con->sock); -socket_err: - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. 
- */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - msleep(1000); - lowcomms_connect_sock(con); + break; } + + return result; } -/* Send workqueue function */ +/* Send worker function */ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + int ret; - WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags)); - - clear_bit(CF_WRITE_PENDING, &con->flags); + WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags)); - if (test_and_clear_bit(CF_RECONNECT, &con->flags)) { - close_connection(con, false, false, true); - dlm_midcomms_unack_msg_resend(con->nodeid); + down_read(&con->sock_lock); + if (!con->sock) { + up_read(&con->sock_lock); + down_write(&con->sock_lock); + if (!con->sock) { + ret = dlm_connect(con); + switch (ret) { + case 0: + break; + case -EINPROGRESS: + /* avoid spamming resched on connection + * we might can switch to a state_change + * event based mechanism if established + */ + msleep(100); + break; + default: + /* CF_SEND_PENDING not cleared */ + up_write(&con->sock_lock); + log_print("connect to node %d try %d error %d", + con->nodeid, con->retries++, ret); + msleep(1000); + /* For now we try forever to reconnect. In + * future we should send a event to cluster + * manager to fence itself after certain amount + * of retries. + */ + queue_work(io_workqueue, &con->swork); + return; + } + } + downgrade_write(&con->sock_lock); } - if (con->sock == NULL) { - if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) - msleep(1000); + do { + ret = send_to_sock(con); + } while (ret == DLM_IO_SUCCESS); + up_read(&con->sock_lock); - mutex_lock(&con->sock_mutex); - dlm_connect(con); - mutex_unlock(&con->sock_mutex); - } + switch (ret) { + case DLM_IO_END: + /* CF_SEND_PENDING cleared */ + break; + case DLM_IO_RESCHED: + /* CF_SEND_PENDING not cleared */ + cond_resched(); + queue_work(io_workqueue, &con->swork); + break; + default: + if (ret < 0) { + close_connection(con, false); + + /* CF_SEND_PENDING cleared */ + spin_lock_bh(&con->writequeue_lock); + lowcomms_queue_swork(con); + spin_unlock_bh(&con->writequeue_lock); + break; + } - if (!list_empty(&con->writequeue)) - send_to_sock(con); + WARN_ON_ONCE(1); + break; + } } static void work_stop(void) { - if (recv_workqueue) { - destroy_workqueue(recv_workqueue); - recv_workqueue = NULL; + if (io_workqueue) { + destroy_workqueue(io_workqueue); + io_workqueue = NULL; } - if (send_workqueue) { - destroy_workqueue(send_workqueue); - send_workqueue = NULL; + if (process_workqueue) { + destroy_workqueue(process_workqueue); + process_workqueue = NULL; } } static int work_start(void) { - recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM); - if (!recv_workqueue) { - log_print("can't start dlm_recv"); + io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM, + 0); + if (!io_workqueue) { + log_print("can't start dlm_io"); return -ENOMEM; } - send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM); - if (!send_workqueue) { - log_print("can't start dlm_send"); - destroy_workqueue(recv_workqueue); - recv_workqueue = NULL; + /* ordered dlm message process queue, + * should be converted to a tasklet + */ + process_workqueue = alloc_ordered_workqueue("dlm_process", + WQ_HIGHPRI | WQ_MEM_RECLAIM); + if (!process_workqueue) { + log_print("can't start dlm_process"); + 
destroy_workqueue(io_workqueue); + io_workqueue = NULL; return -ENOMEM; } return 0; } -static void shutdown_conn(struct connection *con) -{ - if (dlm_proto_ops->shutdown_action) - dlm_proto_ops->shutdown_action(con); -} - void dlm_lowcomms_shutdown(void) { - int idx; - - /* Set all the flags to prevent any - * socket activity. - */ - dlm_allow_conn = 0; - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); + /* stop lowcomms_listen_data_ready calls */ + lock_sock(listen_con.sock->sk); + listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready; + release_sock(listen_con.sock->sk); + cancel_work_sync(&listen_con.rwork); dlm_close_sock(&listen_con.sock); - idx = srcu_read_lock(&connections_srcu); - foreach_conn(shutdown_conn); - srcu_read_unlock(&connections_srcu, idx); -} - -static void _stop_conn(struct connection *con, bool and_other) -{ - mutex_lock(&con->sock_mutex); - set_bit(CF_CLOSE, &con->flags); - set_bit(CF_READ_PENDING, &con->flags); - set_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock && con->sock->sk) { - lock_sock(con->sock->sk); - con->sock->sk->sk_user_data = NULL; - release_sock(con->sock->sk); - } - if (con->othercon && and_other) - _stop_conn(con->othercon, false); - mutex_unlock(&con->sock_mutex); -} - -static void stop_conn(struct connection *con) -{ - _stop_conn(con, true); + flush_workqueue(process_workqueue); } -static void connection_release(struct rcu_head *rcu) +void dlm_lowcomms_shutdown_node(int nodeid, bool force) { - struct connection *con = container_of(rcu, struct connection, rcu); - - kfree(con->rx_buf); - kfree(con); -} + struct connection *con; + int idx; -static void free_conn(struct connection *con) -{ - close_connection(con, true, true, true); - spin_lock(&connections_lock); - hlist_del_rcu(&con->list); - spin_unlock(&connections_lock); - if (con->othercon) { - clean_one_writequeue(con->othercon); - call_srcu(&connections_srcu, &con->othercon->rcu, - connection_release); + idx = srcu_read_lock(&connections_srcu); + con = nodeid2con(nodeid, 0); + if (WARN_ON_ONCE(!con)) { + srcu_read_unlock(&connections_srcu, idx); + return; } - clean_one_writequeue(con); - call_srcu(&connections_srcu, &con->rcu, connection_release); -} -static void work_flush(void) -{ - int ok; - int i; - struct connection *con; - - do { - ok = 1; - foreach_conn(stop_conn); - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_rcu(con, &connection_hash[i], - list) { - ok &= test_bit(CF_READ_PENDING, &con->flags); - ok &= test_bit(CF_WRITE_PENDING, &con->flags); - if (con->othercon) { - ok &= test_bit(CF_READ_PENDING, - &con->othercon->flags); - ok &= test_bit(CF_WRITE_PENDING, - &con->othercon->flags); - } - } - } - } while (!ok); + flush_work(&con->swork); + stop_connection_io(con); + WARN_ON_ONCE(!force && !list_empty(&con->writequeue)); + close_connection(con, true); + clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); + allow_connection_io(con); + srcu_read_unlock(&connections_srcu, idx); } void dlm_lowcomms_stop(void) { - int idx; - - idx = srcu_read_lock(&connections_srcu); - work_flush(); - foreach_conn(free_conn); - srcu_read_unlock(&connections_srcu, idx); work_stop(); - deinit_local(); - dlm_proto_ops = NULL; } @@ -1799,7 +1746,7 @@ static int dlm_listen_for_all(void) if (result < 0) return result; - result = sock_create_kern(&init_net, 
dlm_local_addr[0]->ss_family, + result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); if (result < 0) { log_print("Can't create comms socket: %d", result); @@ -1813,14 +1760,22 @@ static int dlm_listen_for_all(void) if (result < 0) goto out; - save_listen_callbacks(sock); - add_listen_sock(sock, &listen_con); + lock_sock(sock->sk); + listen_sock.sk_data_ready = sock->sk->sk_data_ready; + listen_sock.sk_write_space = sock->sk->sk_write_space; + listen_sock.sk_error_report = sock->sk->sk_error_report; + listen_sock.sk_state_change = sock->sk->sk_state_change; + + listen_con.sock = sock; + + sock->sk->sk_allocation = GFP_NOFS; + sock->sk->sk_data_ready = lowcomms_listen_data_ready; + release_sock(sock->sk); - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); result = sock->ops->listen(sock, 5); if (result < 0) { dlm_close_sock(&listen_con.sock); - goto out; + return result; } return 0; @@ -1838,7 +1793,7 @@ static int dlm_tcp_bind(struct socket *sock) /* Bind to our cluster-known address connecting to avoid * routing problems. */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr)); make_sockaddr(&src_addr, 0, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, @@ -1854,17 +1809,7 @@ static int dlm_tcp_bind(struct socket *sock) static int dlm_tcp_connect(struct connection *con, struct socket *sock, struct sockaddr *addr, int addr_len) { - int ret; - - ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); - switch (ret) { - case -EINPROGRESS: - fallthrough; - case 0: - return 0; - } - - return ret; + return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); } static int dlm_tcp_listen_validate(void) @@ -1895,8 +1840,8 @@ static int dlm_tcp_listen_bind(struct socket *sock) int addr_len; /* Bind to our port */ - make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); - return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0], addr_len); } @@ -1909,8 +1854,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = { .listen_validate = dlm_tcp_listen_validate, .listen_sockopts = dlm_tcp_listen_sockopts, .listen_bind = dlm_tcp_listen_bind, - .shutdown_action = dlm_tcp_shutdown, - .eof_condition = tcp_eof_condition, }; static int dlm_sctp_bind(struct socket *sock) @@ -1931,13 +1874,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, sock_set_sndtimeo(sock->sk, 5); ret = sock->ops->connect(sock, addr, addr_len, 0); sock_set_sndtimeo(sock->sk, 0); - if (ret < 0) - return ret; - - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("connected to node %d", con->nodeid); - - return 0; + return ret; } static int dlm_sctp_listen_validate(void) @@ -1977,11 +1914,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = { int dlm_lowcomms_start(void) { - int error = -EINVAL; - int i; - - for (i = 0; i < CONN_HASH_SIZE; i++) - INIT_HLIST_HEAD(&connection_hash[i]); + int error; init_local(); if (!dlm_local_count) { @@ -1990,13 +1923,9 @@ int dlm_lowcomms_start(void) goto fail; } - INIT_WORK(&listen_con.rwork, process_listen_recv_socket); - error = work_start(); if (error) - goto fail_local; - - dlm_allow_conn = 1; + goto fail; /* Start listening */ switch (dlm_config.ci_protocol) { @@ -2022,25 +1951,38 @@ int dlm_lowcomms_start(void) fail_listen: dlm_proto_ops = NULL; 
fail_proto_ops: - dlm_allow_conn = 0; - dlm_close_sock(&listen_con.sock); work_stop(); -fail_local: - deinit_local(); fail: return error; } +void dlm_lowcomms_init(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&connection_hash[i]); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); +} + void dlm_lowcomms_exit(void) { - struct dlm_node_addr *na, *safe; + struct connection *con; + int i, idx; - spin_lock(&dlm_node_addrs_spin); - list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) { - list_del(&na->list); - while (na->addr_count--) - kfree(na->addr[na->addr_count]); - kfree(na); + idx = srcu_read_lock(&connections_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(con, &connection_hash[i], list) { + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + + if (con->othercon) + call_srcu(&connections_srcu, &con->othercon->rcu, + connection_release); + call_srcu(&connections_srcu, &con->rcu, connection_release); + } } - spin_unlock(&dlm_node_addrs_spin); + srcu_read_unlock(&connections_srcu, idx); } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 29369feea991..3e8dca66183b 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -29,12 +29,14 @@ static inline int nodeid_hash(int nodeid) return nodeid & (CONN_HASH_SIZE-1); } -/* switch to check if dlm is running */ -extern int dlm_allow_conn; +/* check if dlm is running */ +bool dlm_lowcomms_is_running(void); int dlm_lowcomms_start(void); void dlm_lowcomms_shutdown(void); +void dlm_lowcomms_shutdown_node(int nodeid, bool force); void dlm_lowcomms_stop(void); +void dlm_lowcomms_init(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 1c5be4b70ac1..a77338be3237 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -17,7 +17,7 @@ #include "user.h" #include "memory.h" #include "config.h" -#include "lowcomms.h" +#include "midcomms.h" #define CREATE_TRACE_POINTS #include <trace/events/dlm.h> @@ -30,6 +30,8 @@ static int __init init_dlm(void) if (error) goto out; + dlm_midcomms_init(); + error = dlm_lockspace_init(); if (error) goto out_mem; @@ -66,6 +68,7 @@ static int __init init_dlm(void) out_lockspace: dlm_lockspace_exit(); out_mem: + dlm_midcomms_exit(); dlm_memory_exit(); out: return error; @@ -79,7 +82,7 @@ static void __exit exit_dlm(void) dlm_config_exit(); dlm_memory_exit(); dlm_lockspace_exit(); - dlm_lowcomms_exit(); + dlm_midcomms_exit(); dlm_unregister_debugfs(); } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 2af2ccfe43a9..923c01a8a0aa 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) node = &rv->nodes[i]; if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, node); + error = dlm_add_member(ls, node); + if (error) + return error; + log_rinfo(ls, "add member %d", node->nodeid); } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index ce35c3c19aeb..eb7a08641fcf 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -14,12 +14,14 @@ #include "lowcomms.h" #include "config.h" #include "memory.h" +#include "ast.h" static struct kmem_cache *writequeue_cache; static struct kmem_cache *mhandle_cache; static struct kmem_cache *msg_cache; static struct kmem_cache *lkb_cache; static struct kmem_cache *rsb_cache; +static struct kmem_cache *cb_cache; int __init 
dlm_memory_init(void)
@@ -46,8 +48,16 @@ int __init dlm_memory_init(void)
 if (!rsb_cache)
 goto rsb;
+ cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
+ __alignof__(struct dlm_callback), 0,
+ NULL);
+ if (!cb_cache)
+ goto cb;
+
 return 0;
+cb:
+ kmem_cache_destroy(rsb_cache);
 rsb:
 kmem_cache_destroy(msg_cache);
 msg:
@@ -67,6 +77,7 @@ void dlm_memory_exit(void)
 kmem_cache_destroy(msg_cache);
 kmem_cache_destroy(lkb_cache);
 kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(cb_cache);
 }
 char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -115,12 +126,17 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
 kfree(ua);
 }
 }
+
+ /* drop references if they are set */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+
 kmem_cache_free(lkb_cache, lkb);
 }
-struct dlm_mhandle *dlm_allocate_mhandle(void)
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation)
 {
- return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+ return kmem_cache_alloc(mhandle_cache, allocation);
 }
 void dlm_free_mhandle(struct dlm_mhandle *mhandle)
@@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg)
 {
 kmem_cache_free(msg_cache, msg);
 }
+
+struct dlm_callback *dlm_allocate_cb(void)
+{
+ return kmem_cache_alloc(cb_cache, GFP_ATOMIC);
+}
+
+void dlm_free_cb(struct dlm_callback *cb)
+{
+ kmem_cache_free(cb_cache, cb);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 7bd3f1a391ca..6b29563d24f7 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,12 +20,14 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
 void dlm_free_lkb(struct dlm_lkb *l);
 char *dlm_allocate_lvb(struct dlm_ls *ls);
 void dlm_free_lvb(char *l);
-struct dlm_mhandle *dlm_allocate_mhandle(void);
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation);
 void dlm_free_mhandle(struct dlm_mhandle *mhandle);
 struct writequeue_entry *dlm_allocate_writequeue(void);
 void dlm_free_writequeue(struct writequeue_entry *writequeue);
 struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
 void dlm_free_msg(struct dlm_msg *msg);
+struct dlm_callback *dlm_allocate_cb(void);
+void dlm_free_cb(struct dlm_callback *cb);
 #endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 6489bc22ad61..fc015a6abe17 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -132,6 +132,7 @@
 */
 #define DLM_DEBUG_FENCE_TERMINATION 0
+#include <trace/events/dlm.h>
 #include <net/tcp.h>
 #include "dlm_internal.h"
@@ -194,7 +195,7 @@ struct midcomms_node {
 };
 struct dlm_mhandle {
- const struct dlm_header *inner_hd;
+ const union dlm_packet *inner_p;
 struct midcomms_node *node;
 struct dlm_opts *opts;
 struct dlm_msg *msg;
@@ -305,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node)
 pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
 rcu_read_lock();
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
 list_for_each_entry_rcu(mh, &node->send_queue, list) {
 dlm_mhandle_delete(node, mh);
 }
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
 rcu_read_unlock();
 }
@@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node,
 m_header->h_cmd = DLM_FIN;
 pr_debug("sending fin msg to node %d\n", node->nodeid);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
 set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
 return 0;
@@ -436,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
 }
 }
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) { if (before(mh->seq, seq)) { dlm_mhandle_delete(node, mh); @@ -445,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) break; } } - spin_unlock(&node->send_queue_lock); + spin_unlock_bh(&node->send_queue_lock); rcu_read_unlock(); } @@ -468,12 +469,26 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); } +static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +{ + switch (p->header.h_cmd) { + case DLM_MSG: + trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message); + break; + case DLM_RCOM: + trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom); + break; + default: + break; + } +} + static void dlm_midcomms_receive_buffer(union dlm_packet *p, struct midcomms_node *node, uint32_t seq) @@ -525,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -533,7 +548,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); break; @@ -754,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) goto out; } - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, nodeid); break; case DLM_OPTS: @@ -874,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) dlm_receive_buffer(p, nodeid); } -/* - * Called from the low-level comms layer to process a buffer of - * commands. - */ - -int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) { const unsigned char *ptr = buf; const struct dlm_header *hd; @@ -914,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; + ret += msglen; + len -= msglen; + ptr += msglen; + } + + return ret; +} + +/* + * Called from the low-level comms layer to process a buffer of + * commands. 
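+ * The buffer is typically pre-checked with dlm_validate_incoming_buffer(),
+ * which reports how many leading bytes form complete dlm messages.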
+ */ +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) +{ + const unsigned char *ptr = buf; + const struct dlm_header *hd; + uint16_t msglen; + int ret = 0; + + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; + + msglen = le16_to_cpu(hd->h_length); + if (msglen > len) + break; + switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); @@ -1030,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data) atomic_inc(&mh->node->send_queue_cnt); - spin_lock(&mh->node->send_queue_lock); + spin_lock_bh(&mh->node->send_queue_lock); list_add_tail_rcu(&mh->list, &mh->node->send_queue); - spin_unlock(&mh->node->send_queue_lock); + spin_unlock_bh(&mh->node->send_queue_lock); mh->seq = mh->node->seq_send++; } @@ -1055,7 +1092,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node dlm_fill_opts_header(opts, len, mh->seq); *ppc += sizeof(*opts); - mh->inner_hd = (const struct dlm_header *)*ppc; + mh->inner_p = (const union dlm_packet *)*ppc; return msg; } @@ -1079,9 +1116,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, } /* this is a bug, however we going on and hope it will be resolved */ - WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); - mh = dlm_allocate_mhandle(); + mh = dlm_allocate_mhandle(allocation); if (!mh) goto err; @@ -1111,7 +1148,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, break; default: dlm_free_mhandle(mh); - WARN_ON(1); + WARN_ON_ONCE(1); goto err; } @@ -1130,11 +1167,32 @@ err: } #endif -static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, + const void *name, int namelen) +{ + switch (mh->inner_p->header.h_cmd) { + case DLM_MSG: + trace_dlm_send_message(mh->node->nodeid, mh->seq, + &mh->inner_p->message, + name, namelen); + break; + case DLM_RCOM: + trace_dlm_send_rcom(mh->node->nodeid, mh->seq, + &mh->inner_p->rcom); + break; + default: + /* nothing to trace */ + break; + } +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, + const void *name, int namelen) { /* nexthdr chain for fast lookup */ - mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; mh->committed = true; + dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); dlm_lowcomms_commit_msg(mh->msg); } @@ -1142,8 +1200,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) * dlm_midcomms_get_mhandle */ #ifndef __CHECKER__ -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, + const void *name, int namelen) { + switch (mh->node->version) { case DLM_VERSION_3_1: srcu_read_unlock(&nodes_srcu, mh->idx); @@ -1154,12 +1214,12 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: - dlm_midcomms_commit_msg_3_2(mh); + dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); break; default: srcu_read_unlock(&nodes_srcu, mh->idx); - WARN_ON(1); + WARN_ON_ONCE(1); break; } } @@ -1167,12 +1227,27 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) int dlm_midcomms_start(void) { + return dlm_lowcomms_start(); +} + +void dlm_midcomms_stop(void) +{ + dlm_lowcomms_stop(); +} + +void dlm_midcomms_init(void) +{ int i; for (i = 0; i < CONN_HASH_SIZE; i++) 
INIT_HLIST_HEAD(&node_hash[i]); - return dlm_lowcomms_start(); + dlm_lowcomms_init(); +} + +void dlm_midcomms_exit(void) +{ + dlm_lowcomms_exit(); } static void dlm_act_fin_ack_rcv(struct midcomms_node *node) @@ -1201,7 +1276,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); log_print("%s: unexpected state: %d\n", __func__, node->state); - WARN_ON(1); + WARN_ON_ONCE(1); return; } spin_unlock(&node->state_lock); @@ -1319,7 +1394,7 @@ static void midcomms_node_release(struct rcu_head *rcu) { struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); - WARN_ON(atomic_read(&node->send_queue_cnt)); + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); kfree(node); } @@ -1372,11 +1447,13 @@ static void midcomms_shutdown(struct midcomms_node *node) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); midcomms_node_reset(node); + dlm_lowcomms_shutdown_node(node->nodeid, true); return; } pr_debug("active shutdown done for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); + dlm_lowcomms_shutdown_node(node->nodeid, false); } void dlm_midcomms_shutdown(void) @@ -1384,6 +1461,8 @@ void dlm_midcomms_shutdown(void) struct midcomms_node *node; int i, idx; + dlm_lowcomms_shutdown(); + mutex_lock(&close_lock); idx = srcu_read_lock(&nodes_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { @@ -1401,8 +1480,6 @@ void dlm_midcomms_shutdown(void) } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); - - dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 82bcd9661922..bea1cee4279c 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -14,12 +14,17 @@ struct midcomms_node; +int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len); int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, + int namelen); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); +void dlm_midcomms_stop(void); +void dlm_midcomms_init(void); +void dlm_midcomms_exit(void); void dlm_midcomms_shutdown(void); void dlm_midcomms_add_member(int nodeid); void dlm_midcomms_remove_member(int nodeid); diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f19860315043..b76d52e2f6bd 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); } static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) @@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf = (struct rcom_config *) rc->rc_buf; rf->rf_lvblen = cpu_to_le32(~0U); - dlm_midcomms_commit_mhandle(mh); + dlm_midcomms_commit_mhandle(mh, NULL, 0); return 0; } diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 036a9a0078f6..8be2893ad15b 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; - memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length)); + 
memcpy(&e->request, ms, sizeof(*ms)); + memcpy(&e->request.m_extra, ms->m_extra, length); atomic_inc(&ls->ls_requestqueue_cnt); mutex_lock(&ls->ls_requestqueue_mutex); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index c5d27bccc3dc..35129505ddda 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -25,6 +25,7 @@ #include "user.h" #include "ast.h" #include "config.h" +#include "memory.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; @@ -175,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status) being removed and then remove that lkb from the orphans list and free it */ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq) + int status, uint32_t sbflags) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -209,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_lock(&proc->asts_spin); - rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq); - if (rv < 0) { + rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags); + switch (rv) { + case DLM_ENQUEUE_CALLBACK_FAILURE: spin_unlock(&proc->asts_spin); + WARN_ON_ONCE(1); goto out; - } - - if (list_empty(&lkb->lkb_cb_list)) { + case DLM_ENQUEUE_CALLBACK_NEED_SCHED: kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_cb_list, &proc->asts); wake_up_interruptible(&proc->wait); + break; + case DLM_ENQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; } spin_unlock(&proc->asts_spin); @@ -800,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - struct dlm_callback cb; - int rv, resid, copy_lvb = 0; + struct dlm_callback *cb; + int rv, copy_lvb = 0; int old_mode, new_mode; if (count == sizeof(struct dlm_device_version)) { @@ -857,53 +864,58 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ - lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list); + lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list); /* rem_lkb_callback sets a new lkb_last_cast */ - old_mode = lkb->lkb_last_cast.mode; + old_mode = lkb->lkb_last_cast->mode; - rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid); - if (rv < 0) { + rv = dlm_dequeue_lkb_callback(lkb, &cb); + switch (rv) { + case DLM_DEQUEUE_CALLBACK_EMPTY: /* this shouldn't happen; lkb should have been removed from - list when resid was zero */ + * list when last item was dequeued + */ log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id); list_del_init(&lkb->lkb_cb_list); spin_unlock(&proc->asts_spin); /* removes ref for proc->asts, may cause lkb to be freed */ dlm_put_lkb(lkb); + WARN_ON_ONCE(1); goto try_another; - } - if (!resid) + case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); - spin_unlock(&proc->asts_spin); - - if (cb.flags & DLM_CB_SKIP) { - /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) - dlm_put_lkb(lkb); - goto try_another; + lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + break; + case DLM_DEQUEUE_CALLBACK_SUCCESS: + break; + default: + WARN_ON_ONCE(1); + break; } + spin_unlock(&proc->asts_spin); - if (cb.flags & DLM_CB_BAST) { - trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode); - } else if (cb.flags & DLM_CB_CAST) { - new_mode = cb.mode; + if (cb->flags & DLM_CB_BAST) { + 
trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode); + } else if (cb->flags & DLM_CB_CAST) { + new_mode = cb->mode; - if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && + if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr && dlm_lvb_operations[old_mode + 1][new_mode + 1]) copy_lvb = 1; - lkb->lkb_lksb->sb_status = cb.sb_status; - lkb->lkb_lksb->sb_flags = cb.sb_flags; + lkb->lkb_lksb->sb_status = cb->sb_status; + lkb->lkb_lksb->sb_flags = cb->sb_flags; trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); } rv = copy_result_to_user(lkb->lkb_ua, test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - cb.flags, cb.mode, copy_lvb, buf, count); + cb->flags, cb->mode, copy_lvb, buf, count); + + kref_put(&cb->ref, dlm_release_callback); /* removes ref for proc->asts, may cause lkb to be freed */ - if (!resid) + if (rv == DLM_DEQUEUE_CALLBACK_LAST) dlm_put_lkb(lkb); return rv; diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 6b9bce6b96e0..33059452d79e 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -7,7 +7,7 @@ #define __USER_DOT_H__ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, - int status, uint32_t sbflags, uint64_t seq); + int status, uint32_t sbflags); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c214fe0981bd..f3cd00fac9c3 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -18,6 +18,8 @@ #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include <linux/fileattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" @@ -1120,6 +1122,28 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns, return rc; } +static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry), + posix_acl_xattr_name(type)); +} + +static int ecryptfs_set_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct posix_acl *acl, + int type) +{ + int rc; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_inode = d_inode(lower_dentry); + + rc = vfs_set_acl(&init_user_ns, lower_dentry, + posix_acl_xattr_name(type), acl); + if (!rc) + fsstack_copy_attr_all(d_inode(dentry), lower_inode); + return rc; +} + const struct inode_operations ecryptfs_symlink_iops = { .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, @@ -1143,6 +1167,8 @@ const struct inode_operations ecryptfs_dir_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; const struct inode_operations ecryptfs_main_iops = { @@ -1152,6 +1178,8 @@ const struct inode_operations ecryptfs_main_iops = { .listxattr = ecryptfs_listxattr, .fileattr_get = ecryptfs_fileattr_get, .fileattr_set = ecryptfs_fileattr_set, + .get_acl = ecryptfs_get_acl, + .set_acl = ecryptfs_set_acl, }; static int ecryptfs_xattr_get(const struct xattr_handler *handler, @@ -1182,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = { }; const struct xattr_handler *ecryptfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif &ecryptfs_xattr_handler, NULL }; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fe8ac0e163f7..f57f921683d7 100644 --- a/fs/erofs/data.c 
+++ b/fs/erofs/data.c @@ -13,9 +13,7 @@ void erofs_unmap_metabuf(struct erofs_buf *buf) { if (buf->kmap_type == EROFS_KMAP) - kunmap(buf->page); - else if (buf->kmap_type == EROFS_KMAP_ATOMIC) - kunmap_atomic(buf->base); + kunmap_local(buf->base); buf->base = NULL; buf->kmap_type = EROFS_NO_KMAP; } @@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, } if (buf->kmap_type == EROFS_NO_KMAP) { if (type == EROFS_KMAP) - buf->base = kmap(page); - else if (type == EROFS_KMAP_ATOMIC) - buf->base = kmap_atomic(page); + buf->base = kmap_local_page(page); buf->kmap_type = type; } else if (buf->kmap_type != type) { DBG_BUGON(1); @@ -403,6 +399,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index af5ed6b9c54d..014e20962376 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -11,265 +11,201 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); static struct vfsmount *erofs_pseudo_mnt; -static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, +struct erofs_fscache_request { + struct erofs_fscache_request *primary; + struct netfs_cache_resources cache_resources; + struct address_space *mapping; /* The mapping being accessed */ + loff_t start; /* Start position */ + size_t len; /* Length of the request */ + size_t submitted; /* Length of submitted */ + short error; /* 0 or error that occurred */ + refcount_t ref; +}; + +static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping, loff_t start, size_t len) { - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); - if (!rreq) + req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL); + if (!req) return ERR_PTR(-ENOMEM); - rreq->start = start; - rreq->len = len; - rreq->mapping = mapping; - rreq->inode = mapping->host; - INIT_LIST_HEAD(&rreq->subrequests); - refcount_set(&rreq->ref, 1); - return rreq; -} + req->mapping = mapping; + req->start = start; + req->len = len; + refcount_set(&req->ref, 1); -static void erofs_fscache_put_request(struct netfs_io_request *rreq) -{ - if (!refcount_dec_and_test(&rreq->ref)) - return; - if (rreq->cache_resources.ops) - rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); + return req; } -static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq) +static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary, + size_t len) { - if (!refcount_dec_and_test(&subreq->ref)) - return; - erofs_fscache_put_request(subreq->rreq); - kfree(subreq); -} + struct erofs_fscache_request *req; -static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; + /* use primary request for the first submission */ + if (!primary->submitted) { + refcount_inc(&primary->ref); + return primary; + } - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - erofs_fscache_put_subrequest(subreq); + req = erofs_fscache_req_alloc(primary->mapping, + primary->start + primary->submitted, len); + if (!IS_ERR(req)) { + req->primary = primary; + 
refcount_inc(&primary->ref);
 }
+ return req;
 }
-static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
 {
- struct netfs_io_subrequest *subreq;
 struct folio *folio;
- unsigned int iopos = 0;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- bool subreq_failed = false;
+ bool failed = req->error;
+ pgoff_t start_page = req->start / PAGE_SIZE;
+ pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- subreq_failed = (subreq->error < 0);
+ XA_STATE(xas, &req->mapping->i_pages, start_page);
 rcu_read_lock();
 xas_for_each(&xas, folio, last_page) {
- unsigned int pgpos, pgend;
- bool pg_failed = false;
-
 if (xas_retry(&xas, folio))
 continue;
-
- pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
- pgend = pgpos + folio_size(folio);
-
- for (;;) {
- if (!subreq) {
- pg_failed = true;
- break;
- }
-
- pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
- break;
-
- iopos += subreq->len;
- if (!list_is_last(&subreq->rreq_link,
- &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
- if (pgend == iopos)
- break;
- }
-
- if (!pg_failed)
+ if (!failed)
 folio_mark_uptodate(folio);
-
 folio_unlock(folio);
 }
 rcu_read_unlock();
 }
-static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+static void erofs_fscache_req_put(struct erofs_fscache_request *req)
 {
- erofs_fscache_rreq_unlock_folios(rreq);
- erofs_fscache_clear_subrequests(rreq);
- erofs_fscache_put_request(rreq);
+ if (refcount_dec_and_test(&req->ref)) {
+ if (req->cache_resources.ops)
+ req->cache_resources.ops->end_operation(&req->cache_resources);
+ if (!req->primary)
+ erofs_fscache_req_complete(req);
+ else
+ erofs_fscache_req_put(req->primary);
+ kfree(req);
+ }
 }
-static void erofc_fscache_subreq_complete(void *priv,
+static void erofs_fscache_subreq_complete(void *priv,
 ssize_t transferred_or_error, bool was_async)
 {
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error))
- subreq->error = transferred_or_error;
+ struct erofs_fscache_request *req = priv;
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
-
- erofs_fscache_put_subrequest(subreq);
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ if (req->primary)
+ req->primary->error = transferred_or_error;
+ else
+ req->error = transferred_or_error;
+ }
+ erofs_fscache_req_put(req);
 }
 /*
- * Read data from fscache and fill the read data into page cache described by
- * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes
- * the start physical address in the cache file.
+ * Read data from fscache (cookie, pstart, len), and fill the read data into
+ * page cache described by (req->mapping, lstart, len). @pstart describes the
+ * start physical address in the cache file.
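+ * The logical start within the mapping is implicit:
+ * lstart = req->start + req->submitted.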
*/ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, - struct netfs_io_request *rreq, loff_t pstart) + struct erofs_fscache_request *req, loff_t pstart, size_t len) { enum netfs_io_source source; - struct super_block *sb = rreq->mapping->host->i_sb; - struct netfs_io_subrequest *subreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; + struct super_block *sb = req->mapping->host->i_sb; + struct netfs_cache_resources *cres = &req->cache_resources; struct iov_iter iter; - loff_t start = rreq->start; - size_t len = rreq->len; + loff_t lstart = req->start + req->submitted; size_t done = 0; int ret; - atomic_set(&rreq->nr_outstanding, 1); + DBG_BUGON(len > req->len - req->submitted); ret = fscache_begin_read_operation(cres, cookie); if (ret) - goto out; + return ret; while (done < len) { - subreq = kzalloc(sizeof(struct netfs_io_subrequest), - GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->ref, 2); - subreq->rreq = rreq; - refcount_inc(&rreq->ref); - } else { - ret = -ENOMEM; - goto out; - } - - subreq->start = pstart + done; - subreq->len = len - done; - subreq->flags = 1 << NETFS_SREQ_ONDEMAND; + loff_t sstart = pstart + done; + size_t slen = len - done; + unsigned long flags = 1 << NETFS_SREQ_ONDEMAND; - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - source = cres->ops->prepare_read(subreq, LLONG_MAX); - if (WARN_ON(subreq->len == 0)) + source = cres->ops->prepare_ondemand_read(cres, + sstart, &slen, LLONG_MAX, &flags, 0); + if (WARN_ON(slen == 0)) source = NETFS_INVALID_READ; if (source != NETFS_READ_FROM_CACHE) { - erofs_err(sb, "failed to fscache prepare_read (source %d)", - source); - ret = -EIO; - subreq->error = ret; - erofs_fscache_put_subrequest(subreq); - goto out; + erofs_err(sb, "failed to fscache prepare_read (source %d)", source); + return -EIO; } - atomic_inc(&rreq->nr_outstanding); + refcount_inc(&req->ref); + iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages, + lstart + done, slen); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - start + done, subreq->len); - - ret = fscache_read(cres, subreq->start, &iter, - NETFS_READ_HOLE_FAIL, - erofc_fscache_subreq_complete, subreq); + ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL, + erofs_fscache_subreq_complete, req); if (ret == -EIOCBQUEUED) ret = 0; if (ret) { erofs_err(sb, "failed to fscache_read (ret %d)", ret); - goto out; + return ret; } - done += subreq->len; + done += slen; } -out: - if (atomic_dec_and_test(&rreq->nr_outstanding)) - erofs_fscache_rreq_complete(rreq); - - return ret; + DBG_BUGON(done != len); + return 0; } static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_dev mdev = { .m_deviceid = 0, .m_pa = folio_pos(folio), }; ret = erofs_map_dev(sb, &mdev); - if (ret) - goto out; + if (ret) { + folio_unlock(folio); + return ret; + } - rreq = erofs_fscache_alloc_request(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out; + if (IS_ERR(req)) { + folio_unlock(folio); + return PTR_ERR(req); } - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa); -out: - folio_unlock(folio); + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa, folio_size(folio)); 
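+ /* a submission error is recorded on the request so that
+ * erofs_fscache_req_complete() leaves the folios !uptodate
+ */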
+ if (ret) + req->error = ret; + + erofs_fscache_req_put(req); return ret; } -/* - * Read into page cache in the range described by (@pos, @len). - * - * On return, the caller is responsible for page unlocking if the output @unlock - * is true, or the callee will take this responsibility through netfs_io_request - * interface. - * - * The return value is the number of bytes successfully handled, or negative - * error code on failure. The only exception is that, the length of the range - * instead of the error code is returned on failure after netfs_io_request is - * allocated, so that .readahead() could advance rac accordingly. - */ -static int erofs_fscache_data_read(struct address_space *mapping, - loff_t pos, size_t len, bool *unlock) +static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) { + struct address_space *mapping = primary->mapping; struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; - struct netfs_io_request *rreq; + struct erofs_fscache_request *req; struct erofs_map_blocks map; struct erofs_map_dev mdev; struct iov_iter iter; + loff_t pos = primary->start + primary->submitted; size_t count; int ret; - *unlock = true; - map.m_la = pos; ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) @@ -290,24 +226,26 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (IS_ERR(src)) return PTR_ERR(src); - iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE); if (copy_to_iter(src + offset, size, &iter) != size) { erofs_put_metabuf(&buf); return -EFAULT; } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); - return PAGE_SIZE; + primary->submitted += PAGE_SIZE; + return 0; } + count = primary->len - primary->submitted; if (!(map.m_flags & EROFS_MAP_MAPPED)) { - count = len; - iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); + iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); - return count; + primary->submitted += count; + return 0; } - count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + count = min_t(size_t, map.m_llen - (pos - map.m_la), count); DBG_BUGON(!count || count % PAGE_SIZE); mdev = (struct erofs_map_dev) { @@ -318,64 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping, if (ret) return ret; - rreq = erofs_fscache_alloc_request(mapping, pos, count); - if (IS_ERR(rreq)) - return PTR_ERR(rreq); + req = erofs_fscache_req_chain(primary, count); + if (IS_ERR(req)) + return PTR_ERR(req); - *unlock = false; - erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); - return count; + ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + req, mdev.m_pa + (pos - map.m_la), count); + erofs_fscache_req_put(req); + primary->submitted += count; + return ret; } -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +static int erofs_fscache_data_read(struct erofs_fscache_request *req) { - bool unlock; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + do { + ret = erofs_fscache_data_read_slice(req); + if (ret) + req->error = ret; + } while (!ret && req->submitted < req->len); - ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), - folio_size(folio), &unlock); - if (unlock) { - if (ret > 0) - folio_mark_uptodate(folio); + return ret; +} + +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +{ + struct 
erofs_fscache_request *req; + int ret; + + req = erofs_fscache_req_alloc(folio_mapping(folio), + folio_pos(folio), folio_size(folio)); + if (IS_ERR(req)) { folio_unlock(folio); + return PTR_ERR(req); } - return ret < 0 ? ret : 0; + + ret = erofs_fscache_data_read(req); + erofs_fscache_req_put(req); + return ret; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct folio *folio; - size_t len, done = 0; - loff_t start, pos; - bool unlock; - int ret, size; + struct erofs_fscache_request *req; if (!readahead_count(rac)) return; - start = readahead_pos(rac); - len = readahead_length(rac); + req = erofs_fscache_req_alloc(rac->mapping, + readahead_pos(rac), readahead_length(rac)); + if (IS_ERR(req)) + return; - do { - pos = start + done; - ret = erofs_fscache_data_read(rac->mapping, pos, - len - done, &unlock); - if (ret <= 0) - return; + /* The request completion will drop refs on the folios. */ + while (readahead_folio(rac)) + ; - size = ret; - while (size) { - folio = readahead_folio(rac); - size -= folio_size(folio); - if (unlock) { - folio_mark_uptodate(folio); - folio_unlock(folio); - } - } - } while ((done += ret) < len); + erofs_fscache_data_read(req); + erofs_fscache_req_put(req); } static const struct address_space_operations erofs_fscache_meta_aops = { @@ -494,7 +433,8 @@ static int erofs_fscache_register_domain(struct super_block *sb) static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -516,7 +456,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, fscache_use_cookie(cookie, false); ctx->cookie = cookie; - if (need_inode) { + if (flags & EROFS_REG_COOKIE_NEED_INODE) { struct inode *const inode = new_inode(sb); if (!inode) { @@ -554,14 +494,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) static struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { int err; struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + ctx = erofs_fscache_acquire_cookie(sb, name, flags); if (IS_ERR(ctx)) return ctx; @@ -589,7 +530,8 @@ out: static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { struct inode *inode; struct erofs_fscache *ctx; @@ -602,23 +544,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, ctx = inode->i_private; if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) continue; - igrab(inode); + if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { + igrab(inode); + } else { + erofs_err(sb, "%s already exists in domain %s", name, + domain->domain_id); + ctx = ERR_PTR(-EEXIST); + } spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + ctx = erofs_fscache_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { if (EROFS_SB(sb)->domain_id) - return erofs_domain_register_cookie(sb, name, 
need_inode); - return erofs_fscache_acquire_cookie(sb, name, need_inode); + return erofs_domain_register_cookie(sb, name, flags); + return erofs_fscache_acquire_cookie(sb, name, flags); } void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) @@ -647,6 +596,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; + unsigned int flags; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -655,8 +605,20 @@ int erofs_fscache_register_fs(struct super_block *sb) if (ret) return ret; - /* acquired domain/volume will be relinquished in kill_sb() on error */ - fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true); + /* + * When shared domain is enabled, using NEED_NOEXIST to guarantee + * the primary data blob (aka fsid) is unique in the shared domain. + * + * For non-shared-domain case, fscache_acquire_volume() invoked by + * erofs_fscache_register_volume() has already guaranteed + * the uniqueness of primary data blob. + * + * Acquired domain/volume will be relinquished in kill_sb() on error. + */ + flags = EROFS_REG_COOKIE_NEED_INODE; + if (sbi->domain_id) + flags |= EROFS_REG_COOKIE_NEED_NOEXIST; + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); if (IS_ERR(fscache)) return PTR_ERR(fscache); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index ad2a82f2eb4c..d3b8736fa124 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode) case S_IFDIR: inode->i_op = &erofs_dir_iops; inode->i_fop = &erofs_dir_fops; + inode_nohighmem(inode); break; case S_IFLNK: err = erofs_fill_symlink(inode, kaddr, ofs); @@ -295,6 +296,7 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; @@ -371,7 +373,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; @@ -379,12 +381,12 @@ const struct inode_operations erofs_symlink_iops = { .get_link = page_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; const struct inode_operations erofs_fast_symlink_iops = { .get_link = simple_get_link, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, }; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 05dc68627722..bb8501c0ff5b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap() to map the buffer */ - EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */ + EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { @@ -604,13 +603,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_NEED_INODE 1 +#define EROFS_REG_COOKIE_NEED_NOEXIST 2 + /* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int 
erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode); + char *name, + unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; @@ -623,7 +627,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, bool need_inode) + char *name, + unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 0dc34721080c..b64a108fac92 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -228,6 +228,6 @@ const struct inode_operations erofs_dir_iops = { .lookup = erofs_lookup, .getattr = erofs_getattr, .listxattr = erofs_listxattr, - .get_acl = erofs_get_acl, + .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1c7dcca702b3..481788c24a68 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - fscache = erofs_fscache_register_cookie(sb, dif->path, false); + fscache = erofs_fscache_register_cookie(sb, dif->path, 0); if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8106bcb5a38d..a62fb8a3318a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) it->blkaddr += erofs_blknr(it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); it->ofs = erofs_blkoff(it->ofs); @@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); return vi->xattr_isize - xattr_header_sz; @@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; @@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it) it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP_ATOMIC); + EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); it->it.blkaddr = blkaddr; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b792d424d774..ccf7c55d477f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* how to allocate cached pages for a pcluster */ -enum z_erofs_cache_alloctype { - DONTALLOC, /* don't allocate any cached pages */ - /* - * try to use cached I/O if page allocation succeeds or fallback - * to in-place I/O instead to avoid any direct reclaim. 
- */ - TRYALLOC, -}; - /* * tagged pointer with 1-bit tag for all compressed pages * tag 0 - the page is just found with an extra page reference @@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend { .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } +static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) +{ + unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; + + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + fe->map.m_la < fe->headoffset) + return true; + + return false; +} + static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - enum z_erofs_cache_alloctype type, struct page **pagepool) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* * optimistic allocation without direct reclaim since inplace I/O @@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } else { /* I/O is needed, no possible to decompress directly */ standalone = false; - switch (type) { - case TRYALLOC: - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - continue; - set_page_private(newpage, - Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - break; - default: /* DONTALLOC */ + if (!shouldalloc) continue; - } + + /* + * try to use cached I/O if page allocation + * succeeds or fallback to in-place I/O instead + * to avoid any direct reclaim. + */ + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, @@ -488,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) struct erofs_workgroup *grp; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED)) { + if (!(map->m_flags & EROFS_MAP_ENCODED) || + (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -637,20 +646,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, - unsigned int cachestrategy, - erofs_off_t la) -{ - if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) - return false; - - if (fe->backmost) - return true; - - return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && - la < fe->headoffset; -} - static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, struct page *page, unsigned int pageofs, unsigned int len) @@ -687,12 +682,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - - enum z_erofs_cache_alloctype cache_strategy; unsigned int cur, end, spiltted; int err = 0; @@ -746,13 +738,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, - map->m_la)) - cache_strategy = TRYALLOC; - else - cache_strategy = DONTALLOC; - - z_erofs_bind_cache(fe, 
cache_strategy, pagepool); + z_erofs_bind_cache(fe, pagepool); } hitted: /* diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 0bb66927e3d0..0150570c33aa 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int advise, type; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); @@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP_ATOMIC); + erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); @@ -694,10 +694,15 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) - goto out; + goto unmap_out; } if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (map->m_llen > map->m_plen) { + DBG_BUGON(1); + err = -EFSCORRUPTED; + goto unmap_out; + } if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; @@ -718,14 +723,12 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; } + unmap_out: erofs_unmap_metabuf(&m.map->buf); - -out: erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", __func__, map->m_la, map->m_pa, map->m_llen, map->m_plen, map->m_flags); - return err; } diff --git a/fs/exec.c b/fs/exec.c index a0b1f0337a62..ab913243a367 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> +#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -171,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) exit: fput(file); out: - return error; + return error; } #endif /* #ifdef CONFIG_USELIB */ @@ -199,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; - unsigned int gup_flags = FOLL_FORCE; + unsigned int gup_flags = 0; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -842,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm, * will align it up. 
*/ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; + + stack_expand = min(rlim_stack, stack_size + stack_expand); + #ifdef CONFIG_STACK_GROWSUP - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_start + rlim_stack; - else - stack_base = vma->vm_end + stack_expand; + stack_base = vma->vm_start + stack_expand; #else - if (stack_size + stack_expand > rlim_stack) - stack_base = vma->vm_end - rlim_stack; - else - stack_base = vma->vm_start - stack_expand; + stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); @@ -1297,6 +1295,10 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; + retval = exec_task_namespaces(); + if (retval) + goto out_unlock; + #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); @@ -1568,6 +1570,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm) if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + /* + * If another task is sharing our fs, we cannot safely + * suid exec because the differently privileged task + * will be able to manipulate the current directory, etc. + * It would be nice to force an unshare instead... + */ t = p; n_fs = 1; spin_lock(&p->fs->lock); @@ -1591,8 +1599,8 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) struct user_namespace *mnt_userns; struct inode *inode = file_inode(file); unsigned int mode; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1611,23 +1619,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - uid = i_uid_into_mnt(mnt_userns, inode); - gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ - if (!kuid_has_mapping(bprm->cred->user_ns, uid) || - !kgid_has_mapping(bprm->cred->user_ns, gid)) + if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || + !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->euid = uid; + bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->egid = gid; + bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } @@ -1748,6 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm) return retval; } +/* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; @@ -1806,6 +1815,11 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval) return retval; + /* + * Check for unsafe execution states before exec_binprm(), which + * will call back into begin_new_exec(), into bprm_creds_from_file(), + * where setuid-ness is evaluated. 
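+ * Any unsafe state found is recorded in bprm->unsafe for the
+ * credential computation to consult.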
+ */ check_unsafe_exec(bprm); current->in_execve = 1; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bf298967c5b8..440d5f1e9d47 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 925ab6287d35..3841becb94ff 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); -extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 5dc0a31f4a08..eca60b747c6b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -667,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group, { ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group); ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group); - ext2_grpblk_t start, end; + ext2_grpblk_t start, end; unsigned long num = 0; start = 0; @@ -1481,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb) desc_count, bitmap_count); return bitmap_count; #else - for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { - desc = ext2_get_group_desc (sb, i, NULL); - if (!desc) - continue; - desc_count += le16_to_cpu(desc->bg_free_blocks_count); + for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { + desc = ext2_get_group_desc(sb, i, NULL); + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_blocks_count); } return desc_count; #endif diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 8f597753ac12..e5cbc27ba459 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - - if (IS_DIRSYNC(dir)) { - err = write_one_page(page); - if (!err) - err = sync_inode_metadata(dir, 1); - } else { - unlock_page(page); - } - - return err; + unlock_page(page); } static bool ext2_check_page(struct page *page, int quiet, char *kaddr) @@ -413,7 +403,7 @@ found: return de; } -/** +/* * Return the '..' directory entry and the page in which the entry was found * (as a parameter - p). 
* @@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ext2_get_block); } + +static int ext2_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); + return err; +} + void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, struct page *page, void *page_addr, struct inode *inode, int update_times) @@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, BUG_ON(err); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); - err = ext2_commit_chunk(page, pos, len); + ext2_commit_chunk(page, pos, len); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + ext2_handle_dirsync(dir); } /* @@ -566,10 +568,11 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); - err = ext2_commit_chunk(page, pos, rec_len); + ext2_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); + err = ext2_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ext2_put_page(page, page_addr); @@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page, if (pde) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; - err = ext2_commit_chunk(page, pos, to - from); + ext2_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); + err = ext2_handle_dirsync(inode); out: return err; } @@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); kunmap_atomic(kaddr); - err = ext2_commit_chunk(page, 0, chunk_size); + ext2_commit_chunk(page, 0, chunk_size); + err = ext2_handle_dirsync(inode); fail: put_page(page); return err; @@ -679,7 +684,7 @@ int ext2_empty_dir (struct inode * inode) page = ext2_get_page(inode, i, 0, &page_addr); if (IS_ERR(page)) - goto not_empty; + return 0; kaddr = page_addr; de = (ext2_dirent *)kaddr; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index eb97aa3d700e..6b4bebe982ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, .fileattr_get = ext2_fileattr_get, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index f4944c4dee60..78b8686d9a4a 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,7 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, NULL); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 918ab2f9e4c0..69aed9e2359e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static int ext2_writepage(struct page *page, struct writeback_control *wbc) -{ - return 
block_write_full_page(page, ext2_get_block, wbc); -} - static int ext2_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, ext2_get_block); @@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = ext2_read_folio, .readahead = ext2_readahead, - .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, @@ -1652,7 +1646,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } setattr_copy(&init_user_ns, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 9125eab85146..c056957221a2 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, .fileattr_get = ext2_fileattr_get, @@ -438,6 +438,6 @@ const struct inode_operations ext2_special_inode_operations = { .listxattr = ext2_listxattr, .getattr = ext2_getattr, .setattr = ext2_setattr, - .get_acl = ext2_get_acl, + .get_inode_acl = ext2_get_acl, .set_acl = ext2_set_acl, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 03f2af98b1b4..69c88facfe90 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1648,7 +1648,7 @@ static int __init init_ext2_fs(void) err = init_inodecache(); if (err) return err; - err = register_filesystem(&ext2_fs_type); + err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 57e82e25f8e2..a9f89539aeee 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 3219669732bf..09c4a8a3b716 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); -int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8d5453852f98..140e1eb300d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -558,7 +558,7 @@ enum { * * It's not paranoia if the Murphy's Law really *is* out to get you. 
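On the TEST_FLAG_VALUE() hunk just below: `1 << flag` overflows a 32-bit signed int once the flag index reaches 31, which is undefined behaviour, while `1U <<` is well defined for every index up to 31. A standalone illustration, assuming the file is compiled as C11 or later:

    #include <assert.h>

    int main(void)
    {
        /* (1 << 31) would shift into the sign bit of a 32-bit int:
         * undefined behaviour. The unsigned form is fully defined. */
        static_assert((1U << 31) == 0x80000000U, "bit 31 is well defined");
        return 0;
    }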
:-) */ -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) @@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ - EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ + EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ + EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -2999,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); @@ -3619,8 +3621,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode); +extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); @@ -3756,8 +3758,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite); + int len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 8e1fb18f465e..77f318ec8abb 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb) return 0; } -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, +handle_t *__ext4_journal_start_sb(struct inode *inode, + struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, int revoke_creds) { journal_t *journal; int err; - - trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, - _RET_IP_); + if (inode) + trace_ext4_journal_start_inode(inode, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); + else + trace_ext4_journal_start_sb(sb, blocks, rsv_blocks, + revoke_creds, type, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index db2ae4a2b38d..0c77697d5e90 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks, - int revoke_creds); +handle_t *__ext4_journal_start_sb(struct inode *inode, struct 
super_block *sb, + unsigned int line, int type, int blocks, + int rsv_blocks, int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb) } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\ ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ @@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds) { - return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks, rsv_blocks, revoke_creds); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6c399a8b22b3..9de1c9d1a13d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2635,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unwritten, ex_ee_len); path[depth].p_ext = ex; - a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; + a = max(ex_ee_block, start); + b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); @@ -5567,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, - len_lblk, SHIFT_RIGHT); + max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) @@ -5799,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) struct ext4_extent *extent; ext4_lblk_t first_lblk, first_lclu, last_lclu; + /* + * if data can be stored inline, the logical cluster isn't + * mapped - no physical clusters have been allocated, and the + * file has no extents + */ + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return 0; + /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); if (IS_ERR(path)) { diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index cd0a861853e3..7bc221038c6c 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, int __init ext4_init_es(void) { - ext4_es_cachep = kmem_cache_create("ext4_extent_status", - sizeof(struct extent_status), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; @@ -1371,7 +1369,7 @@ retry: if (count_reserved) count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, &orig_es, &rc); - goto out; + goto out_get_reserved; } if (len1 > 0) { @@ -1413,6 +1411,7 @@ retry: } } +out_get_reserved: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); out: @@ -1807,9 +1806,7 @@ static void ext4_print_pending_tree(struct inode *inode) int __init ext4_init_pending(void) { - ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", - sizeof(struct pending_reservation), - 0, (SLAB_RECLAIM_ACCOUNT), NULL); + ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) 
return -ENOMEM; return 0; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 0f6d0a80467d..4594b62f147b 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); + + if (IS_ENCRYPTED(dir)) { + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, + NULL); + mutex_lock(&ei->i_fc_lock); + return -EOPNOTSUPP; + } + node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; - node->fcd_parent = dentry->d_parent->d_inode->i_ino; + node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); - ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM, NULL); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -666,18 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) /* Ext4 commit path routines */ -/* memzero and update CRC */ -static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, - u32 *crc) -{ - void *ret; - - ret = memset(dst, 0, len); - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); - return ret; -} - /* * Allocate len bytes on a fast commit buffer. * @@ -691,62 +688,60 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { - struct ext4_fc_tl *tl; + struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; - int pad_len; + int remaining; + u8 *dst; /* - * After allocating len, we should have space at least for a 0 byte - * padding. + * If 'len' is too long to fit in any block alongside a PAD tlv, then we + * cannot fulfill the request. */ - if (len + EXT4_FC_TAG_BASE_LEN > bsize) + if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; - if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) { - /* - * Only allocate from current buffer if we have enough space for - * this request AND we have space to add a zero byte padding. - */ - if (!sbi->s_fc_bh) { - ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); - if (ret) - return NULL; - sbi->s_fc_bh = bh; - } + if (!sbi->s_fc_bh) { + ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); + if (ret) + return NULL; + sbi->s_fc_bh = bh; + } + dst = sbi->s_fc_bh->b_data + off; + + /* + * Allocate the bytes in the current block if we can do so while still + * leaving enough space for a PAD tlv. 
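The allocation rule in ext4_fc_reserve_space() is easier to see outside the kernel: with bsize bytes per block and off bytes already used, a request of len bytes fits only if a tag header's worth of space can still follow it for a PAD tlv. A toy model of that decision; the 4-byte TAG_BASE_LEN and the numbers are chosen only for illustration:

    #include <stdio.h>

    #define TAG_BASE_LEN 4  /* 2-byte tag + 2-byte length in the toy format */

    /* returns bytes usable in the current block, or -1 if a new block is needed */
    static int fits_in_block(int bsize, int off, int len)
    {
        int remaining = bsize - TAG_BASE_LEN - off;

        return len <= remaining ? remaining : -1;
    }

    int main(void)
    {
        /* 4096-byte block, 4000 bytes used: a 92-byte request still fits... */
        printf("%d\n", fits_in_block(4096, 4000, 92));  /* 92 <= 92 -> ok */
        /* ...but a 93-byte one forces a PAD tlv and a fresh block. */
        printf("%d\n", fits_in_block(4096, 4000, 93));  /* -1 */
        return 0;
    }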
+ */ + remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; + if (len <= remaining) { sbi->s_fc_bytes += len; - return sbi->s_fc_bh->b_data + off; + return dst; } - /* Need to add PAD tag */ - tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); - tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; - tl->fc_len = cpu_to_le16(pad_len); - if (crc) - *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); - if (pad_len > 0) - ext4_fc_memzero(sb, tl + 1, pad_len, crc); + + /* + * Else, terminate the current block with a PAD tlv, then allocate a new + * block and allocate the bytes at the start of that new block. + */ + + tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); + tl.fc_len = cpu_to_le16(remaining); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); + *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; - sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; + sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } -/* memcpy to fc reserved space and update CRC */ -static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, - int len, u32 *crc) -{ - if (crc) - *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); - return memcpy(dst, src, len); -} - /* * Complete a fast commit by writing tail tag. * @@ -774,16 +769,20 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); - tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); + tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); - ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); + memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); + crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); - ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); + memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); + dst += sizeof(tail.fc_crc); + memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. 
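Worth noting in the ext4_fc_write_tail() hunk above: the checksum now covers the block contents exactly up to, but not including, the CRC field itself, and the slack after the tail is zeroed instead of being left uninitialized. The ordering, reduced to a toy layout with an additive stand-in for ext4_chksum():

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* toy additive "checksum" standing in for ext4_chksum() */
    static uint32_t csum(uint32_t c, const void *p, size_t n)
    {
        const uint8_t *b = p;

        while (n--)
            c += *b++;
        return c;
    }

    int main(void)
    {
        uint8_t block[32] = { 0 };
        size_t off = 0;
        uint32_t tid = 7, crc = 0;

        memcpy(block + off, "TAIL", 4); off += 4;            /* tl header */
        memcpy(block + off, &tid, sizeof(tid)); off += sizeof(tid);
        crc = csum(crc, block, off);    /* CRC covers everything so far */
        memcpy(block + off, &crc, sizeof(crc)); off += sizeof(crc);
        memset(block + off, 0, sizeof(block) - off);  /* no stale bytes */

        printf("crc=%u over %zu bytes\n", crc, off - sizeof(crc));
        return 0;
    }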
*/ ext4_fc_submit_bh(sb, true); @@ -807,8 +806,8 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); - ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); + memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } @@ -830,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; - ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); + memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); + memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } @@ -872,15 +871,11 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) if (!dst) goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc)) - goto err; + memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; - if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - goto err; + memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); - if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), - inode_len, crc)) - goto err; + memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); @@ -986,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); - ret = jbd2_submit_inode_data(ei->jinode); + ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); @@ -1388,7 +1383,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, return 0; } - ret = __ext4_unlink(NULL, old_parent, &entry, inode); + ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; @@ -1977,32 +1972,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } -static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl, - u8 *val, u8 *end) +static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, + int tag, int len) { - if (val + tl->fc_len > end) - return false; - - /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do - * journal rescan before do CRC check. Other tags length check will - * rely on CRC check. 
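The per-tag length validation above pairs with the replay-scan hunks a little further below: `end` becomes one past the block, the loop condition becomes `cur <= end - EXT4_FC_TAG_BASE_LEN`, and a tlv is rejected when its value would overrun the block (`tl.fc_len > end - val`). A self-contained scanner over a toy tlv block with the same bounds; the 4-byte header and native-endian reads are simplifications:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define TAG_LEN 4  /* 2-byte tag + 2-byte length, toy format */

    static int scan(const uint8_t *start, size_t bsize)
    {
        const uint8_t *end = start + bsize;  /* one past the block */
        const uint8_t *cur;
        uint16_t tag, len = 0;

        for (cur = start; cur <= end - TAG_LEN; cur += TAG_LEN + len) {
            memcpy(&tag, cur, 2);
            memcpy(&len, cur + 2, 2);
            if (len > end - (cur + TAG_LEN))
                return -1;  /* value would run off the block */
            printf("tag %u len %u\n", (unsigned)tag, (unsigned)len);
        }
        return 0;
    }

    int main(void)
    {
        uint8_t block[16] = { 0 };  /* four all-zero tlvs */

        return scan(block, sizeof(block));
    }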
- */ - switch (tl->fc_tag) { + switch (tag) { case EXT4_FC_TAG_ADD_RANGE: - return (sizeof(struct ext4_fc_add_range) == tl->fc_len); - case EXT4_FC_TAG_TAIL: - return (sizeof(struct ext4_fc_tail) <= tl->fc_len); - case EXT4_FC_TAG_HEAD: - return (sizeof(struct ext4_fc_head) == tl->fc_len); + return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: + return len == sizeof(struct ext4_fc_del_range); + case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: - case EXT4_FC_TAG_CREAT: + len -= sizeof(struct ext4_fc_dentry_info); + return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: + len -= sizeof(struct ext4_fc_inode); + return len >= EXT4_GOOD_OLD_INODE_SIZE && + len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: - default: - return true; + return true; /* padding can have any length */ + case EXT4_FC_TAG_TAIL: + return len >= sizeof(struct ext4_fc_tail); + case EXT4_FC_TAG_HEAD: + return len == sizeof(struct ext4_fc_head); } + return false; } /* @@ -2040,7 +2034,7 @@ static int ext4_fc_replay_scan(journal_t *journal, state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; @@ -2061,11 +2055,12 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; - if (!ext4_fc_tag_len_isvalid(&tl, val, end)) { + if (tl.fc_len > end - val || + !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? 
JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; @@ -2178,9 +2173,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, #endif start = (u8 *)bh->b_data; - end = (__u8 *)bh->b_data + journal->j_blocksize - 1; + end = start + journal->j_blocksize; - for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; @@ -2249,17 +2244,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) journal->j_fc_cleanup_callback = ext4_fc_cleanup; } -static const char *fc_ineligible_reasons[] = { - "Extended attributes changed", - "Cross rename", - "Journal flag changed", - "Insufficient memory", - "Swap boot", - "Resize", - "Dir renamed", - "Falloc range op", - "Data journalling", - "FC Commit Failed" +static const char * const fc_ineligible_reasons[] = { + [EXT4_FC_REASON_XATTR] = "Extended attributes changed", + [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", + [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", + [EXT4_FC_REASON_NOMEM] = "Insufficient memory", + [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", + [EXT4_FC_REASON_RESIZE] = "Resize", + [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", + [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", + [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", + [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index a6154c3ed135..2fadb2c4780c 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -58,7 +58,7 @@ struct ext4_fc_dentry_info { __u8 fc_dname[]; }; -/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ +/* Value structure for EXT4_FC_TAG_INODE. 
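The fc_ineligible_reasons[] rewrite above switches to designated initializers indexed by the EXT4_FC_REASON_* enum, so the strings stay aligned with the enum even if entries are reordered, and a missing entry shows up as a NULL hole rather than an off-by-one string. A compact demonstration of the idiom with made-up names:

    #include <stdio.h>

    enum reason { R_XATTR, R_RESIZE, R_ENCRYPTED_FILENAME, R_MAX };

    /* order of initializers no longer has to match the enum order */
    static const char * const reasons[] = {
        [R_ENCRYPTED_FILENAME] = "Encrypted filename",
        [R_XATTR]              = "Extended attributes changed",
        [R_RESIZE]             = "Resize",
    };

    int main(void)
    {
        for (int i = 0; i < R_MAX; i++)
            printf("%d: %s\n", i, reasons[i] ? reasons[i] : "(hole)");
        return 0;
    }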
*/ struct ext4_fc_inode { __le32 fc_ino; __u8 fc_raw_inode[]; @@ -96,6 +96,7 @@ enum { EXT4_FC_REASON_RENAME_DIR, EXT4_FC_REASON_FALLOC_RANGE, EXT4_FC_REASON_INODE_JOURNAL_DATA, + EXT4_FC_REASON_ENCRYPTED_FILENAME, EXT4_FC_REASON_MAX }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a7a597c727e6..7ac0a81bd371 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -955,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e9bc46684106..63f9bb6e8851 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -465,7 +465,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); @@ -870,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(p)) return PTR_ERR(p); @@ -1076,8 +1076,8 @@ repeat_in_this_group: if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); - handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, 0, + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 860fc5119009..c68bebe7ff4b 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; + unsigned int key; int ret = -EIO; *err = 0; @@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, if (!p->key) goto no_block; while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); + key = le32_to_cpu(p->key); + if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { + /* the block was out of range */ + ret = -EFSCORRUPTED; + goto failure; + } + bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4fbe825694b..2b42ececa46d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer, BUG_ON(len > EXT4_I(inode)->i_inline_size); - cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? - len : EXT4_MIN_INLINE_DATA_SIZE; + cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE); raw_inode = ext4_raw_inode(iloc); memcpy(buffer, (void *)(raw_inode->i_block), cp_len); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b5ef1b64249..9d9f414f99fe 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode) /* * For inodes with journalled data, transaction commit could have - * dirtied the inode. 
Flush worker is ignoring it because of I_FREEING - * flag but we still need to remove the inode from the writeback lists. + * dirtied the inode. And for inodes with dioread_nolock, unwritten + * extents converting worker could merge extents and also have dirtied + * the inode. Flush worker is ignoring it because of I_FREEING flag but + * we still need to remove the inode from the writeback lists. */ - if (!list_empty_careful(&inode->i_io_list)) { - WARN_ON_ONCE(!ext4_should_journal_data(inode)); + if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); - } /* * Protect us against freezing - iput() caller didn't have to have any @@ -335,6 +335,12 @@ stop_handle: ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: + /* + * Check out some where else accidentally dirty the evicting inode, + * which may probably cause inode use-after-free issues later. + */ + WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); + if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -1309,7 +1315,8 @@ static int ext4_write_end(struct file *file, trace_ext4_write_end(inode, pos, len, copied); - if (ext4_has_inline_data(inode)) + if (ext4_has_inline_data(inode) && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, page); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1543,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free) */ struct mpage_da_data { + /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; + unsigned int can_map:1; /* Can writepages call map blocks? 
*/ + /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ @@ -2009,7 +2019,6 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; - bool keep_towrite = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { folio_invalidate(folio, 0, folio_size(folio)); @@ -2067,7 +2076,6 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } - keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -2084,7 +2092,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); + ret = ext4_bio_write_page(&io_submit, page, len); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2118,7 +2126,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len, false); + err = ext4_bio_write_page(&mpd->io_submit, page, len); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; @@ -2551,18 +2559,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } +/* Return true if the page needs to be written as part of transaction commit */ +static bool ext4_page_nomap_can_writeout(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) + return true; + } while ((bh = bh->b_this_page) != head); + return false; +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages - * and underlying extent to map + * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for - * IO immediately. When we find a page which isn't mapped we start accumulating - * extent of buffers underlying these pages that needs mapping (formed by - * either delayed or unwritten buffers). We also lock the pages containing - * these buffers. The extent found is returned in @mpd structure (starting at - * mpd->lblk with length mpd->len blocks). + * IO immediately. If we cannot map blocks, we submit just already mapped + * buffers in the page for IO and keep page dirty. When we can map blocks and + * we find a page which isn't mapped we start accumulating extent of buffers + * underlying these pages that needs mapping (formed by either delayed or + * unwritten buffers). We also lock the pages containing these buffers. The + * extent found is returned in @mpd structure (starting at mpd->lblk with + * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. 
Although it may seem as an @@ -2653,14 +2676,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << - (PAGE_SHIFT - blkbits); - head = page_buffers(page); - err = mpage_process_page_bufs(mpd, head, head, lblk); - if (err <= 0) - goto out; - err = 0; + /* + * Writeout for transaction commit where we cannot + * modify metadata is simple. Just submit the page. + */ + if (!mpd->can_map) { + if (ext4_page_nomap_can_writeout(page)) { + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; + } else { + unlock_page(page); + mpd->first_page++; + } + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, + lblk); + if (err <= 0) + goto out; + err = 0; + } left--; } pagevec_release(&pvec); @@ -2673,25 +2712,27 @@ out: return err; } -static int ext4_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, + void *data) { + return ext4_writepage(page, wbc); +} + +static int ext4_do_writepages(struct mpage_da_data *mpd) +{ + struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; - struct mpage_da_data mpd; - struct inode *inode = mapping->host; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; - - percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); /* @@ -2703,7 +2744,9 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; if (ext4_should_journal_data(inode)) { - ret = generic_writepages(mapping, wbc); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); + blk_finish_plug(&plug); goto out_writepages; } @@ -2757,19 +2800,18 @@ static int ext4_writepages(struct address_space *mapping, writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd.first_page = writeback_index; - mpd.last_page = -1; + mpd->first_page = writeback_index; + mpd->last_page = -1; } else { - mpd.first_page = wbc->range_start >> PAGE_SHIFT; - mpd.last_page = wbc->range_end >> PAGE_SHIFT; + mpd->first_page = wbc->range_start >> PAGE_SHIFT; + mpd->last_page = wbc->range_end >> PAGE_SHIFT; } - mpd.inode = inode; - mpd.wbc = wbc; - ext4_io_submit_init(&mpd.io_submit, wbc); + ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + tag_pages_for_writeback(mapping, mpd->first_page, + mpd->last_page); blk_start_plug(&plug); /* @@ -2778,31 +2820,32 @@ retry: * in the block layer on device congestion while having transaction * started. 
*/ - mpd.do_map = 0; - mpd.scanned_until_end = 0; - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->do_map = 0; + mpd->scanned_until_end = 0; + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } - ret = mpage_prepare_extent_to_map(&mpd); + ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); + mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); - ext4_put_io_end_defer(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_io_submit(&mpd->io_submit); + ext4_put_io_end_defer(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; - while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { + while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ - mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!mpd.io_submit.io_end) { + mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } + WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when @@ -2822,16 +2865,16 @@ retry: "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; break; } - mpd.do_map = 1; + mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); - ret = mpage_prepare_extent_to_map(&mpd); - if (!ret && mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + ret = mpage_prepare_extent_to_map(mpd); + if (!ret && mpd->map.m_len) + ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, @@ -2846,12 +2889,12 @@ retry: if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; - mpd.do_map = 0; + mpd->do_map = 0; } /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, give_up_on_write); + mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); + ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have @@ -2861,11 +2904,11 @@ retry: * up doing unwritten extent conversion. 
*/ if (handle) { - ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else - ext4_put_io_end(mpd.io_submit.io_end); - mpd.io_submit.io_end = NULL; + ext4_put_io_end(mpd->io_submit.io_end); + mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2885,8 +2928,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd.last_page = writeback_index - 1; - mpd.first_page = 0; + mpd->last_page = writeback_index - 1; + mpd->first_page = 0; goto retry; } @@ -2896,15 +2939,51 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd.first_page; + mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); return ret; } +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct super_block *sb = mapping->host->i_sb; + struct mpage_da_data mpd = { + .inode = mapping->host, + .wbc = wbc, + .can_map = 1, + }; + int ret; + + if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + return -EIO; + + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + ret = ext4_do_writepages(&mpd); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + + return ret; +} + +int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = jinode->i_dirty_start, + .range_end = jinode->i_dirty_end, + }; + struct mpage_da_data mpd = { + .inode = jinode->i_vfs_inode, + .wbc = &wbc, + .can_map = 0, + }; + return ext4_do_writepages(&mpd); +} + static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -3646,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis, static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, @@ -3664,7 +3742,6 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3673,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = { .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3681,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, - .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, @@ -4225,7 +4302,8 @@ int ext4_truncate(struct inode *inode) /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { - if (ext4_inode_attach_jinode(inode) < 
0) + err = ext4_inode_attach_jinode(inode); + if (err) goto out_trace; } @@ -4473,9 +4551,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); - block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + block = ext4_inode_table(sb, gdp); + if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || + (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { + ext4_error(sb, "Invalid inode table block %llu in " + "block_group %u", block, iloc->block_group); + return -EFSCORRUPTED; + } + block += (inode_offset / inodes_per_block); + bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; @@ -5044,8 +5130,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); - brelse(iloc.bh); + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + ext4_error_inode(inode, function, line, 0, + "bad inode without EXT4_IGET_BAD flag"); + ret = -EUCLEAN; + goto bad_inode; + } + brelse(iloc.bh); unlock_new_inode(inode); return inode; @@ -5550,7 +5642,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); err_out: if (error) @@ -5853,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode, return 0; } + /* + * We may need to allocate external xattr block so we need quotas + * initialized. Here we can be called with various locks held so we + * cannot afford to initialize quotas ourselves. So just bail. 
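The __ext4_get_inode_loc() hunk above validates the group's inode-table start before deriving the block that holds the inode; the derivation itself is plain modular arithmetic. Worked in isolation, with hypothetical geometry values not tied to any real filesystem:

    #include <stdio.h>

    int main(void)
    {
        unsigned long ino = 12, inodes_per_group = 8192;
        unsigned long inodes_per_block = 16, inode_size = 256;
        unsigned long table_block = 1024;  /* group's inode table start */

        unsigned long index  = (ino - 1) % inodes_per_group;
        unsigned long block  = table_block + index / inodes_per_block;
        unsigned long offset = (index % inodes_per_block) * inode_size;

        /* inode 12 lives in the first table block, 11 slots in */
        printf("block %lu offset %lu\n", block, offset);
        return 0;
    }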
+ */ + if (dquot_initialize_needed(inode)) + return -EAGAIN; + /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 95dfea28bf4e..8067ccda34e4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -374,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb, blkcnt_t blocks; unsigned short bytes; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, + EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); @@ -424,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); - if (inode_bl->i_nlink == 0) { + if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); @@ -731,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) if (ext4_is_quota_file(inode)) return err; + err = dquot_initialize(inode); + if (err) + return err; + err = ext4_get_inode_loc(inode, &iloc); if (err) return err; @@ -746,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) brelse(iloc.bh); } - err = dquot_initialize(inode); - if (err) - return err; - handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); @@ -1153,19 +1154,22 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; - if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len))) + if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, + sizeof(fsuuid.fsu_len))) return -EFAULT; - return -EINVAL; + return 0; } - if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) + if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); - if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) + fsuuid.fsu_len = UUID_SIZE; + if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || + copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9dad93059945..5b2ae37a8b80 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5204,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { @@ -5253,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? 
"" : "non-"); - return 0; - } static noinline_for_stack void @@ -5591,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out; } - *errp = ext4_mb_initialize_context(ac, ar); - if (*errp) { - ar->len = 0; - goto out; - } + ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 588cb09c5291..4681fff6665f 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi) */ static unsigned int mmp_new_seq(void) { - u32 new_seq; - - do { - new_seq = get_random_u32(); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; + return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1); } /* diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c08c0aba1883..dd28453d6ea3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3204,14 +3204,20 @@ end_rmdir: return retval; } -int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, - struct inode *inode) +int __ext4_unlink(struct inode *dir, const struct qstr *d_name, + struct inode *inode, + struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + handle_t *handle; int skip_remove_dentry = 0; + /* + * Keep this outside the transaction; it may have to set up the + * directory's encryption key, which isn't GFP_NOFS-safe. + */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); @@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else - goto out; + goto out_bh; + } + + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + goto out_bh; } if (IS_DIRSYNC(dir)) @@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) - goto out; + goto out_handle; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) - goto out; + goto out_handle; } else { retval = 0; } @@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); retval = ext4_mark_inode_dirty(handle, inode); - -out: + if (dentry && !retval) + ext4_fc_track_unlink(handle, dentry); +out_handle: + ext4_journal_stop(handle); +out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { - handle_t *handle; int retval; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) @@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (retval) goto out_trace; - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - goto out_trace; - } - - retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry)); - if (!retval) - ext4_fc_track_unlink(handle, dentry); + retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. 
Eventually we'll want avoid @@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - if (handle) - ext4_journal_stop(handle); out_trace: trace_ext4_unlink_exit(dentry, retval); @@ -3794,6 +3798,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, retval = dquot_initialize(old.dir); if (retval) return retval; + retval = dquot_initialize(old.inode); + if (retval) + return retval; retval = dquot_initialize(new.dir); if (retval) return retval; @@ -4194,7 +4201,7 @@ const struct inode_operations ext4_dir_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, @@ -4205,6 +4212,6 @@ const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, - .get_acl = ext4_get_acl, + .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 69a9cf9137a6..e5b47dda3317 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); + "clearing orphan list."); es->s_last_orphan = 0; } ext4_debug("Skipping orphan recovery on fs with errors.\n"); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 97fa7b4c645f..beaec6d81074 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -430,25 +430,20 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, - int len, - bool keep_towrite) + int len) { struct page *bounce_page = NULL; struct inode *inode = page->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; - int nr_submitted = 0; int nr_to_submit = 0; struct writeback_control *wbc = io->io_wbc; + bool keep_towrite = false; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); ClearPageError(page); /* @@ -482,16 +477,31 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* A hole? We can safely clear the dirty bit */ if (!buffer_mapped(bh)) clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); + /* + * Keeping dirty some buffer we cannot write? Make sure + * to redirty the page and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the page. + * This happens e.g. when doing writeout for + * transaction commit. + */ + if (buffer_dirty(bh)) { + if (!PageDirty(page)) + redirty_page_for_writepage(wbc, page); + keep_towrite = true; + } continue; } if (buffer_new(bh)) clear_buffer_new(bh); set_buffer_async_write(bh); + clear_buffer_dirty(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); + /* Nothing to submit? Just unlock the page... 
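The rewritten buffer walk in ext4_bio_write_page() above decides per buffer head whether anything is submittable at all, and when a dirty buffer cannot be written it redirties the page and keeps the TOWRITE tag so racing WB_SYNC_ALL writeback does not skip it. Reduced to its decision table; this sketch omits the writeback bit, bounce pages, and size checks that the real code also handles:

    #include <stdbool.h>
    #include <stdio.h>

    struct bh { bool dirty, mapped, delay; };

    int main(void)
    {
        struct bh page[3] = {
            { .dirty = true,  .mapped = true,  .delay = false }, /* submit */
            { .dirty = true,  .mapped = false, .delay = false }, /* keep   */
            { .dirty = false, .mapped = true,  .delay = false }, /* skip   */
        };
        int nr_to_submit = 0;
        bool keep_towrite = false;

        for (int i = 0; i < 3; i++) {
            struct bh *bh = &page[i];

            if (!bh->dirty)
                continue;
            if (bh->mapped && !bh->delay)
                nr_to_submit++;       /* will go into a bio */
            else
                keep_towrite = true;  /* redirty, keep TOWRITE tag */
        }
        printf("submit=%d keep_towrite=%d\n", nr_to_submit, keep_towrite);
        return 0;
    }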
*/ + if (!nr_to_submit) + goto unlock; + bh = head = page_buffers(page); /* @@ -532,27 +542,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); redirty_page_for_writepage(wbc, page); do { - clear_buffer_async_write(bh); + if (buffer_async_write(bh)) { + clear_buffer_async_write(bh); + set_buffer_dirty(bh); + } bh = bh->b_this_page; } while (bh != head); goto unlock; } } + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; io_submit_add_bh(io, inode, bounce_page ? bounce_page : page, bh); - nr_submitted++; - clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - unlock: unlock_page(page); - /* Nothing submitted - we have to end page writeback */ - if (!nr_submitted) - end_page_writeback(page); return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3d21eae267fc..d5266932ce6c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if verity failed. */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } unlock_page(page); } if (bio->bi_private) @@ -410,9 +406,8 @@ int ext4_mpage_readpages(struct inode *inode, int __init ext4_init_post_read_processing(void) { - bio_post_read_ctx_cache = - kmem_cache_create("ext4_bio_post_read_ctx", - sizeof(struct bio_post_read_ctx), 0, 0, NULL); + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); + if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 46b87ffeb304..6b91443d6bf3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1110,6 +1110,16 @@ exit_free: return err; } +static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, + ext4_group_t group) +{ + struct ext4_super_block *es = (struct ext4_super_block *) data; + + es->s_block_group_nr = cpu_to_le16(group); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); +} + /* * Update the backup copies of the ext4 metadata. 
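 * (Editorial aside, not part of the patch: the KMEM_CACHE() conversion
 *  in readpage.c above is essentially shorthand for
 *
 *      bio_post_read_ctx_cache =
 *              kmem_cache_create("bio_post_read_ctx",
 *                                sizeof(struct bio_post_read_ctx),
 *                                __alignof__(struct bio_post_read_ctx),
 *                                SLAB_RECLAIM_ACCOUNT, NULL);
 *
 *  since the macro derives the cache name, size and alignment from the
 *  struct type itself; note that the name visible in /proc/slabinfo
 *  therefore loses its old "ext4_" prefix.)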
These don't need to be part * of the main resize transaction, because e2fsck will re-write them if there @@ -1158,7 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; - struct ext4_super_block *es; + int has_super = ext4_bg_has_super(sb, group); + ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); @@ -1168,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, if (meta_bg == 0) backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else - backup_block = (ext4_group_first_block_no(sb, group) + - ext4_bg_has_super(sb, group)); + backup_block = first_block + has_super; bh = sb_getblk(sb, backup_block); if (unlikely(!bh)) { @@ -1187,10 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); - es = (struct ext4_super_block *) bh->b_data; - es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (has_super && (backup_block == first_block)) + ext4_set_block_group_nr(sb, bh->b_data, group); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1476,8 +1484,6 @@ static void ext4_update_super(struct super_block *sb, * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, @@ -1513,6 +1519,8 @@ static void ext4_update_super(struct super_block *sb, ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, @@ -1596,8 +1604,8 @@ exit_journal: int meta_bg = ext4_has_feature_meta_bg(sb); sector_t old_gdb = 0; - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block), 0); + update_backups(sb, ext4_group_first_block_no(sb, 0), + (char *)es, sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; @@ -1808,7 +1816,7 @@ errout: if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); } return err; @@ -1831,7 +1839,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1886,8 +1893,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - err = ext4_group_extend_no_check(sb, o_blocks_count, add); - return err; + return ext4_group_extend_no_check(sb, o_blocks_count, add); } /* ext4_group_extend */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7cdd2138c897..16a343e8047d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -540,8 +540,7 @@ static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode 
*jinode) if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else - ret = jbd2_journal_submit_inode_data_buffers(jinode); - + ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } @@ -1206,7 +1205,8 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) - ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", + &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@ -1323,6 +1323,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); atomic_set(&ei->i_prealloc_active, 0); @@ -2247,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - error = fs_lookup_param(fc, param, 1, &path); + error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); @@ -3778,7 +3779,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3925,8 +3926,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. */ - elr->lr_next_sched = jiffies + prandom_u32_max( - EXT4_DEF_LI_MAX_START_DELAY * HZ); + elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } @@ -5287,14 +5287,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); + "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { @@ -5655,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Quota mode: %s.", descr, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. 
" + "Quota mode: %s.", &sb->s_uuid, descr, + ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); @@ -5723,7 +5725,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; @@ -6611,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", + &sb->s_uuid, ext4_quota_mode(sb)); return 0; } @@ -6886,6 +6888,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { @@ -6902,9 +6918,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; + if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); } @@ -6943,8 +6966,9 @@ int ext4_enable_quotas(struct super_block *sb) if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode; @@ -7031,8 +7055,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? 
- sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 3c640bd7ecae..30e3b65798b5 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *fsdata; + void *fsdata = NULL; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 36d6ba7190b6..7decaaf27e82 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1281,7 +1281,7 @@ retry_ref: ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { - ce->e_reusable = 1; + set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } @@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle, if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, + "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } @@ -1540,7 +1543,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { - ext4_xattr_inode_dec_ref(handle, ea_inode); + if (ext4_xattr_inode_dec_ref(handle, ea_inode)) + ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return err; } @@ -2042,7 +2046,7 @@ inserted: } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) - ce->e_reusable = 0; + clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); @@ -2070,19 +2074,11 @@ inserted: goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - ea_idebug(inode, "creating block %llu", (unsigned long long)block); @@ -2555,7 +2551,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - buffer = kmalloc(value_size, GFP_NOFS); + buffer = kvmalloc(value_size, GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !buffer || !b_entry_name) { error = -ENOMEM; @@ -2607,7 +2603,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = 0; out: kfree(b_entry_name); - kfree(buffer); + kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 5bbc44a5216e..c1c74aa658ae 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns, return error; } -int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); + if 
(unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index a26e33cab4ff..ea2bbb3f264b 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct user_namespace *, struct inode *, +extern int f2fs_set_acl(struct user_namespace *, struct dentry *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f..2b7a5cc4ed66 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1711,50 +1711,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) } } -/* - * Update and unlock the cluster's pagecache pages, and release the reference to - * the decompress_io_ctx that was being held for I/O completion. - */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) +static void f2fs_verify_cluster(struct work_struct *work) { + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); int i; + /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; - /* PG_error was set if verity failed. */ - if (failed || PageError(rpage)) { - ClearPageUptodate(rpage); - /* will re-read again later */ - ClearPageError(rpage); - } else { + if (fsverity_verify_page(rpage)) SetPageUptodate(rpage); - } + else + ClearPageUptodate(rpage); unlock_page(rpage); } - f2fs_put_dic(dic, in_task); -} - -static void f2fs_verify_cluster(struct work_struct *work) -{ - struct decompress_io_ctx *dic = - container_of(work, struct decompress_io_ctx, verity_work); - int i; - - /* Verify the cluster's decompressed pages with fs-verity. */ - for (i = 0; i < dic->cluster_size; i++) { - struct page *rpage = dic->rpages[i]; - - if (rpage && !fsverity_verify_page(rpage)) - SetPageError(rpage); - } - - __f2fs_decompress_end_io(dic, false, true); + f2fs_put_dic(dic, true); } /* @@ -1764,6 +1741,8 @@ static void f2fs_verify_cluster(struct work_struct *work) void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { + int i; + if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done @@ -1773,9 +1752,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); - } else { - __f2fs_decompress_end_io(dic, failed, in_task); + return; + } + + /* Update and unlock the cluster's pagecache pages. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. 
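 * (Editorial aside, not part of the patch: the verity branch above
 *  keeps the usual fsverity deferral shape,
 *
 *      INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
 *      fsverity_enqueue_verify_work(&dic->verity_work);
 *      return;
 *
 *  so the cluster's pages stay locked until f2fs_verify_cluster() runs
 *  on the fsverity workqueue, and only that work item updates, unlocks
 *  and finally releases them via f2fs_put_dic().)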
+ */ + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b..7af75041bd81 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -116,43 +116,56 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; block_t fs_blkaddr; }; +/* + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. + * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. + */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; + struct bio_post_read_ctx *ctx = bio->bi_private; - /* - * Update and unlock the bio's pagecache pages, and put the - * decompression context for any compressed pages. - */ bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { - if (bio->bi_status) + if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } - /* PG_error was set if verity failed. 
*/ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } @@ -185,8 +198,10 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) - SetPageError(page); + !fsverity_verify_page(page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } } else { fsverity_verify_bio(bio); @@ -245,6 +260,8 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, blkaddr++; } + ctx->decompression_attempted = true; + /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully @@ -1062,6 +1079,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); @@ -1089,7 +1107,6 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } - ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); @@ -2141,7 +2158,6 @@ submit_and_realloc: inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = block_nr; goto out; out: @@ -2289,7 +2305,6 @@ submit_and_realloc: inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = blkaddr; } @@ -2306,7 +2321,6 @@ out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); - ClearPageError(cc->rpages[i]); unlock_page(cc->rpages[i]); } } @@ -2403,7 +2417,6 @@ read_single_page: #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda1258227..83df6f6173d3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1025,7 +1025,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, __setattr_copy(mnt_userns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -1046,7 +1046,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee0..536d332d9e2e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -282,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = 
prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212..c227113b0f26 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1379,7 +1379,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, .fiemap = f2fs_fiemap, @@ -1397,7 +1397,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, + .get_inode_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa4363..b304692c0cf5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2534,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@ -2588,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2625,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += - prandom_u32_max(sbi->max_fragment_hole) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_hole); } } } diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index af191371c352..3626eb585a98 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -17,7 +17,7 @@ struct fat_fid { #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) -/** +/* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) @@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, return type; } -/** +/* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ diff --git a/fs/fs_parser.c b/fs/fs_parser.c index ed40ce5742fd..edb3712dcfa5 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse); * @fc: The filesystem context to log errors through. * @param: The parameter. 
* @want_bdev: T if want a blockdev + * @flags: Pathwalk flags passed to filename_lookup() * @_path: The result of the lookup */ int fs_lookup_param(struct fs_context *fc, struct fs_parameter *param, bool want_bdev, + unsigned int flags, struct path *_path) { struct filename *f; - unsigned int flags = 0; bool put_f; int ret; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 451d8a077e12..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -605,6 +605,14 @@ again: set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags); queue = true; } + /* + * We could race with cookie_lru which may set LRU_DISCARD bit + * but has yet to run the cookie state machine. If this happens + * and another thread tries to use the cookie, clear LRU_DISCARD + * so we don't end up withdrawing the cookie while in use. + */ + if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) + fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear); break; case FSCACHE_COOKIE_STATE_FAILED: diff --git a/fs/fscache/io.c b/fs/fscache/io.c index 3af3b08a9bb3..0d2b8dec8f82 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, * taken into account. */ - iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); fscache_write(cres, start, &iter, fscache_wreq_done, wreq); return; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..a4850aee2639 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); const char *name; int ret; @@ -98,7 +99,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c7d882a9fe33..a06fbb1a8a5b 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = file->private_data; struct cuse_conn *cc = fc_to_cc(fud->fc); - int rc; /* remove from the conntbl, no more access from this point on */ mutex_lock(&cuse_lock); @@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; + return fuse_dev_release(inode, file); } static struct file_operations cuse_channel_fops; /* initialized during init */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..c73d9c4132f6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; @@ -1546,7 +1546,7 @@ static int 
fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; @@ -2267,8 +2267,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * Check against file->f_op because CUSE * uses the same ioctl handler. */ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + if (old->f_op == file->f_op) fud = fuse_get_dev(old); if (fud) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb97a384dc5d..cd1a071b625a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; @@ -1170,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name) + u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; @@ -1197,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, goto unlock; fuse_dir_changed(parent); - fuse_invalidate_entry(entry); + if (!(flags & FUSE_EXPIRE_ONLY)) + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); @@ -1235,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, return err; } +static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) +{ + const struct cred *cred = current_cred(); + + return (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)); +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This @@ -1248,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. 
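 * (Editorial aside, not part of the patch: note the reordering in the
 *  rewrite below. CAP_SYS_ADMIN becomes the last resort rather than the
 *  first test, presumably so that capable(), which has side effects
 *  such as setting PF_SUPERPRIV and emitting audit records, is only
 *  consulted once the uid/gid and user-namespace checks have failed:
 *
 *      allow = fc->allow_other ? current_in_userns(fc->user_ns)
 *                              : fuse_permissible_uidgid(fc);
 *      if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN))
 *              allow = true;
 *  )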
*/ -int fuse_allow_current_process(struct fuse_conn *fc) +bool fuse_allow_current_process(struct fuse_conn *fc) { - const struct cred *cred; - - if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) - return 1; + bool allow; if (fc->allow_other) - return current_in_userns(fc->user_ns); + allow = current_in_userns(fc->user_ns); + else + allow = fuse_permissible_uidgid(fc); - cred = current_cred(); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) - return 1; + if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + allow = true; - return 0; + return allow; } static int fuse_access(struct inode *inode, int mask) @@ -1935,7 +1942,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1957,7 +1964,7 @@ static const struct inode_operations fuse_common_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_acl = fuse_get_acl, + .get_inode_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89f4741728ba..875314ee6f59 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - should_remove_suid(file_dentry(file))) { + setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { goto writethrough; } @@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive_lock = + !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + iocb->ki_flags & IOCB_APPEND || + fuse_direct_write_extending_i_size(iocb, from); + + /* + * Take exclusive lock if + * - Parallel direct writes are disabled - a user space decision + * - Parallel direct writes are enabled and i_size is being extended. + * This might not be needed at all, but needs further investigation. + */ + if (exclusive_lock) + inode_lock(inode); + else { + inode_lock_shared(inode); + + /* A race with truncate might have come up as the decision for + * the lock type was done without holding the lock, check again. 
+ */ + if (fuse_direct_write_extending_i_size(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + exclusive_lock = true; + } + } - /* Don't allow parallel writes to the same file */ - inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - inode_unlock(inode); + if (exclusive_lock) + inode_unlock(inode); + else + inode_unlock_shared(inode); return res; } @@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); + /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 98a9cf531873..c673faefdcb9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1179,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr); /** * Is current process allowed to perform filesystem operation? */ -int fuse_allow_current_process(struct fuse_conn *fc); +bool fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -1220,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name); + u64 child_nodeid, struct qstr *name, u32 flags); int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); @@ -1269,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 61d8afcb10a3..fcce94ace2c2 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.in_pages = true; err = -EFAULT; - iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) @@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; err = -EFAULT; - iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8deaacf1832..dc603479b30e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -547,9 +547,9 @@ retry_locked: * Contents of the page are now protected against changing by holding * the page lock. 
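 * (Editorial aside, not part of the patch: the kmap_local_page()
 *  conversion below is the usual mechanical swap,
 *
 *      addr = kmap_local_page(page);
 *      res = fuse_parse_cache(ff, addr, size, ctx);
 *      kunmap_local(addr);
 *
 *  where the local mapping is cheaper than kmap(), is valid only in
 *  the acquiring context, and nested mappings must be released in the
 *  reverse order they were taken.)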
*/ - addr = kmap(page); + addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); - kunmap(page); + kunmap_local(addr); unlock_page(page); put_page(page); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 734d1f05d823..3dcde4912413 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -109,9 +109,10 @@ out: return error; } -int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; bool need_unlock = false; diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index cd180ca7c959..b8de8c148f5c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,7 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 04a201584fa7..1371e067d2a7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1997,7 +1997,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } @@ -2149,7 +2149,7 @@ static const struct inode_operations gfs2_file_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .fileattr_get = gfs2_fileattr_get, @@ -2171,7 +2171,7 @@ static const struct inode_operations gfs2_dir_iops = { .getattr = gfs2_getattr, .listxattr = gfs2_listxattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, + .get_inode_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, .update_time = gfs2_update_time, .atomic_open = gfs2_atomic_open, diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..a0746be3c1de 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) + return -EIO; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) /* panic? 
*/ diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index 39f5e343bf4d..fdb0edb8a607 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr if (nls_io) { wchar_t ch; - while (srclen > 0) { + while (srclen > 0 && dstlen > 0) { size = nls_io->char2uni(src, srclen, &ch); if (size < 0) { ch = '?'; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a5db2e3b2980..6aa919e59483 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -198,6 +198,8 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 #define HFSPLUS_SB_NOBARRIER 5 +#define HFSPLUS_SB_UID 6 +#define HFSPLUS_SB_GID 7 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..b675581aa9d0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode, mode = be16_to_cpu(perms->mode); i_uid_write(inode, be32_to_cpu(perms->owner)); - if (!i_uid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); - if (!i_gid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 047e05c57560..c94a58762ad6 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!uid_valid(sbi->uid)) { pr_err("invalid uid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_UID, &sbi->flags); } break; case opt_gid: @@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!gid_valid(sbi->gid)) { pr_err("invalid gid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_GID, &sbi->flags); } break; case opt_part: diff --git a/fs/inode.c b/fs/inode.c index b608528efd3a..f453eb58fd03 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1949,40 +1949,12 @@ skip_update: EXPORT_SYMBOL(touch_atime); /* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = d_inode(dentry)->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - -/* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). 
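 * (Editorial aside, not part of the patch: the open-coded rule deleted
 *  above is the classic suid/sgid-drop test,
 *
 *      kill = 0;
 *      if (mode & S_ISUID)
 *              kill = ATTR_KILL_SUID;
 *      if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
 *              kill |= ATTR_KILL_SGID;
 *
 *  and its replacement, setattr_should_drop_suidgid(), keeps that
 *  shape but takes the mount's user namespace so that idmapped mounts
 *  resolve the ownership checks correctly.)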
*/ -int dentry_needs_remove_privs(struct dentry *dentry) +int dentry_needs_remove_privs(struct user_namespace *mnt_userns, + struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; @@ -1991,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry) if (IS_NOSEC(inode)) return 0; - mask = should_remove_suid(dentry); + mask = setattr_should_drop_suidgid(mnt_userns, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; @@ -2023,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; - kill = dentry_needs_remove_privs(dentry); + kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry); if (kill < 0) return kill; @@ -2071,9 +2043,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; - if (!sync_it) - return 0; - return sync_it; } @@ -2354,15 +2323,15 @@ EXPORT_SYMBOL(inode_init_owner); bool inode_owner_or_capable(struct user_namespace *mnt_userns, const struct inode *inode) { - kuid_t i_uid; + vfsuid_t vfsuid; struct user_namespace *ns; - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid)) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); - if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER)) + if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } @@ -2488,6 +2457,28 @@ struct timespec64 current_time(struct inode *inode) EXPORT_SYMBOL(current_time); /** + * in_group_or_capable - check whether caller is CAP_FSETID privileged + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check + * @vfsgid: the new/current vfsgid of @inode + * + * Check whether @vfsgid is in the caller's group list or if the caller is + * privileged with CAP_FSETID over @inode. This can be used to determine + * whether the setgid bit can be kept or must be dropped. + * + * Return: true if the caller is sufficiently privileged, false if not.
+ */ +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid) +{ + if (vfsgid_in_group_p(vfsgid)) + return true; + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + return true; + return false; +} + +/** * mode_strip_sgid - handle the sgid bit for non-directories * @mnt_userns: User namespace of the mount the inode was created from * @dir: parent directory inode @@ -2508,11 +2499,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns, return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; - if (in_group_p(i_gid_into_mnt(mnt_userns, dir))) - return mode; - if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) + if (in_group_or_capable(mnt_userns, dir, + i_gid_into_vfsgid(mnt_userns, dir))) return mode; - return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); diff --git a/fs/internal.h b/fs/internal.h index 6f0386b34fae..a803cc3cf716 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -150,7 +150,9 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern int dentry_needs_remove_privs(struct dentry *dentry); +int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry); +bool in_group_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c @@ -225,12 +227,39 @@ struct xattr_ctx { }; -ssize_t do_getxattr(struct user_namespace *mnt_userns, +ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx); +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode); + +#ifdef CONFIG_FS_POSIX_ACL +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size); +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size); +#else +static inline int do_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, const char *acl_name, + const void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +static inline ssize_t do_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, const char *acl_name, + void *kvalue, size_t size) +{ + return -EOPNOTSUPP; +} +#endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); + +/* + * fs/attr.c + */ +int setattr_should_drop_sgid(struct user_namespace *mnt_userns, + const struct inode *inode); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 885a7a6cc53e..4810438b7856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -207,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) } /* Send all the data buffers related to an inode */ -int jbd2_submit_inode_data(struct jbd2_inode *jinode) +int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) { - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) return 0; trace_jbd2_submit_inode_data(jinode->i_vfs_inode); - return jbd2_journal_submit_inode_data_buffers(jinode); + return journal->j_submit_inode_data_buffers(jinode); } EXPORT_SYMBOL(jbd2_submit_inode_data); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index e945e3484788..8bb58ce5c06c 100644 --- a/fs/jffs2/acl.c +++ 
b/fs/jffs2/acl.c @@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc, xprefix; + struct inode *inode = d_inode(dentry); switch (type) { case ACL_TYPE_ACCESS: diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9d9fb7cf093e..ca36a6eca594 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); -int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0aabbcbfd58..f399b390b5f6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index ba86acbe12d3..3cf71befa475 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -64,7 +64,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .get_acl = jffs2_get_acl, + .get_inode_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 39cec28096a7..66af51c41619 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a653f34c6e26..3b667eccc73b 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -94,12 +94,13 @@ out: return rc; } -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int rc; tid_t tid; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 332dc9ac47a9..88663465aecd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); return rc; } @@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 3de40286d31f..f0704a25835f 100644 --- a/fs/jfs/jfs_acl.h +++ 
b/fs/jfs/jfs_acl.h @@ -8,7 +8,7 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); -int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 6b838d3ae7c2..765838578a72 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap) struct bmap *bmp; struct dbmap_disk *dbmp_le; struct metapage *mp; - int i; + int i, err; /* * allocate/initialize the in-memory bmap descriptor @@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap) BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, PSIZE, 0); if (mp == NULL) { - kfree(bmp); - return -EIO; + err = -EIO; + goto err_kfree_bmp; } /* copy the on-disk bmap descriptor to its in-memory version. */ @@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap) bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); if (!bmp->db_numag) { - release_metapage(mp); - kfree(bmp); - return -EINVAL; + err = -EINVAL; + goto err_release_metapage; } bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); @@ -194,6 +193,16 @@ int dbMount(struct inode *ipbmap) bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + + if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + err = -EINVAL; + goto err_release_metapage; + } + for (i = 0; i < MAXAG; i++) bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); @@ -214,6 +223,12 @@ int dbMount(struct inode *ipbmap) BMAP_LOCK_INIT(bmp); return (0); + +err_release_metapage: + release_metapage(mp); +err_kfree_bmp: + kfree(bmp); + return err; } diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h index 1c984214e95e..a0ee4ccea66e 100644 --- a/fs/jfs/jfs_extent.h +++ b/fs/jfs/jfs_extent.h @@ -10,9 +10,7 @@ (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); -extern int extFill(struct inode *, xad_t *); extern int extHint(struct inode *, s64, xad_t *); -extern int extRealloc(struct inode *, s64, xad_t *, bool); extern int extRecord(struct inode *, xad_t *); #endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 799d3837e7c2..390cbfce391f 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -310,8 +310,8 @@ int diRead(struct inode *ip) iagno = INOTOIAG(ip->i_ino); /* read the iag */ - imap = JFS_IP(ipimap)->i_imap; IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + imap = JFS_IP(ipimap)->i_imap; rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 48d1f70f786c..b83aae56a1f2 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount) truncate_inode_pages(sbi->ipimap->i_mapping, 0); truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + + IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP); diUnmount(sbi->ipimap, 1); if ((rc = diMount(sbi->ipimap))) { + IWRITE_UNLOCK(sbi->ipimap); jfs_err("jfs_mount_rw: diMount failed!"); return rc; } + IWRITE_UNLOCK(sbi->ipimap); 
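/* (Editorial aside, not part of the patch: the IWRITE_LOCK/IWRITE_UNLOCK
 * pair added above serializes the inode-map teardown and rebuild against
 * concurrent readers during a ro-to-rw remount,
 *
 *	IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP);
 *	diUnmount(sbi->ipimap, 1);
 *	rc = diMount(sbi->ipimap);
 *	IWRITE_UNLOCK(sbi->ipimap);
 *
 * with the error path dropping the lock before returning, as the hunk
 * shows.) */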
dbUnmount(sbi->ipbmap, 1); if ((rc = dbMount(sbi->ipbmap))) { diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 3e8b13e6aa01..8ec43f53f686 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb) /* * close secondary aggregate inode allocation map */ - ipaimap2 = sbi->ipaimap2; if (ipaimap2) { diUnmount(ipaimap2, 0); diFreeSpecial(ipaimap2); @@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb) /* * close aggregate inode allocation map */ - ipaimap = sbi->ipaimap; diUnmount(ipaimap, 0); diFreeSpecial(ipaimap); sbi->ipaimap = NULL; @@ -89,7 +87,7 @@ int jfs_umount(struct super_block *sb) dbUnmount(ipbmap, 0); diFreeSpecial(ipbmap); - sbi->ipimap = NULL; + sbi->ipbmap = NULL; /* * Make sure all metadata makes it to disk before we mark diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index c50167a7bc50..0d33816d251d 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -25,7 +25,7 @@ struct jfs_ea_list { struct jfs_ea ea[]; /* Variable length list */ }; -/* Macros for defining maxiumum number of bytes supported for EAs */ +/* Macros for defining maximum number of bytes supported for EAs */ #define MAXEASIZE 65535 #define MAXEALISTSIZE MAXEASIZE diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 142caafc73b1..ad7592191d76 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip, extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, int flag); extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); -extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, - int flag); extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); -extern int xtRelocate(tid_t tid, struct inode *ip, - xad_t * oxad, s64 nxaddr, int xtype); extern int xtAppend(tid_t tid, struct inode *ip, int xflag, s64 xoff, int maxblocks, int *xlenp, s64 * xaddrp, int flag); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9db4f5789c0e..a38d14eed047 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; - ip->i_link = JFS_IP(ip)->i_inline; + ip->i_link = JFS_IP(ip)->i_inline_all; memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; @@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = { .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL - .get_acl = jfs_get_acl, + .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85d4f44f2ac4..d2f82cb7db1b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); @@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, inode_lock(inode); while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
- sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b2fc85d440d0..9306e10753f9 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2487,9 +2487,9 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr, fattr->cf_dacls = NULL; if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS); + fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS); if (S_ISDIR(inode->i_mode)) - fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT); + fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT); } } @@ -2956,7 +2956,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, - inode, + path.dentry, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2972,7 +2972,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(user_ns, - inode); + path.dentry); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index b05ff9b146b5..ab5c68cc0e13 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1289,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, } if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { - posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); + posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); if (posix_acls && !found) { unsigned int id = -1; @@ -1386,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_ACCESS, fattr.cf_acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { - rc = set_posix_acl(user_ns, inode, + rc = set_posix_acl(user_ns, path->dentry, ACL_TYPE_DEFAULT, fattr.cf_dacls); if (rc) ksmbd_debug(SMB, diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 94b8ed4ef870..ff0e7a4fcd4d 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end, unsigned char type) { struct file_lock *flock; - struct file_lock_context *ctx = file_inode(filp)->i_flctx; + struct file_lock_context *ctx = locks_inode_context(file_inode(filp)); int error = 0; if (!ctx || list_empty_careful(&ctx->flc_posix)) @@ -1321,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + err = vfs_remove_acl(user_ns, dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); @@ -1375,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return NULL; - posix_acls = get_acl(inode, acl_type); + posix_acls = get_inode_acl(inode, acl_type); if (!posix_acls) return NULL; @@ -1824,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int 
ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode) + struct dentry *dentry) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct inode *inode = d_inode(dentry); int rc; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) @@ -1856,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); else if (S_ISDIR(inode->i_mode)) { posix_state_to_acl(&acl_state, acls->a_entries); - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, - acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); @@ -1874,16 +1874,17 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, } int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, struct inode *parent_inode) + struct dentry *dentry, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct inode *inode = d_inode(dentry); int rc, i; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) return -EOPNOTSUPP; - acls = get_acl(parent_inode, ACL_TYPE_DEFAULT); + acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT); if (!acls) return -ENOENT; pace = acls->a_entries; @@ -1895,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, } } - rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", rc); if (S_ISDIR(inode->i_mode)) { - rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, + rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 593059ca8511..0d73d735cc39 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -160,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, - struct inode *inode); + struct dentry *dentry); int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, - struct inode *inode, + struct dentry *dentry, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/libfs.c b/fs/libfs.c index 682d56345a1c..aada4e7c8713 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -995,8 +995,8 @@ out: EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1027,8 +1030,21 @@ out: mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file 
*file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e1c4617de771..720684345817 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -207,7 +207,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct nlm_host *lockhost; if (!flctx || list_empty_careful(&flctx->flc_posix)) @@ -262,7 +262,7 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; diff --git a/fs/locks.c b/fs/locks.c index 607f94a0e789..8f01bee17715 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type) struct file_lock_context *ctx; /* paired with cmpxchg() below */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type) */ if (cmpxchg(&inode->i_flctx, NULL, ctx)) { kmem_cache_free(flctx_cache, ctx); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); } out: trace_locks_get_lock_context(inode, type, ctx); @@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, void locks_free_lock_context(struct inode *inode) { - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx = locks_inode_context(inode); if (unlikely(ctx)) { locks_check_ctx_lists(inode); @@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) void *owner; void (*func)(void); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { WARN_ON_ONCE(1); goto free_lock; @@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) struct file_lock_context *ctx; struct file_lock *fl; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); fl = list_first_entry_or_null(&ctx->flc_lease, @@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp) int type = F_UNLCK; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (ctx && !list_empty_careful(&ctx->flc_lease)) { percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -1823,7 +1823,7 @@ 
static int generic_delete_lease(struct file *filp, void *owner) struct file_lock_context *ctx; LIST_HEAD(dispose); - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) * throw a warning to let people know that they don't actually work. */ if (cmd & LOCK_MAND) { - pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); + pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid); return 0; } @@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); @@ -2295,6 +2296,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else @@ -2561,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2634,7 +2636,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); + ctx = locks_inode_context(locks_inode(filp)); if (!ctx) return; @@ -2663,12 +2665,36 @@ void locks_remove_file(struct file *filp) */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { + WARN_ON_ONCE(filp != fl->fl_file); if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } EXPORT_SYMBOL_GPL(vfs_cancel_lock); +/** + * vfs_inode_has_locks - are any file locks held on @inode? + * @inode: inode to check for locks + * + * Return true if there are any FL_POSIX or FL_FLOCK locks currently + * set on @inode. 
+ */ +bool vfs_inode_has_locks(struct inode *inode) +{ + struct file_lock_context *ctx; + bool ret; + + ctx = locks_inode_context(inode); + if (!ctx) + return false; + + spin_lock(&ctx->flc_lock); + ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock); + spin_unlock(&ctx->flc_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfs_inode_has_locks); + #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -2839,7 +2865,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = smp_load_acquire(&inode->i_flctx); + ctx = locks_inode_context(inode); if (!ctx) return; diff --git a/fs/mbcache.c b/fs/mbcache.c index e272ad738faf..2a4b8b549e93 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -100,8 +100,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; - entry->e_reusable = reusable; - entry->e_referenced = 0; + entry->e_flags = 0; + if (reusable) + set_bit(MBE_REUSABLE_B, &entry->e_flags); head = mb_cache_entry_head(cache, key); hlist_bl_lock(head); hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { @@ -165,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache, while (node) { entry = hlist_bl_entry(node, struct mb_cache_entry, e_hash_list); - if (entry->e_key == key && entry->e_reusable && + if (entry->e_key == key && + test_bit(MBE_REUSABLE_B, &entry->e_flags) && atomic_inc_not_zero(&entry->e_refcnt)) goto out; node = node->next; @@ -284,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry) { - entry->e_referenced = 1; + set_bit(MBE_REFERENCED_B, &entry->e_flags); } EXPORT_SYMBOL(mb_cache_entry_touch); @@ -309,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, entry = list_first_entry(&cache->c_list, struct mb_cache_entry, e_list); /* Drop initial hash reference if there is no user */ - if (entry->e_referenced || + if (test_bit(MBE_REFERENCED_B, &entry->e_flags) || atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) { - entry->e_referenced = 0; + clear_bit(MBE_REFERENCED_B, &entry->e_flags); list_move_tail(&entry->e_list, &cache->c_list); continue; } diff --git a/fs/namei.c b/fs/namei.c index 9155ecb547ce..720270dc9fe5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns, acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; - /* no ->get_acl() calls in RCU mode... */ + /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(mnt_userns, inode, acl, mask); } - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { @@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - kuid_t i_uid; + vfsuid_t vfsuid; /* Are we the owner? If so, ACL's don't matter */ - i_uid = i_uid_into_mnt(mnt_userns, inode); - if (likely(uid_eq(current_fsuid(), i_uid))) { + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; @@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns, * about? Need to check group ownership if so. 
*/ if (mask & (mode ^ (mode >> 3))) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(kgid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } @@ -581,7 +581,7 @@ struct nameidata { struct nameidata *saved; unsigned root_seq; int dfd; - kuid_t dir_uid; + vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; @@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls); static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct user_namespace *mnt_userns; - kuid_t i_uid; + vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; mnt_userns = mnt_user_ns(nd->path.mnt); - i_uid = i_uid_into_mnt(mnt_userns, inode); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, i_uid)) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) + if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1183,8 +1183,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link) struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - kuid_t dir_uid = nd->dir_uid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || - uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) || + vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return 0; if (likely(dir_mode & 0002) || @@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); + nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, { kuid_t fsuid = current_fsuid(); - if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid)) return 0; - if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); } @@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. 
*/ - if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) || + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); diff --git a/fs/namespace.c b/fs/namespace.c index df137ba19d37..ab467ee58341 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -82,6 +98,7 @@ struct mount_kattr { unsigned int lookup_flags; bool recurse; struct user_namespace *mnt_userns; + struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ @@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt) #endif } +/** + * mnt_idmap_owner - retrieve owner of the mount's idmapping + * @idmap: mount idmapping + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapping. + */ +struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) +{ + return idmap->owner; +} +EXPORT_SYMBOL_GPL(mnt_idmap_owner); + +/** + * mnt_user_ns - retrieve owner of an idmapped mount + * @mnt: the relevant vfsmount + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapped. + */ +struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) +{ + struct mnt_idmap *idmap = mnt_idmap(mnt); + + /* Return the actual owner of the filesystem instead of the nop. */ + if (idmap == &nop_mnt_idmap && + !initial_idmapping(mnt->mnt_sb->s_user_ns)) + return mnt->mnt_sb->s_user_ns; + return mnt_idmap_owner(idmap); +} +EXPORT_SYMBOL_GPL(mnt_user_ns); + +/** + * alloc_mnt_idmap - allocate a new idmapping for the mount + * @mnt_userns: owning userns of the idmapping + * + * Allocate a new struct mnt_idmap which carries the idmapping of the mount. + * + * Return: On success a new idmap, on error an error pointer is returned. 
+ */ +static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. + */ +static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. + */ +static inline void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); - mnt->mnt.mnt_userns = &init_user_ns; + mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(&mnt->mnt); - if (!initial_idmapping(mnt_userns)) - put_user_ns(mnt_userns); + mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - fs_userns = mnt->mnt.mnt_sb->s_user_ns; - if (!initial_idmapping(fs_userns)) - mnt->mnt.mnt_userns = get_user_ns(fs_userns); - lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); - mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (!initial_idmapping(mnt->mnt.mnt_userns)) - mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); + mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); + mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -3515,8 +3620,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = next_mnt(q, new); if (!q) break; + // an mntns binding we'd skipped? 
while (p->mnt.mnt_root != q->mnt.mnt_root) - p = next_mnt(p, old); + p = next_mnt(skip_mnt_tree(p), old); } namespace_unlock(); @@ -3981,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (kattr->mnt_userns == fs_userns) + if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) return -EINVAL; /* @@ -4028,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && - !kattr->mnt_userns; + !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) @@ -4082,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns, *old_mnt_userns; - - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return; /* - * We're the only ones able to change the mount's idmapping. So - * mnt->mnt.mnt_userns is stable and we can retrieve it directly. - */ - old_mnt_userns = mnt->mnt.mnt_userns; - - mnt_userns = get_user_ns(kattr->mnt_userns); - /* Pairs with smp_load_acquire() in mnt_user_ns(). */ - smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); - - /* - * If this is an idmapped filesystem drop the reference we've taken - * in vfs_create_mount() before. + * Pairs with smp_load_acquire() in mnt_idmap(). + * + * Since we only allow a mount to change the idmapping once and + * verified this in can_idmap_mount() we know that the mount has + * @nop_mnt_idmap attached to it. So there's no need to drop any + * references. 
*/ - if (!initial_idmapping(old_mnt_userns)) - put_user_ns(old_mnt_userns); + smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4136,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; + if (kattr->mnt_userns) { + struct mnt_idmap *mnt_idmap; + + mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); + if (IS_ERR(mnt_idmap)) + return PTR_ERR(mnt_idmap); + kattr->mnt_idmap = mnt_idmap; + } + if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing @@ -4323,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; + + if (kattr->mnt_idmap) + mnt_idmap_put(kattr->mnt_idmap); } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, diff --git a/fs/netfs/io.c b/fs/netfs/io.c index e374767d1b68..7f753380e047 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); @@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); @@ -208,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) continue; } - iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, subreq->start, subreq->len); atomic_inc(&rreq->nr_copy_ops); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ead8a0e06abf..cf7365581031 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; int status = 0; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e861d7bae305..e731c00a9fcb 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, bvec[0].bv_page = page; bvec[0].bv_offset = 0; bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 03a4e679fd99..df9ca56db347 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,7 @@ */ #ifdef 
CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); -extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 93de0b58647a..74d11e3c4205 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -255,23 +255,24 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; + struct inode *inode = d_inode(dentry); int status; if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(alloc)) goto fail; dfacl = alloc; break; case ACL_TYPE_DEFAULT: - alloc = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; dfacl = acl; @@ -312,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, struct posix_acl *acl; char *p = data + *result; - acl = get_acl(inode, type); + acl = get_inode_acl(inode, type); if (IS_ERR_OR_NULL(acl)) return 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2e7579626cf0..4bf208a0a8e9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -998,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; @@ -1009,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a2d2d5d1b088..dd18344648f3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1501,7 +1501,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct file_lock *fl; struct nfs4_lock_state *lsp; int status = 0; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; if (flctx == NULL) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 317cedfa52bf..16be6dae524f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) return 0; - flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; + flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry)); if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f41d24b54fd1..80c240e50952 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; - struct file_lock_context 
*flctx = file_inode(file)->i_flctx; + struct file_lock_context *flctx = locks_inode_context(file_inode(file)); struct nfs_page *req; int do_flush, status; /* @@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode, unsigned int pagelen) { int ret; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; if (file->f_flags & O_DSYNC) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 13e6e6897f6c..c43c25a8da2e 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) goto out; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; @@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 2fb9ee356455..9daa621817d8 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! 
*/ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { resp->status = nfserrno(PTR_ERR(acl)); goto fail; @@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) inode_lock(inode); - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS, argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT, argp->acl_default); out_drop_lock: diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index bb8e2f6d7d03..518203821790 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, unsigned int flags = 0; int size = 0; - pacl = get_acl(inode, ACL_TYPE_ACCESS); + pacl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (!pacl) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; - dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(dpacl)) { error = PTR_ERR(dpacl); goto rel_pacl; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 836bd825ca4a..da8d0ea66229 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4758,7 +4758,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) static bool nfsd4_deleg_present(const struct inode *inode) { - struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + struct file_lock_context *ctx = locks_inode_context(inode); return ctx && !list_empty_careful(&ctx->flc_lease); } @@ -5897,7 +5897,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { nf = stp->st_stid.sc_file; - ctx = nf->fi_inode->i_flctx; + ctx = locks_inode_context(nf->fi_inode); if (!ctx) continue; if (locks_owner_has_blockers(ctx, lo)) @@ -7713,7 +7713,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = locks_inode(nf->nf_file); - flctx = inode->i_flctx; + flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 849a720ab43f..08a929607641 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -480,12 +480,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_seclabel->data, attr->na_seclabel->len); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_ACCESS, + dentry, ACL_TYPE_ACCESS, attr->na_pacl); if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) attr->na_aclerr = set_posix_acl(&init_user_ns, - inode, ACL_TYPE_DEFAULT, + dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); inode_unlock(inode); if (size_change) @@ -943,7 +943,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1033,7 +1033,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file 
*nf, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); + iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index c8b89b4f94e0..2064e6473d30 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/random.h> +#include <linux/log2.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" @@ -193,6 +194,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, } /** + * nilfs_get_blocksize - get block size from raw superblock data + * @sb: super block instance + * @sbp: superblock raw data buffer + * @blocksize: place to store block size + * + * nilfs_get_blocksize() calculates the block size from the block size + * exponent information written in @sbp and stores it in @blocksize, + * or aborts with an error message if it's too large. + * + * Return Value: On success, 0 is returned. If the block size is too + * large, -EINVAL is returned. + */ +static int nilfs_get_blocksize(struct super_block *sb, + struct nilfs_super_block *sbp, int *blocksize) +{ + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + + if (unlikely(shift_bits > + ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { + nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", + shift_bits); + return -EINVAL; + } + *blocksize = BLOCK_SIZE << shift_bits; + return 0; +} + +/** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released * @sb: super block instance used to recover past segment @@ -245,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ - blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + err = nilfs_get_blocksize(sb, sbp[0], &blocksize); + if (err) + goto scan_error; + if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); + err = -EINVAL; goto scan_error; } @@ -443,11 +476,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) return crc == le32_to_cpu(sbp->s_sum); } -static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +/** + * nilfs_sb2_bad_offset - check the location of the second superblock + * @sbp: superblock raw data buffer + * @offset: byte offset of second superblock calculated from device size + * + * nilfs_sb2_bad_offset() checks if the position on the second + * superblock is valid or not based on the filesystem parameters + * stored in @sbp. If @offset points to a location within the segment + * area, or if the parameters themselves are not normal, it is + * determined to be invalid. + * + * Return Value: true if invalid, false if valid. 
+ */ +static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { - return offset < ((le64_to_cpu(sbp->s_nsegments) * - le32_to_cpu(sbp->s_blocks_per_segment)) << - (le32_to_cpu(sbp->s_log_block_size) + 10)); + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + u64 nsegments = le64_to_cpu(sbp->s_nsegments); + u64 index; + + if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || + shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) + return true; + + index = offset >> (shift_bits + BLOCK_SIZE_BITS); + do_div(index, blocks_per_segment); + return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) @@ -586,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); - if (blocksize < NILFS_MIN_BLOCK_SIZE || - blocksize > NILFS_MAX_BLOCK_SIZE) { + err = nilfs_get_blocksize(sb, sbp, &blocksize); + if (err) + goto failed_sbh; + + if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4f2ffc7ef296..c5e4a886593d 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -802,7 +802,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(mnt_userns, inode, attr); if (mode != inode->i_mode) { - err = ntfs_acl_chmod(mnt_userns, inode); + err = ntfs_acl_chmod(mnt_userns, dentry); if (err) goto out; @@ -1255,7 +1255,7 @@ const struct inode_operations ntfs_file_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, }; diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index bc22cc321a74..053cc0e0f8b5 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -367,7 +367,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .mknod = ntfs_mknod, .rename = ntfs_rename, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .setattr = ntfs3_setattr, .getattr = ntfs_getattr, @@ -379,7 +379,7 @@ const struct inode_operations ntfs_special_inode_operations = { .setattr = ntfs3_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, - .get_acl = ntfs_get_acl, + .get_inode_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, }; // clang-format on diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 2c791222c4e2..a4d292809a33 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -843,7 +843,7 @@ int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct inode *dir); @@ -852,7 +852,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, #define ntfs_set_acl NULL #endif -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode); +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry); int 
ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7de8718c68a9..aafe98ee0b21 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -619,10 +619,10 @@ out: /* * ntfs_set_acl - inode_operations::set_acl */ -int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); + return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false); } /* @@ -664,8 +664,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, /* * ntfs_acl_chmod - Helper for ntfs3_setattr(). */ -int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; if (!(sb->s_flags & SB_POSIXACL)) @@ -674,7 +675,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - return posix_acl_chmod(mnt_userns, inode, inode->i_mode); + return posix_acl_chmod(mnt_userns, dentry, inode->i_mode); } /* diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 23a72a423955..9f19cf9a5a9f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; struct ocfs2_lock_holder oh; + struct inode *inode = d_inode(dentry); had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh); if (had_lock < 0) diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 95a57c888ab6..a897c4e41b26 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -17,7 +17,7 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); -int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b13d344d40b6..60b97c92e2b2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg) /* negotiate timeout must be less than write timeout. */ schedule_delayed_work(®->hr_nego_timeout_work, msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); - memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } static void o2hb_disarm_timeout(struct o2hb_region *reg) @@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (reg->hr_last_hb_status) return; - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); /* lowest node as master node to make negotiate decision. 
*/ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); @@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work) config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } - if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, - sizeof(reg->hr_nego_node_bitmap))) { + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { /* check negotiate bitmap every second to do timeout * approve decision. */ @@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", @@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1437,11 +1437,11 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; @@ -1450,23 +1450,21 @@ void o2hb_init(void) /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num) unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, @@ -2477,7 +2475,7 @@ int 
o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 1d4100abf6f8..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7524994e3199..35c05c18de59 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 27fee68f860a..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..37d222bdfc8c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } @@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) continue; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fd2022712167..20f790a47484 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err) static inline void dlm_node_iter_init(unsigned long *map, struct dlm_node_iter *iter) { - memcpy(iter->node_map, map, sizeof(iter->node_map)); + bitmap_copy(iter->node_map, map, O2NM_MAX_NODES); iter->curnode = -1; } diff --git a/fs/ocfs2/dlm/dlmdomain.c 
b/fs/ocfs2/dlm/dlmdomain.c index c4eccd499db8..5c04dde99981 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); /* For now, we restart the process if the node maps have * changed at all */ - ret = memcmp(ctxt->live_map, dlm->live_nodes_map, - sizeof(dlm->live_nodes_map)); + ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map, + O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); if (ret) @@ -1604,13 +1604,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) /* group sem locking should work for us here -- we're already * registered for heartbeat events so filling this should be * atomic wrt getting those handlers called. */ - o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); - memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); - + bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES); __dlm_set_joining_node(dlm, dlm->node_num); - spin_unlock(&dlm->spinlock); node = -1; @@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) * yes_resp_map. Copy that into our domain map and send a join * assert message to clean up everyone elses state. */ spin_lock(&dlm->spinlock); - memcpy(dlm->domain_map, ctxt->yes_resp_map, - sizeof(ctxt->yes_resp_map)); + bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES); set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); @@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", dlm->recovery_map, &(dlm->recovery_map[0])); - memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); - memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); - memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES); + bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES); + bitmap_zero(dlm->domain_map, O2NM_MAX_NODES); dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 227da5b1b6ab..d610da8e2f24 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->type = type; INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); spin_lock_init(&mle->spinlock); init_waitqueue_head(&mle->wq); atomic_set(&mle->woken, 0); kref_init(&mle->mle_refs); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; @@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, atomic_inc(&dlm->mle_cur_count[mle->type]); /* copy off the node_map and register hb callbacks on our copy */ - memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); - memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES); + bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES); clear_bit(dlm->node_num, mle->vote_map); clear_bit(dlm->node_num, mle->node_map); @@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, spin_unlock(&dlm->track_lock); memset(res->lvb, 
0, DLM_LVB_LEN); - memset(res->refmap, 0, sizeof(res->refmap)); + bitmap_zero(res->refmap, O2NM_MAX_NODES); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -1036,10 +1036,10 @@ recheck: spin_lock(&mle->spinlock); m = mle->master; - map_changed = (memcmp(mle->vote_map, mle->node_map, - sizeof(mle->vote_map)) != 0); - voting_done = (memcmp(mle->vote_map, mle->response_map, - sizeof(mle->vote_map)) == 0); + map_changed = !bitmap_equal(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); + voting_done = bitmap_equal(mle->vote_map, mle->response_map, + O2NM_MAX_NODES); /* restart if we hit any errors */ if (map_changed) { @@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* now blank out everything, as if we had never * contacted anyone */ - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); /* reset the vote_map to the current node_map */ - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); + bitmap_copy(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); /* put myself into the maybe map */ if (mle->type != DLM_MLE_BLOCK) set_bit(dlm->node_num, mle->maybe_map); @@ -2094,7 +2094,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) flags = item->u.am.flags; spin_lock(&dlm->spinlock); - memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); clear_bit(dlm->node_num, nodemap); @@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = 0; } - memset(iter.node_map, 0, sizeof(iter.node_map)); + bitmap_zero(iter.node_map, O2NM_MAX_NODES); set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 52ad342fec3e..50da8af988c1 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_reco_node_data *ndata; spin_lock(&dlm->spinlock); - memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES); /* nodes can only be removed (by dying) after dropping * this lock, and death will be trapped later, so this should do */ spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9c67edd215d5..5c60b6bc85bf 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } } - if (file && should_remove_suid(file->f_path.dentry)) { + if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) { ret = __ocfs2_write_remove_suid(inode, di_bh); if (ret) { mlog_errno(ret); @@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. 
*/ - if (should_remove_suid(dentry)) { + if (setattr_should_drop_suidgid(&init_user_ns, inode)) { if (meta_level == 0) { ocfs2_inode_unlock_for_extent_tree(inode, &di_bh, @@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, @@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 126671e6caed..3fb98b4569a2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -static void ocfs2_free_replay_slots(struct ocfs2_super *osb) +void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct ocfs2_replay_map *replay_map = osb->replay_map; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 969d0aa28718..41c382f68529 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 05f32989bad6..a8fd51afb794 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2915,7 +2915,7 @@ const struct inode_operations ocfs2_dir_iops = { .permission = ocfs2_permission, .listxattr = ocfs2_listxattr, .fiemap = ocfs2_fiemap, - .get_acl = ocfs2_iop_get_acl, + .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, .fileattr_get = ocfs2_fileattr_get, .fileattr_set = ocfs2_fileattr_set, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 740b64238312..a503c553bab2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); - if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) - nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 88f75f7f02d7..c973c03f6fd8 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -273,17 +273,17 @@ static int o2cb_cluster_check(void) */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { - o2hb_fill_node_map(hbmap, sizeof(hbmap)); + o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? 
"Global" : "Local")); return -EINVAL; } - o2net_fill_node_map(netmap, sizeof(netmap)); + o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); - if (!memcmp(hbmap, netmap, sizeof(hbmap))) + if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 317126261523..a8d5ca98fa57 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header; static int __init ocfs2_stack_glue_init(void) { + int ret; + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); @@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void) return -ENOMEM; /* or something. */ } - return ocfs2_sysfs_init(); + ret = ocfs2_sysfs_init(); + if (ret) + unregister_sysctl_table(ocfs2_table_header); + + return ret; } static void __exit ocfs2_stack_glue_exit(void) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 42c993e53924..0b0e6a132101 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) out_dismount: atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); + ocfs2_free_replay_slots(osb); ocfs2_dismount_volume(sb, 1); goto out; @@ -1822,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_truncate_log_init(osb); if (status < 0) { mlog_errno(status); - goto out_system_inodes; + goto out_check_volume; } ocfs2_super_unlock(osb, 1); return 0; +out_check_volume: + ocfs2_free_replay_slots(osb); out_system_inodes: if (osb->local_alloc_state == OCFS2_LA_ENABLED) ocfs2_shutdown_local_alloc(osb); diff --git a/fs/open.c b/fs/open.c index a81319b6177f..9d0197db15e7 100644 --- a/fs/open.c +++ b/fs/open.c @@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Remove suid, sgid, and file capabilities on truncate too */ - ret = dentry_needs_remove_privs(dentry); + ret = dentry_needs_remove_privs(mnt_userns, dentry); if (ret < 0) return ret; if (ret) @@ -723,10 +723,10 @@ retry_deleg: return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; - if (!S_ISDIR(inode->i_mode)) - newattrs.ia_valid |= - ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); + if (!S_ISDIR(inode->i_mode)) + newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | + setattr_should_drop_sgid(mnt_userns, inode); /* Continue to send actual fs values, not the mount values. 
*/ error = security_path_chown( path, diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 605e5a3506ec..c5da2091cefb 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, - int type) +int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error = 0; void *value = NULL; @@ -119,12 +118,13 @@ out: return error; } -int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct iattr iattr; int rc; + struct inode *inode = d_inode(dentry); memset(&iattr, 0, sizeof iattr); @@ -153,46 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, rc = __orangefs_set_acl(inode, acl, type); if (!rc && (iattr.ia_valid == ATTR_MODE)) - rc = __orangefs_setattr(inode, &iattr); + rc = __orangefs_setattr_mode(dentry, &iattr); return rc; } - -int orangefs_init_acl(struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - umode_t mode = inode->i_mode; - struct iattr iattr; - int error = 0; - - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) - return error; - - if (default_acl) { - error = __orangefs_set_acl(inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } else { - inode->i_default_acl = NULL; - } - - if (acl) { - if (!error) - error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); - } else { - inode->i_acl = NULL; - } - - /* If mode of the inode was changed, then do a forcible ->setattr */ - if (mode != inode->i_mode) { - memset(&iattr, 0, sizeof iattr); - inode->i_mode = mode; - iattr.ia_mode = mode; - iattr.ia_valid |= ATTR_MODE; - __orangefs_setattr(inode, &iattr); - } - - return error; -} diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7a8c0c6e698d..370bd3bbf5e4 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page, bv.bv_len = wlen; bv.bv_offset = off % PAGE_SIZE; WARN_ON(wlen == 0); - iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); + iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); @@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, else ow->bv[i].bv_offset = 0; } - iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len); + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); WARN_ON(ow->off >= len); if (ow->off + ow->len > len) @@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac) offset = readahead_pos(rac); i_pages = &rac->mapping->i_pages; - iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); + iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac)); /* read in the pages. 
*/ if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, @@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) bv.bv_page = &folio->page; bv.bv_len = folio_size(folio); bv.bv_offset = 0; - iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio)); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, folio_size(folio), inode->i_size, NULL, NULL, file); @@ -828,15 +828,23 @@ again: spin_unlock(&inode->i_lock); mark_inode_dirty(inode); - if (iattr->ia_valid & ATTR_MODE) - /* change mod on a file that has ACLs */ - ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); - ret = 0; out: return ret; } +int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr) +{ + int ret; + struct inode *inode = d_inode(dentry); + + ret = __orangefs_setattr(inode, iattr); + /* change mode on a file that has ACLs */ + if (!ret && (iattr->ia_valid & ATTR_MODE)) + ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + return ret; +} + /* * Change attributes of an object referenced by dentry. */ @@ -849,7 +857,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; - ret = __orangefs_setattr(d_inode(dentry), iattr); + ret = __orangefs_setattr_mode(dentry, iattr); sync_inode_metadata(d_inode(dentry), 1); out: gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", @@ -967,7 +975,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns, /* ORANGEFS2 implementation of VFS inode operations for files */ static const struct inode_operations orangefs_file_inode_operations = { - .get_acl = orangefs_get_acl, + .get_inode_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, .getattr = orangefs_getattr, @@ -1097,8 +1105,9 @@ struct inode *orangefs_iget(struct super_block *sb, * Allocate an inode for a newly created file and insert it into the inode hash. 
*/ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, - int mode, dev_t dev, struct orangefs_object_kref *ref) + umode_t mode, dev_t dev, struct orangefs_object_kref *ref) { + struct posix_acl *acl = NULL, *default_acl = NULL; unsigned long hash = orangefs_handle_hash(ref); struct inode *inode; int error; @@ -1115,6 +1124,10 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, if (!inode) return ERR_PTR(-ENOMEM); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto out_iput; + orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ @@ -1125,6 +1138,19 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_init_iops(inode); inode->i_rdev = dev; + if (default_acl) { + error = __orangefs_set_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + if (error) + goto out_iput; + } + + if (acl) { + error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_iput; + } + error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); if (error < 0) goto out_iput; @@ -1132,10 +1158,22 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, gossip_debug(GOSSIP_INODE_DEBUG, "Initializing ACL's for inode %pU\n", get_khandle_from_ino(inode)); - orangefs_init_acl(inode, dir); + if (mode != inode->i_mode) { + struct iattr iattr = { + .ia_mode = mode, + .ia_valid = ATTR_MODE, + }; + inode->i_mode = mode; + __orangefs_setattr(inode, &iattr); + __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + } + posix_acl_release(acl); + posix_acl_release(default_acl); return inode; out_iput: iput(inode); + posix_acl_release(acl); + posix_acl_release(default_acl); return ERR_PTR(error); } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 600e8eee541f..75c1a3dcf68c 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns, /* ORANGEFS implementation of VFS inode operations for directories */ const struct inode_operations orangefs_dir_inode_operations = { .lookup = orangefs_lookup, - .get_acl = orangefs_get_acl, + .get_inode_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .create = orangefs_create, .unlink = orangefs_unlink, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index b5940ec1836a..6e0cc01b3a14 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -103,13 +103,13 @@ enum orangefs_vfs_op_states { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif -extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, + struct dentry *dentry, struct posix_acl *acl, int type); +int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* * orangefs data structures @@ -356,11 +356,12 @@ void fsid_key_table_finalize(void); vm_fault_t orangefs_page_mkwrite(struct vm_fault *); struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, - int mode, + umode_t mode, dev_t dev, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); +int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr); int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); int 
orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index dd188c7996b3..6708e54b0e30 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO depends on 64BIT help If this config option is enabled then overlay filesystems will use - unused high bits in undelying filesystem inode numbers to map all + unused high bits in underlying filesystem inode numbers to map all inodes to a unified address space. The mapped 64bit inode numbers might not be compatible with applications that expect 32bit inodes. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f436d8847f08..6e4e65ee050d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -44,6 +44,35 @@ static bool ovl_must_copy_xattr(const char *name) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } +static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path, + struct dentry *dentry, const char *acl_name) +{ + int err; + struct posix_acl *clone, *real_acl = NULL; + + real_acl = ovl_get_acl_path(path, acl_name, false); + if (!real_acl) + return 0; + + if (IS_ERR(real_acl)) { + err = PTR_ERR(real_acl); + if (err == -ENODATA || err == -EOPNOTSUPP) + return 0; + return err; + } + + clone = posix_acl_clone(real_acl, GFP_KERNEL); + posix_acl_release(real_acl); /* release original acl */ + if (!clone) + return -ENOMEM; + + err = ovl_do_set_acl(ofs, dentry, acl_name, clone); + + /* release cloned acl */ + posix_acl_release(clone); + return err; +} + int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new) { struct dentry *old = oldpath->dentry; @@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de error = 0; continue; /* Discard */ } + + if (is_posix_acl_xattr(name)) { + error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name); + if (!error) + continue; + /* POSIX ACLs must be copied. */ + break; + } + retry: size = ovl_do_getxattr(oldpath, name, value, value_size); if (size == -ERANGE) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6b03457f72bb..f61e37f4c8ff 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -435,28 +435,12 @@ out: } static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry, - const char *name, const struct posix_acl *acl) + const char *acl_name, struct posix_acl *acl) { - void *buffer; - size_t size; - int err; - if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl) return 0; - size = posix_acl_xattr_size(acl->a_count); - buffer = kmalloc(size, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - if (err < 0) - goto out_free; - - err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE); -out_free: - kfree(buffer); - return err; + return ovl_do_set_acl(ofs, upperdentry, acl_name, acl); } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, @@ -592,28 +576,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, goto out_revert_creds; } - err = -ENOMEM; - override_cred = prepare_creds(); - if (override_cred) { + if (!attr->hardlink) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_revert_creds; + /* + * In the creation cases(create, mkdir, mknod, symlink), + * ovl should transfer current's fs{u,g}id to underlying + * fs. 
Because underlying fs want to initialize its new + * inode owner using current's fs{u,g}id. And in this + * case, the @inode is a new inode that is initialized + * in inode_init_owner() to current's fs{u,g}id. So use + * the inode's i_{u,g}id to override the cred's fs{u,g}id. + * + * But in the other hardlink case, ovl_link() does not + * create a new inode, so just use the ovl mounter's + * fs{u,g}id. + */ override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; - if (!attr->hardlink) { - err = security_dentry_create_files_as(dentry, - attr->mode, &dentry->d_name, old_cred, - override_cred); - if (err) { - put_cred(override_cred); - goto out_revert_creds; - } + err = security_dentry_create_files_as(dentry, + attr->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; } put_cred(override_creds(override_cred)); put_cred(override_cred); - - if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr); - else - err = ovl_create_over_whiteout(dentry, inode, attr); } + + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr); + else + err = ovl_create_over_whiteout(dentry, inode, attr); + out_revert_creds: revert_creds(old_cred); return err; @@ -1311,7 +1309,9 @@ const struct inode_operations ovl_dir_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, + .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fileattr_get = ovl_fileattr_get, .fileattr_set = ovl_fileattr_set, diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index e065a5b9a442..a25bb3453dde 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -339,7 +339,7 @@ out_iput: return dentry; } -/* Get the upper or lower dentry in stach whose on layer @idx */ +/* Get the upper or lower dentry in stack whose on layer @idx */ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) { struct ovl_entry *oe = dentry->d_fsdata; @@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, /* Get connected upper overlay dir from index */ if (index) { - struct dentry *upper = ovl_index_upper(ofs, index); + struct dentry *upper = ovl_index_upper(ofs, index, true); dput(index); if (IS_ERR_OR_NULL(upper)) @@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { - struct dentry *upper = ovl_index_upper(ofs, index); + struct dentry *upper = ovl_index_upper(ofs, index, true); err = PTR_ERR(upper); if (IS_ERR_OR_NULL(upper)) @@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) return ERR_PTR(-ENOMEM); /* Copy unaligned inner fh into aligned buffer */ - memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET); return fh; } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a1a22f58ba18..c9d0c362c7ef 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -34,7 +34,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modificaton nor notify on underlying */ +/* No atime modification nor notify on underlying */ #define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) static struct file *ovl_open_realfile(const struct file *file, @@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, 
unsigned int flags) spin_lock(&file->f_lock); file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; + file->f_iocb_flags = iocb_flags(file); spin_unlock(&file->f_lock); return 0; @@ -517,9 +518,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len const struct cred *old_cred; int ret; + inode_lock(inode); + /* Update mode */ + ovl_copyattr(inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = ovl_real_fdget(file, &real); if (ret) - return ret; + goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fallocate(real.file, mode, offset, len); @@ -530,6 +538,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len fdput(real); +out_unlock: + inode_unlock(inode); + return ret; } @@ -567,14 +578,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, const struct cred *old_cred; loff_t ret; + inode_lock(inode_out); + if (op != OVL_DEDUPE) { + /* Update mode */ + ovl_copyattr(inode_out); + ret = file_remove_privs(file_out); + if (ret) + goto out_unlock; + } + ret = ovl_real_fdget(file_out, &real_out); if (ret) - return ret; + goto out_unlock; ret = ovl_real_fdget(file_in, &real_in); if (ret) { fdput(real_out); - return ret; + goto out_unlock; } old_cred = ovl_override_creds(file_inode(file_out)->i_sb); @@ -603,6 +623,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, fdput(real_in); fdput(real_out); +out_unlock: + inode_unlock(inode_out); + return ret; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 9e61511de7a7..ee6dfa577c93 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -14,6 +14,8 @@ #include <linux/fileattr.h> #include <linux/security.h> #include <linux/namei.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include "overlayfs.h" @@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * of the POSIX ACLs retrieved from the lower layer to this function to not * alter the POSIX ACLs for the underlying filesystem. */ -static void ovl_idmap_posix_acl(struct inode *realinode, +static void ovl_idmap_posix_acl(const struct inode *realinode, struct user_namespace *mnt_userns, struct posix_acl *acl) { @@ -485,6 +487,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode, } /* + * The @noperm argument is used to skip permission checking and is a temporary + * measure. Quoting Miklos from an earlier discussion: + * + * > So there are two paths to getting an acl: + * > 1) permission checking and 2) retrieving the value via getxattr(2). + * > This is a similar situation as reading a symlink vs. following it. + * > When following a symlink overlayfs always reads the link on the + * > underlying fs just as if it was a readlink(2) call, calling + * > security_inode_readlink() instead of security_inode_follow_link(). + * > This is logical: we are reading the link from the underlying storage, + * > and following it on overlayfs. + * > + * > Applying the same logic to acl: we do need to call the + * > security_inode_getxattr() on the underlying fs, even if just want to + * > check permissions on overlay. This is currently not done, which is an + * > inconsistency. + * > + * > Maybe adding the check to ovl_get_acl() is the right way to go, but + * > I'm a little afraid of a performance regression. Will look into that. + * + * Until we have made a decision allow this helper to take the @noperm + * argument. We should hopefully be able to remove it soon. 
+ */ +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm) +{ + struct posix_acl *real_acl, *clone; + struct user_namespace *mnt_userns; + struct inode *realinode = d_inode(path->dentry); + + mnt_userns = mnt_user_ns(path->mnt); + + if (noperm) + real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); + else + real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name); + if (IS_ERR_OR_NULL(real_acl)) + return real_acl; + + if (!is_idmapped_mnt(path->mnt)) + return real_acl; + + /* + * We cannot alter the ACLs returned from the relevant layer as that + * would alter the cached values filesystem wide for the lower + * filesystem. Instead we can clone the ACLs and then apply the + * relevant idmapping of the layer. + */ + clone = posix_acl_clone(real_acl, GFP_KERNEL); + posix_acl_release(real_acl); /* release original acl */ + if (!clone) + return ERR_PTR(-ENOMEM); + + ovl_idmap_posix_acl(realinode, mnt_userns, clone); + return clone; +} + +/* * When the relevant layer is an idmapped mount we need to take the idmapping * of the layer into account and translate any ACL_{GROUP,USER} values * according to the idmapped mount. @@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode, * * This is obviously only relevant when idmapped layers are used. */ -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm) { struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl, *clone; + struct posix_acl *acl; struct path realpath; if (!IS_POSIXACL(realinode)) @@ -512,40 +574,115 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) } if (rcu) { + /* + * If the layer is idmapped drop out of RCU path walk + * so we can clone the ACLs. + */ + if (is_idmapped_mnt(realpath.mnt)) + return ERR_PTR(-ECHILD); + acl = get_cached_acl_rcu(realinode, type); } else { const struct cred *old_cred; old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); + acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); revert_creds(old_cred); } - /* - * If there are no POSIX ACLs, or we encountered an error, - * or the layer isn't idmapped we don't need to do anything. - */ - if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) - return acl; + + return acl; +} + +static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, + struct posix_acl *acl, int type) +{ + int err; + struct path realpath; + const char *acl_name; + const struct cred *old_cred; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdentry = ovl_dentry_upper(dentry); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + + err = ovl_want_write(dentry); + if (err) + return err; /* - * We only get here if the layer is idmapped. So drop out of RCU path - * walk so we can clone the ACLs. There's no need to release the ACLs - * since get_cached_acl_rcu() doesn't take a reference on the ACLs. + * If ACL is to be removed from a lower file, check if it exists in + * the first place before copying it up. 
*/ - if (rcu) - return ERR_PTR(-ECHILD); + acl_name = posix_acl_xattr_name(type); + if (!acl && !upperdentry) { + struct posix_acl *real_acl; - clone = posix_acl_clone(acl, GFP_KERNEL); - if (!clone) - clone = ERR_PTR(-ENOMEM); + ovl_path_lower(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry, + acl_name); + revert_creds(old_cred); + if (IS_ERR(real_acl)) { + err = PTR_ERR(real_acl); + goto out_drop_write; + } + posix_acl_release(real_acl); + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + realdentry = ovl_dentry_upper(dentry); + } + + old_cred = ovl_override_creds(dentry->d_sb); + if (acl) + err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); else - ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone); + err = ovl_do_remove_acl(ofs, realdentry, acl_name); + revert_creds(old_cred); + + /* copy c/mtime */ + ovl_copyattr(inode); + +out_drop_write: + ovl_drop_write(dentry); + return err; +} + +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + int err; + struct inode *inode = d_inode(dentry); + struct dentry *workdir = ovl_workdir(dentry); + struct inode *realinode = ovl_inode_real(inode); + + if (!IS_POSIXACL(d_inode(workdir))) + return -EOPNOTSUPP; + if (!realinode->i_op->set_acl) + return -EOPNOTSUPP; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + if (!inode_owner_or_capable(&init_user_ns, inode)) + return -EPERM; + /* - * Since we're not in RCU path walk we always need to release the - * original ACLs. + * Check if sgid bit needs to be cleared (actual setacl operation will + * be done with mounter's capabilities and so that won't do it for us). 
*/ - posix_acl_release(acl); - return clone; + if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { + struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; + + err = ovl_setattr(&init_user_ns, dentry, &iattr); + if (err) + return err; + } + + return ovl_set_or_remove_acl(dentry, inode, acl, type); } #endif @@ -721,7 +858,9 @@ static const struct inode_operations ovl_file_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, + .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, .fiemap = ovl_fiemap, .fileattr_get = ovl_fileattr_get, @@ -741,7 +880,9 @@ static const struct inode_operations ovl_special_inode_operations = { .permission = ovl_permission, .getattr = ovl_getattr, .listxattr = ovl_listxattr, + .get_inode_acl = ovl_get_inode_acl, .get_acl = ovl_get_acl, + .set_acl = ovl_set_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 0fd1d5fdfc72..46753134533a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -487,7 +487,8 @@ fail: } /* Get upper dentry from index */ -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected) { struct ovl_fh *fh; struct dentry *upper; @@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) * directly from the index dentry, but for dir index we first need to * decode the upper directory. */ - upper = ovl_index_upper(ofs, index); + upper = ovl_index_upper(ofs, index, false); if (IS_ERR_OR_NULL(upper)) { err = PTR_ERR(upper); /* @@ -1085,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .mnt = ovl_upper_mnt(ofs), }; + /* + * It's safe to assign upperredirect here: the previous + * assignment of happens only if upperdentry is non-NULL, and + * this one only if upperdentry is NULL. 
+ */ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index eee8f08d32b6..1df7f850ff3b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -8,6 +8,8 @@ #include <linux/uuid.h> #include <linux/fs.h> #include <linux/namei.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> #include "ovl_entry.h" #undef pr_fmt @@ -108,7 +110,7 @@ struct ovl_fh { u8 padding[3]; /* make sure fb.fid is 32bit aligned */ union { struct ovl_fb fb; - u8 buf[0]; + DECLARE_FLEX_ARRAY(u8, buf); }; } __packed; @@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); } +static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, + const char *acl_name, struct posix_acl *acl) +{ + return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl); +} + +static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, + const char *acl_name) +{ + return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name); +} + static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, struct dentry *olddentry, struct inode *newdir, struct dentry *newdentry, unsigned int flags) @@ -401,7 +415,7 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); -u64 ovl_dentry_version_get(struct dentry *dentry); +u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); @@ -525,7 +539,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set); -struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); +struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, + bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); @@ -570,9 +585,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs); * lower dir was removed under it and possibly before it was rotated from upper * to lower layer. 
*/ -static inline bool ovl_dir_is_real(struct dentry *dir) +static inline bool ovl_dir_is_real(struct inode *dir) { - return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); + return !ovl_test_flag(OVL_WHITEOUTS, dir); } /* inode.c */ @@ -594,9 +609,33 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, + bool rcu, bool noperm); +static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type, + bool rcu) +{ + return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true); +} +static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, int type) +{ + return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false); +} +int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + struct posix_acl *acl, int type); +struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, bool noperm); #else -#define ovl_get_acl NULL +#define ovl_get_inode_acl NULL +#define ovl_get_acl NULL +#define ovl_set_acl NULL +static inline struct posix_acl *ovl_get_acl_path(const struct path *path, + const char *acl_name, + bool noperm) +{ + return NULL; +} #endif int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 2b210640036c..8cd2b9947de1 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -235,15 +235,15 @@ void ovl_dir_cache_free(struct inode *inode) } } -static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(d_inode(dentry)) == cache) - ovl_set_dir_cache(d_inode(dentry), NULL); + if (ovl_dir_cache(inode) == cache) + ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); @@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; - struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file_inode(file); bool is_real; - if (cache && ovl_dentry_version_get(dentry) != cache->version) { - ovl_cache_put(od, dentry); + if (cache && ovl_inode_version_get(inode) != cache->version) { + ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } - is_real = ovl_dir_is_real(dentry); + is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) @@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; + struct inode *inode = d_inode(dentry); - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; @@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return ERR_PTR(res); } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + 
cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -596,16 +597,17 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; - cache = ovl_dir_cache(d_inode(dentry)); - if (cache && ovl_dentry_version_get(dentry) == cache->version) + cache = ovl_dir_cache(inode); + if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ - ovl_dir_cache_free(d_inode(dentry)); - ovl_set_dir_cache(d_inode(dentry), NULL); + ovl_dir_cache_free(inode); + ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) OVL_XATTR_IMPURE); ovl_drop_write(dentry); } - ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } - cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(d_inode(dentry), cache); + cache->version = ovl_inode_version_get(inode); + ovl_set_dir_cache(inode, cache); return cache; } @@ -675,7 +677,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; - struct inode *dir = d_inode(file->f_path.dentry); + struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of @@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct file *realfile; int err; - err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; @@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file) if (od->cache) { inode_lock(inode); - ovl_cache_put(od, file->f_path.dentry); + ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); @@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = ovl_dir_is_real(file->f_path.dentry); + od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; @@ -1071,14 +1073,10 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; bool incompat = false; @@ -1159,14 +1157,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); - struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { - .ctx.actor = ovl_fill_merge, - .dentry = NULL, + .ctx.actor = ovl_fill_plain, .list = &list, - .root = &root, - .is_lowest = false, }; err = ovl_dir_read(&path, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a29a8afe9b26..85b891152a2c 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -139,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, 
unsigned int flags, bool weak) { struct ovl_entry *oe = dentry->d_fsdata; + struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; int ret = 1; - upper = ovl_dentry_upper(dentry); + /* Careful in RCU mode */ + if (!inode) + return -ECHILD; + + upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); @@ -813,13 +818,11 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_DEFAULT); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = ovl_do_removexattr(ofs, work, - XATTR_NAME_POSIX_ACL_ACCESS); + err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; @@ -1001,83 +1004,6 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } -static int __maybe_unused -ovl_posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); -} - -static int __maybe_unused -ovl_posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct dentry *workdir = ovl_workdir(dentry); - struct inode *realinode = ovl_inode_real(inode); - struct posix_acl *acl = NULL; - int err; - - /* Check that everything is OK before copy-up */ - if (value) { - /* The above comment can be understood in two ways: - * - * 1. We just want to check whether the basic POSIX ACL format - * is ok. For example, if the header is correct and the size - * is sane. - * 2. We want to know whether the ACL_{GROUP,USER} entries can - * be mapped according to the underlying filesystem. - * - * Currently, we only check 1. If we wanted to check 2. we - * would need to pass the mnt_userns and the fs_userns of the - * underlying filesystem. But frankly, I think checking 1. is - * enough to start the copy-up. - */ - acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - err = -EOPNOTSUPP; - if (!IS_POSIXACL(d_inode(workdir))) - goto out_acl_release; - if (!realinode->i_op->set_acl) - goto out_acl_release; - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { - err = acl ? -EACCES : 0; - goto out_acl_release; - } - err = -EPERM; - if (!inode_owner_or_capable(&init_user_ns, inode)) - goto out_acl_release; - - posix_acl_release(acl); - - /* - * Check if sgid bit needs to be cleared (actual setacl operation will - * be done with mounter's capabilities and so that won't do it for us). 
- */ - if (unlikely(inode->i_mode & S_ISGID) && - handler->flags == ACL_TYPE_ACCESS && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { - struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - - err = ovl_setattr(&init_user_ns, dentry, &iattr); - if (err) - return err; - } - - err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); - return err; - -out_acl_release: - posix_acl_release(acl); - return err; -} - static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) @@ -1110,22 +1036,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler, return ovl_xattr_set(dentry, inode, name, value, size, flags); } -static const struct xattr_handler __maybe_unused -ovl_posix_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - -static const struct xattr_handler __maybe_unused -ovl_posix_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = ovl_posix_acl_xattr_get, - .set = ovl_posix_acl_xattr_set, -}; - static const struct xattr_handler ovl_own_trusted_xattr_handler = { .prefix = OVL_XATTR_TRUSTED_PREFIX, .get = ovl_own_xattr_get, @@ -1146,8 +1056,8 @@ static const struct xattr_handler ovl_other_xattr_handler = { static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, @@ -1156,8 +1066,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { static const struct xattr_handler *ovl_user_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL - &ovl_posix_acl_access_xattr_handler, - &ovl_posix_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 81a57a8d80d9..bde291623c8c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) * which have been copied up and have origins), so only need to note * changes to impure entries. 
*/ - if (!ovl_dir_is_real(dentry) || impurity) + if (!ovl_dir_is_real(inode) || impurity) OVL_I(inode)->version++; } @@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity) ovl_dir_version_inc(dentry, impurity); } -u64 ovl_dentry_version_get(struct dentry *dentry) +u64 ovl_inode_version_get(struct inode *inode) { - struct inode *inode = d_inode(dentry); - WARN_ON(!inode_is_locked(inode)); return OVL_I(inode)->version; } @@ -1104,13 +1102,18 @@ void ovl_copyattr(struct inode *inode) struct path realpath; struct inode *realinode; struct user_namespace *real_mnt_userns; + vfsuid_t vfsuid; + vfsgid_t vfsgid; ovl_i_path_real(inode, &realpath); realinode = d_inode(realpath.dentry); real_mnt_userns = mnt_user_ns(realpath.mnt); - inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); - inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode); + vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode); + + inode->i_uid = vfsuid_into_kuid(vfsuid); + inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 74dc0f571dc9..d7bc81fc0840 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -25,6 +25,11 @@ #include <linux/namei.h> #include <linux/mnt_idmapping.h> #include <linux/iversion.h> +#include <linux/security.h> +#include <linux/evm.h> +#include <linux/fsnotify.h> + +#include "internal.h" static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -64,7 +69,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) if (acl == ACL_DONT_CACHE) { struct posix_acl *ret; - ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); + ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU); if (!IS_ERR(ret)) acl = ret; } @@ -106,15 +111,17 @@ void forget_all_cached_acls(struct inode *inode) } EXPORT_SYMBOL(forget_all_cached_acls); -struct posix_acl *get_acl(struct inode *inode, int type) +static struct posix_acl *__get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + int type) { - void *sentinel; + struct posix_acl *sentinel; struct posix_acl **p; struct posix_acl *acl; /* * The sentinel is used to detect when another operation like - * set_cached_acl() or forget_cached_acl() races with get_acl(). + * set_cached_acl() or forget_cached_acl() races with get_inode_acl(). * It is guaranteed that is_uncached_acl(sentinel) is true. */ @@ -133,25 +140,27 @@ struct posix_acl *get_acl(struct inode *inode, int type) * current value of the ACL will not be ACL_NOT_CACHED and so our own * sentinel will not be set; another task will update the cache. We * could wait for that other task to complete its job, but it's easier - * to just call ->get_acl to fetch the ACL ourself. (This is going to - * be an unlikely race.) + * to just call ->get_inode_acl to fetch the ACL ourself. (This is + * going to be an unlikely race.) */ cmpxchg(p, ACL_NOT_CACHED, sentinel); /* - * Normally, the ACL returned by ->get_acl will be cached. + * Normally, the ACL returned by ->get{_inode}_acl will be cached. * A filesystem can prevent that by calling - * forget_cached_acl(inode, type) in ->get_acl. + * forget_cached_acl(inode, type) in ->get{_inode}_acl. * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. 
+ * If the filesystem doesn't have a get{_inode}_ acl() function at all, + * we'll just create the negative cache entry. */ - if (!inode->i_op->get_acl) { + if (dentry && inode->i_op->get_acl) { + acl = inode->i_op->get_acl(mnt_userns, dentry, type); + } else if (inode->i_op->get_inode_acl) { + acl = inode->i_op->get_inode_acl(inode, type, false); + } else { set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_acl(inode, type, false); - if (IS_ERR(acl)) { /* * Remove our sentinel so that we don't block future attempts @@ -169,7 +178,12 @@ struct posix_acl *get_acl(struct inode *inode, int type) posix_acl_release(acl); return acl; } -EXPORT_SYMBOL(get_acl); + +struct posix_acl *get_inode_acl(struct inode *inode, int type) +{ + return __get_acl(&init_user_ns, NULL, inode, type); +} +EXPORT_SYMBOL(get_inode_acl); /* * Init a fresh posix_acl @@ -578,19 +592,20 @@ EXPORT_SYMBOL(__posix_acl_chmod); * posix_acl_chmod - chmod a posix acl * * @mnt_userns: user namespace of the mount @inode was found from - * @inode: inode to check permissions on + * @dentry: dentry to check permissions on * @mode: the new mode of @inode * - * If the inode has been found through an idmapped mount the user namespace of + * If the dentry has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then * take care to map the inode according to @mnt_userns before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs init_user_ns. */ int - posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode, + posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode) { + struct inode *inode = d_inode(dentry); struct posix_acl *acl; int ret = 0; @@ -599,7 +614,7 @@ int if (!inode->i_op->set_acl) return -EOPNOTSUPP; - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR_OR_NULL(acl)) { if (acl == ERR_PTR(-EOPNOTSUPP)) return 0; @@ -609,7 +624,7 @@ int ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -629,7 +644,7 @@ posix_acl_create(struct inode *dir, umode_t *mode, if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; - p = get_acl(dir, ACL_TYPE_DEFAULT); + p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; @@ -732,118 +747,32 @@ static int posix_acl_fix_xattr_common(const void *value, size_t size) return count; } -void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - struct user_namespace *fs_userns = i_user_ns(inode); - int count; - vfsuid_t vfsuid; - vfsgid_t vfsgid; - kuid_t uid; - kgid_t gid; - - if (no_idmapping(mnt_userns, i_user_ns(inode))) - return; - - count = posix_acl_fix_xattr_common(value, size); - if (count <= 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = make_vfsuid(mnt_userns, fs_userns, uid); - entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, - 
vfsuid_into_kuid(vfsuid))); - break; - case ACL_GROUP: - gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = make_vfsgid(mnt_userns, fs_userns, gid); - entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, - vfsgid_into_kgid(vfsgid))); - break; - default: - break; - } - } -} - -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - int count; - kuid_t uid; - kgid_t gid; - - count = posix_acl_fix_xattr_common(value, size); - if (count <= 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - /** - * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the - * provided callbacks to map ACL_{GROUP,USER} entries into the - * appropriate format - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping + * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format + * @userns: the filesystem's idmapping * @value: the uapi representation of POSIX ACLs * @size: the size of @void - * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries - * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries * - * The make_posix_acl() helper is an abstraction to translate from uapi format - * into the VFS format allowing the caller to specific callbacks to map - * ACL_{GROUP,USER} entries into the expected format. This is used in - * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code - * duplication. + * Filesystems that store POSIX ACLs in the unaltered uapi format should use + * posix_acl_from_xattr() when reading them from the backing store and + * converting them into the struct posix_acl VFS format. The helper is + * specifically intended to be called from the acl inode operation. + * + * The posix_acl_from_xattr() function will map the raw {g,u}id values stored + * in ACL_{GROUP,USER} entries into idmapping in @userns. + * + * Note that posix_acl_from_xattr() does not take idmapped mounts into account. + * If it did it calling it from the get acl inode operation would return POSIX + * ACLs mapped according to an idmapped mount which would mean that the value + * couldn't be cached for the filesystem. Idmapped mounts are taken into + * account on the fly during permission checking or right at the VFS - + * userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. 
*/ -static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, const void *value, size_t size, - kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *, - const struct posix_acl_xattr_entry *), - kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *, - const struct posix_acl_xattr_entry *)) +struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, + const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; @@ -874,12 +803,14 @@ static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, break; case ACL_USER: - acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry); + acl_e->e_uid = make_kuid(userns, + le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: - acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry); + acl_e->e_gid = make_kgid(userns, + le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; break; @@ -894,181 +825,6 @@ fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } - -/** - * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and - * filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @e: a ACL_USER entry in POSIX ACL uapi format - * - * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id - * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through - * KUIDT_INIT() and then map it according to the idmapped mount. The resulting - * kuid_t is the value which the filesystem can map up into a raw backing store - * id in the filesystem's idmapping. - * - * This is used in vfs_set_acl_prepare() to generate the proper VFS - * representation of POSIX ACLs with ACL_USER entries during setxattr(). - * - * Return: A kuid in @fs_userns for the uid stored in @e. - */ -static inline kuid_t -vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id)); - return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); -} - -/** - * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and - * filesystem idmapping - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @e: a ACL_GROUP entry in POSIX ACL uapi format - * - * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id - * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through - * KGIDT_INIT() and then map it according to the idmapped mount. The resulting - * kgid_t is the value which the filesystem can map up into a raw backing store - * id in the filesystem's idmapping. - * - * This is used in vfs_set_acl_prepare() to generate the proper VFS - * representation of POSIX ACLs with ACL_GROUP entries during setxattr(). - * - * Return: A kgid in @fs_userns for the gid stored in @e. 
- */ -static inline kgid_t -vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id)); - return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); -} - -/** - * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking - * mount and filesystem idmappings into account - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * @value: the uapi representation of POSIX ACLs - * @size: the size of @void - * - * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be - * mapped according to the relevant mount- and filesystem idmapping. It is - * important that the ACL_{GROUP,USER} entries in struct posix_acl will be - * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem - * idmapping. This is crucial since the resulting struct posix_acl might be - * cached filesystem wide. The vfs_set_acl_prepare() function will take care to - * perform all necessary idmappings. - * - * Note, that since basically forever the {g,u}id values encoded as - * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain - * values that have been mapped according to the caller's idmapping. In other - * words, POSIX ACLs passed in uapi format as @value during setxattr() contain - * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have - * been stored as k{g,u}id_t. - * - * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by - * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t - * through from_vfs{g,u}id() to account for any idmapped mounts. The - * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the - * correct k{g,u}id_t. - * - * The filesystem will then receive the POSIX ACLs ready to be cached - * filesystem wide and ready to be written to the backing store taking the - * filesystem's idmapping into account. - * - * Return: Allocated struct posix_acl on success, NULL for a valid header but - * without actual POSIX ACL entries, or ERR_PTR() encoded error code. - */ -struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const void *value, size_t size) -{ - return make_posix_acl(mnt_userns, fs_userns, value, size, - vfs_set_acl_prepare_kuid, - vfs_set_acl_prepare_kgid); -} -EXPORT_SYMBOL(vfs_set_acl_prepare); - -/** - * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping - * @mnt_userns: unused - * @fs_userns: the filesystem's idmapping - * @e: a ACL_USER entry in POSIX ACL uapi format - * - * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping. - * This is used in posix_acl_from_xattr() to generate the proper VFS - * representation of POSIX ACLs with ACL_USER entries. - * - * Return: A kuid in @fs_userns for the uid stored in @e. - */ -static inline kuid_t -posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - return make_kuid(fs_userns, le32_to_cpu(e->e_id)); -} - -/** - * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping - * @mnt_userns: unused - * @fs_userns: the filesystem's idmapping - * @e: a ACL_GROUP entry in POSIX ACL uapi format - * - * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping. 
- * This is used in posix_acl_from_xattr() to generate the proper VFS - * representation of POSIX ACLs with ACL_GROUP entries. - * - * Return: A kgid in @fs_userns for the gid stored in @e. - */ -static inline kgid_t -posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, - struct user_namespace *fs_userns, - const struct posix_acl_xattr_entry *e) -{ - return make_kgid(fs_userns, le32_to_cpu(e->e_id)); -} - -/** - * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format - * @fs_userns: the filesystem's idmapping - * @value: the uapi representation of POSIX ACLs - * @size: the size of @void - * - * Filesystems that store POSIX ACLs in the unaltered uapi format should use - * posix_acl_from_xattr() when reading them from the backing store and - * converting them into the struct posix_acl VFS format. The helper is - * specifically intended to be called from the ->get_acl() inode operation. - * - * The posix_acl_from_xattr() function will map the raw {g,u}id values stored - * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The - * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the - * correct k{g,u}id_t. The returned struct posix_acl can be cached. - * - * Note that posix_acl_from_xattr() does not take idmapped mounts into account. - * If it did it calling is from the ->get_acl() inode operation would return - * POSIX ACLs mapped according to an idmapped mount which would mean that the - * value couldn't be cached for the filesystem. Idmapped mounts are taken into - * account on the fly during permission checking or right at the VFS - - * userspace boundary before reporting them to the user. - * - * Return: Allocated struct posix_acl on success, NULL for a valid header but - * without actual POSIX ACL entries, or ERR_PTR() encoded error code. - */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *fs_userns, - const void *value, size_t size) -{ - return make_posix_acl(&init_user_ns, fs_userns, value, size, - posix_acl_from_xattr_kuid, - posix_acl_from_xattr_kgid); -} EXPORT_SYMBOL (posix_acl_from_xattr); /* @@ -1113,35 +869,76 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, } EXPORT_SYMBOL (posix_acl_to_xattr); -static int -posix_acl_xattr_get(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, void *value, size_t size) -{ - struct posix_acl *acl; - int error; +/** + * vfs_posix_acl_to_xattr - convert from kernel to userspace representation + * @idmap: idmap of the mount + * @inode: inode the posix acls are set on + * @acl: the posix acls as represented by the vfs + * @buffer: the buffer into which to convert @acl + * @size: size of @buffer + * + * This converts @acl from the VFS representation in the filesystem idmapping + * to the uapi form reportable to userspace. And mount and caller idmappings + * are handled appropriately. + * + * Return: On success, the size of the stored uapi posix acls, on error a + * negative errno. 
+ */ +static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, + struct inode *inode, + const struct posix_acl *acl, void *buffer, + size_t size) - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; +{ + struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_entry *ext_entry; + struct user_namespace *fs_userns, *caller_userns; + struct user_namespace *mnt_userns; + ssize_t real_size, n; + vfsuid_t vfsuid; + vfsgid_t vfsgid; - acl = get_acl(inode, handler->flags); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); + ext_entry = (void *)(ext_acl + 1); + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - return error; + fs_userns = i_user_ns(inode); + caller_userns = current_user_ns(); + mnt_userns = mnt_idmap_owner(idmap); + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid); + ext_entry->e_id = cpu_to_le32(from_kuid( + caller_userns, vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid); + ext_entry->e_id = cpu_to_le32(from_kgid( + caller_userns, vfsgid_into_kgid(vfsgid))); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; } int -set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, +set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry, int type, struct posix_acl *acl) { + struct inode *inode = d_inode(dentry); + if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) @@ -1157,40 +954,10 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - return inode->i_op->set_acl(mnt_userns, inode, acl, type); + return inode->i_op->set_acl(mnt_userns, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); -static int -posix_acl_xattr_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, - struct dentry *unused, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - struct posix_acl *acl = NULL; - int ret; - - if (value) { - /* - * By the time we end up here the {g,u}ids stored in - * ACL_{GROUP,USER} have already been mapped according to the - * caller's idmapping. The vfs_set_acl_prepare() helper will - * recover them and take idmapped mounts into account. The - * filesystem will receive the POSIX ACLs in the correct - * format ready to be cached or written to the backing store - * taking the filesystem idmapping into account. 
- */ - acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode), - value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - ret = set_posix_acl(mnt_userns, inode, handler->flags, acl); - posix_acl_release(acl); - return ret; -} - static bool posix_acl_xattr_list(struct dentry *dentry) { @@ -1201,8 +968,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, - .get = posix_acl_xattr_get, - .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); @@ -1210,15 +975,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, - .get = posix_acl_xattr_get, - .set = posix_acl_xattr_set, }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error; + struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(mnt_userns, inode, @@ -1252,3 +1016,252 @@ int simple_acl_create(struct inode *dir, struct inode *inode) posix_acl_release(acl); return 0; } + +static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + struct posix_acl *acl) +{ + for (int n = 0; n < acl->a_count; n++) { + struct posix_acl_entry *acl_e = &acl->a_entries[n]; + + switch (acl_e->e_tag) { + case ACL_USER: + acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns, + VFSUIDT_INIT(acl_e->e_uid)); + break; + case ACL_GROUP: + acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns, + VFSGIDT_INIT(acl_e->e_gid)); + break; + } + } + + return 0; +} + +/** + * vfs_set_acl - set posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to set the posix acls + * @acl_name: the name of the posix acl + * @kacl: the posix acls in the appropriate VFS format + * + * This function sets @kacl. The caller must all posix_acl_release() on @kacl + * afterwards. + * + * Return: On success 0, on error negative errno. + */ +int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name, struct posix_acl *kacl) +{ + int acl_type; + int error; + struct inode *inode = d_inode(dentry); + struct inode *delegated_inode = NULL; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return -EINVAL; + + if (kacl) { + /* + * If we're on an idmapped mount translate from mount specific + * vfs{g,u}id_t into global filesystem k{g,u}id_t. + * Afterwards we can cache the POSIX ACLs filesystem wide and - + * if this is a filesystem with a backing store - ultimately + * translate them to backing store values. + */ + error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl); + if (error) + return error; + } + +retry_deleg: + inode_lock(inode); + + /* + * We only care about restrictions the inode struct itself places upon + * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
+ */ + error = may_write_xattr(mnt_userns, inode); + if (error) + goto out_inode_unlock; + + error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl); + if (error) + goto out_inode_unlock; + + error = try_break_deleg(inode, &delegated_inode); + if (error) + goto out_inode_unlock; + + if (inode->i_opflags & IOP_XATTR) + error = set_posix_acl(mnt_userns, dentry, acl_type, kacl); + else if (unlikely(is_bad_inode(inode))) + error = -EIO; + else + error = -EOPNOTSUPP; + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_set_acl(dentry, acl_name, kacl); + } + +out_inode_unlock: + inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_set_acl); + +/** + * vfs_get_acl - get posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to retrieve the posix acls + * @acl_name: the name of the posix acl + * + * This function retrieves @kacl from the filesystem. The caller must all + * posix_acl_release() on @kacl. + * + * Return: On success POSIX ACLs in VFS format, on error negative errno. + */ +struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *acl_name) +{ + struct inode *inode = d_inode(dentry); + struct posix_acl *acl; + int acl_type, error; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return ERR_PTR(-EINVAL); + + /* + * The VFS has no restrictions on reading POSIX ACLs so calling + * something like xattr_permission() isn't needed. Only LSMs get a say. + */ + error = security_inode_get_acl(mnt_userns, dentry, acl_name); + if (error) + return ERR_PTR(error); + + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + if (S_ISLNK(inode->i_mode)) + return ERR_PTR(-EOPNOTSUPP); + + acl = __get_acl(mnt_userns, dentry, inode, acl_type); + if (IS_ERR(acl)) + return acl; + if (!acl) + return ERR_PTR(-ENODATA); + + return acl; +} +EXPORT_SYMBOL_GPL(vfs_get_acl); + +/** + * vfs_remove_acl - remove posix acls + * @mnt_userns: user namespace of the mount + * @dentry: the dentry based on which to retrieve the posix acls + * @acl_name: the name of the posix acl + * + * This function removes posix acls. + * + * Return: On success 0, on error negative errno. + */ +int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *acl_name) +{ + int acl_type; + int error; + struct inode *inode = d_inode(dentry); + struct inode *delegated_inode = NULL; + + acl_type = posix_acl_type(acl_name); + if (acl_type < 0) + return -EINVAL; + +retry_deleg: + inode_lock(inode); + + /* + * We only care about restrictions the inode struct itself places upon + * us otherwise POSIX ACLs aren't subject to any VFS restrictions. 
+ */ + error = may_write_xattr(mnt_userns, inode); + if (error) + goto out_inode_unlock; + + error = security_inode_remove_acl(mnt_userns, dentry, acl_name); + if (error) + goto out_inode_unlock; + + error = try_break_deleg(inode, &delegated_inode); + if (error) + goto out_inode_unlock; + + if (inode->i_opflags & IOP_XATTR) + error = set_posix_acl(mnt_userns, dentry, acl_type, NULL); + else if (unlikely(is_bad_inode(inode))) + error = -EIO; + else + error = -EOPNOTSUPP; + if (!error) { + fsnotify_xattr(dentry); + evm_inode_post_remove_acl(mnt_userns, dentry, acl_name); + } + +out_inode_unlock: + inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_remove_acl); + +int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, const void *kvalue, size_t size) +{ + int error; + struct posix_acl *acl = NULL; + + if (size) { + /* + * Note that posix_acl_from_xattr() uses GFP_NOFS when it + * probably doesn't need to here. + */ + acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl); + posix_acl_release(acl); + return error; +} + +ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, + const char *acl_name, void *kvalue, size_t size) +{ + ssize_t error; + struct posix_acl *acl; + + acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), + acl, kvalue, size); + posix_acl_release(acl); + return error; +} diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index fa762c5fbcb2..91fe1597af7b 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { @@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v) static int __init proc_cmdline_init(void) { - proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index dfe6ce3505ce..e0758fe7936d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v) if (con->device) { const struct tty_driver *driver; int index; + + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ + console_lock(); driver = con->device(con, &index); + console_unlock(); + if (driver) { dev = MKDEV(driver->major, driver->minor_start); dev += index; @@ -63,7 +72,12 @@ static void *c_start(struct seq_file *m, loff_t *pos) struct console *con; loff_t off = 0; - console_lock(); + /* + * Hold the console_list_lock to guarantee safe traversal of the + * console list. SRCU cannot be used because there is no + * place to store the SRCU cookie. 
+ */ + console_list_lock(); for_each_console(con) if (off++ == *pos) break; @@ -74,13 +88,14 @@ static void *c_start(struct seq_file *m, loff_t *pos) static void *c_next(struct seq_file *m, void *v, loff_t *pos) { struct console *con = v; + ++*pos; - return con->next; + return hlist_entry_safe(con->node.next, struct console, node); } static void c_stop(struct seq_file *m, void *v) { - console_unlock(); + console_list_unlock(); } static const struct seq_operations consoles_op = { diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36..fc46d6fe080c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -7,6 +7,7 @@ #include <linux/namei.h> #include <linux/pid.h> #include <linux/ptrace.h> +#include <linux/bitmap.h> #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> @@ -279,6 +280,30 @@ out: return 0; } +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + static int proc_readfd(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); @@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } +static int proc_fd_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + int rv = 0; + + generic_fillattr(&init_user_ns, inode, stat); + + /* If it's a directory, put the number of open fds there */ + if (S_ISDIR(inode->i_mode)) { + rv = proc_readfd_count(inode, &stat->size); + if (rv < 0) + return rv; + } + + return rv; +} + const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..09a81e4b1273 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, false); } @@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) struct kvec kvec = { .iov_base = buf, .iov_len = count }; struct iov_iter iter; - iov_iter_kvec(&iter, READ, &kvec, 1, count); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); @@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) offset = (loff_t) index << PAGE_SHIFT; kvec.iov_base = page_address(page); kvec.iov_len = PAGE_SIZE; - iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE); rc = __read_vmcore(&iter, &offset); if (rc < 0) { @@ -1567,6 +1567,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { + elfcorehdr_free(elfcorehdr_addr); pr_warn("Kdump: vmcore not initialized\n"); return rc; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0c034ea39954..cbc0b468c1ab 100644 --- 
a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -89,6 +89,11 @@ static char *compress = module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); +/* How much of the kernel log to snapshot */ +unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, ulong, 0444); +MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); + /* Compression parameters */ static struct crypto_comp *tfm; @@ -100,9 +105,6 @@ struct pstore_zbackend { static char *big_oops_buf; static size_t big_oops_buf_sz; -/* How much of the console log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; - void pstore_set_kmsg_bytes(int bytes) { kmsg_bytes = bytes; @@ -391,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, const char *why; unsigned int part = 1; unsigned long flags = 0; + int saved_ret = 0; int ret; why = kmsg_dump_reason_str(reason); @@ -461,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; pstore_timer_kick(); + } else { + /* Preserve only the first non-zero returned value. */ + if (!saved_ret) + saved_ret = ret; } total += record.size; part++; } spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + if (saved_ret) { + pr_err_once("backend (%s) writing error (%d)\n", psinfo->name, + saved_ret); + } } static struct kmsg_dumper pstore_dumper = { @@ -562,8 +574,9 @@ out: int pstore_register(struct pstore_info *psi) { if (backend && strcmp(backend, psi->name)) { - pr_warn("ignoring unexpected backend '%s'\n", psi->name); - return -EPERM; + pr_warn("backend '%s' already in use: ignoring '%s'\n", + backend, psi->name); + return -EBUSY; } /* Sanity check flags. */ @@ -662,6 +675,8 @@ void pstore_unregister(struct pstore_info *psi) psinfo = NULL; kfree(backend); backend = NULL; + + pr_info("Unregistered %s as persistent store backend\n", psi->name); mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index fefe3d391d3a..9a5052431fd3 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -18,10 +18,11 @@ #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/compiler.h> -#include <linux/pstore_ram.h> #include <linux/of.h> #include <linux/of_address.h> + #include "internal.h" +#include "ram_internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -451,20 +452,28 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; + /* Free pmsg PRZ */ + persistent_ram_free(&cxt->mprz); + + /* Free console PRZ */ + persistent_ram_free(&cxt->cprz); + /* Free dump PRZs */ if (cxt->dprzs) { for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->dprzs[i]); + persistent_ram_free(&cxt->dprzs[i]); kfree(cxt->dprzs); + cxt->dprzs = NULL; cxt->max_dump_cnt = 0; } /* Free ftrace PRZs */ if (cxt->fprzs) { for (i = 0; i < cxt->max_ftrace_cnt; i++) - persistent_ram_free(cxt->fprzs[i]); + persistent_ram_free(&cxt->fprzs[i]); kfree(cxt->fprzs); + cxt->fprzs = NULL; cxt->max_ftrace_cnt = 0; } } @@ -548,9 +557,10 @@ static int ramoops_init_przs(const char *name, while (i > 0) { i--; - persistent_ram_free(prz_ar[i]); + persistent_ram_free(&prz_ar[i]); } kfree(prz_ar); + prz_ar = NULL; goto fail; } *paddr += zone_sz; @@ -735,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev) /* Make sure we didn't get bogus platform data pointer. 
*/ if (!pdata) { pr_err("NULL platform data\n"); + err = -EINVAL; goto fail_out; } @@ -742,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev) !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); + err = -EINVAL; goto fail_out; } @@ -772,12 +784,17 @@ static int ramoops_probe(struct platform_device *pdev) dump_mem_sz, cxt->record_size, &cxt->max_dump_cnt, 0, 0); if (err) - goto fail_out; + goto fail_init; err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) - goto fail_init_cprz; + goto fail_init; + + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); + if (err) + goto fail_init; cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? nr_cpu_ids @@ -788,12 +805,7 @@ static int ramoops_probe(struct platform_device *pdev) (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) ? PRZ_FLAG_NO_LOCK : 0); if (err) - goto fail_init_fprz; - - err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, - cxt->pmsg_size, 0); - if (err) - goto fail_init_mprz; + goto fail_init; cxt->pstore.data = cxt; /* @@ -857,11 +869,7 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); -fail_init_mprz: -fail_init_fprz: - persistent_ram_free(cxt->cprz); -fail_init_cprz: +fail_init: ramoops_free_przs(cxt); fail_out: return err; @@ -876,8 +884,6 @@ static int ramoops_remove(struct platform_device *pdev) kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; - persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); return 0; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index a89e33719fcf..966191d3a5ba 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -13,13 +13,14 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/pstore_ram.h> #include <linux/rslib.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <asm/page.h> +#include "ram_internal.h" + /** * struct persistent_ram_buffer - persistent circular RAM buffer * @@ -439,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size, phys_addr_t addr = page_start + i * PAGE_SIZE; pages[i] = pfn_to_page(addr >> PAGE_SHIFT); } - vaddr = vmap(pages, page_count, VM_MAP, prot); + /* + * VM_IOREMAP used here to bypass this region during vread() + * and kmap_atomic() (i.e. kcore) to avoid __va() failures. 
+ */ + vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot); kfree(pages); /* @@ -543,8 +548,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, return 0; } -void persistent_ram_free(struct persistent_ram_zone *prz) +void persistent_ram_free(struct persistent_ram_zone **_prz) { + struct persistent_ram_zone *prz; + + if (!_prz) + return; + + prz = *_prz; if (!prz) return; @@ -568,6 +579,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) persistent_ram_free_old(prz); kfree(prz->label); kfree(prz); + *_prz = NULL; } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, @@ -604,6 +616,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, return prz; err: - persistent_ram_free(prz); + persistent_ram_free(&prz); return ERR_PTR(ret); } diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h new file mode 100644 index 000000000000..5f694698351f --- /dev/null +++ b/fs/pstore/ram_internal.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com> + * Copyright (C) 2011 Kees Cook <keescook@chromium.org> + * Copyright (C) 2011 Google, Inc. + */ + +#include <linux/pstore_ram.h> + +/* + * Choose whether access to the RAM zone requires locking or not. If a zone + * can be written to from different CPUs like with ftrace for example, then + * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required. + */ +#define PRZ_FLAG_NO_LOCK BIT(0) +/* + * If a PRZ should only have a single-boot lifetime, this marks it as + * getting wiped after its contents get copied out after boot. + */ +#define PRZ_FLAG_ZAP_OLD BIT(1) + +/** + * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ) + * used as a pstore backend + * + * @paddr: physical address of the mapped RAM area + * @size: size of mapping + * @label: unique name of this PRZ + * @type: frontend type for this PRZ + * @flags: holds PRZ_FLAGS_* bits + * + * @buffer_lock: + * locks access to @buffer "size" bytes and "start" offset + * @buffer: + * pointer to actual RAM area managed by this PRZ + * @buffer_size: + * bytes in @buffer->data (not including any trailing ECC bytes) + * + * @par_buffer: + * pointer into @buffer->data containing ECC bytes for @buffer->data + * @par_header: + * pointer into @buffer->data containing ECC bytes for @buffer header + * (i.e. 
all fields up to @data) + * @rs_decoder: + * RSLIB instance for doing ECC calculations + * @corrected_bytes: + * ECC corrected bytes accounting since boot + * @bad_blocks: + * ECC uncorrectable bytes accounting since boot + * @ecc_info: + * ECC configuration details + * + * @old_log: + * saved copy of @buffer->data prior to most recent wipe + * @old_log_size: + * bytes contained in @old_log + * + */ +struct persistent_ram_zone { + phys_addr_t paddr; + size_t size; + void *vaddr; + char *label; + enum pstore_type_id type; + u32 flags; + + raw_spinlock_t buffer_lock; + struct persistent_ram_buffer *buffer; + size_t buffer_size; + + char *par_buffer; + char *par_header; + struct rs_control *rs_decoder; + int corrected_bytes; + int bad_blocks; + struct persistent_ram_ecc_info ecc_info; + + char *old_log; + size_t old_log_size; +}; + +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype, u32 flags, char *label); +void persistent_ram_free(struct persistent_ram_zone **_prz); +void persistent_ram_zap(struct persistent_ram_zone *prz); + +int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, + unsigned int count); +int persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count); + +void persistent_ram_save_old(struct persistent_ram_zone *prz); +size_t persistent_ram_old_size(struct persistent_ram_zone *prz); +void *persistent_ram_old(struct persistent_ram_zone *prz); +void persistent_ram_free_old(struct persistent_ram_zone *prz); +ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, + char *str, size_t len); diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 017d0d4ad329..2770746bb7aa 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, /* avoid destroying old data, allocate a new one */ len = zone->buffer_size + sizeof(*zone->buffer); zone->oldbuf = zone->buffer; - zone->buffer = kzalloc(len, GFP_KERNEL); + zone->buffer = kzalloc(len, GFP_ATOMIC); if (!zone->buffer) { zone->buffer = zone->oldbuf; return -ENOMEM; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0427b44bfee5..f27faf5db554 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); + if (is_bad_inode(inode)) + return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) diff --git a/fs/read_write.c b/fs/read_write.c index 24b9668d6377..7a2ff6157eda 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_ubuf(&iter, READ, buf, len); + iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? 
*pos : 0; - iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) @@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len); + iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -533,7 +533,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* @@ -911,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { ret = do_iter_read(file, &iter, pos, flags); kfree(iov); @@ -928,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, struct iov_iter iter; ssize_t ret; - ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { file_start_write(file); ret = do_iter_write(file, &iter, pos, flags); diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index d9052b8ce6dd..29c503a06db4 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); -int reiserfs_acl_chmod(struct inode *inode); +int reiserfs_acl_chmod(struct dentry *dentry); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); @@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir); #define reiserfs_get_acl NULL #define reiserfs_set_acl NULL -static inline int reiserfs_acl_chmod(struct inode *inode) +static inline int reiserfs_acl_chmod(struct dentry *dentry) { return 0; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6e228bfbe7ef..467d13da198f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index b9580a6515ee..c7d1fa526dea 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!error && reiserfs_posixacl(inode->i_sb)) { if (attr->ia_valid & ATTR_MODE) - error = reiserfs_acl_chmod(inode); + error = reiserfs_acl_chmod(dentry); } out: diff --git 
a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 3d7a35d6a18b..4d428e8704bc 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1659,7 +1659,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, @@ -1683,6 +1683,6 @@ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, + .get_inode_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d6fcddc46f5b..93fe414fed18 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { int error, error2; @@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, size_t jcreate_blocks; int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; int update_mode = 0; + struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; /* @@ -371,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode) if (IS_PRIVATE(inode)) return 0; - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (acl && !IS_ERR(acl)) { int size = reiserfs_acl_size(acl->a_count); @@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode) /* * Called under i_mutex */ -int reiserfs_acl_chmod(struct inode *inode) +int reiserfs_acl_chmod(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); + if (IS_PRIVATE(inode)) return 0; if (get_inode_sd_version(inode) == STAT_DATA_V1 || !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); } diff --git a/fs/remap_range.c b/fs/remap_range.c index 654912d06862..290743c8d226 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -429,7 +429,7 @@ static bool allow_file_dedupe(struct file *file) return true; if (file->f_mode & FMODE_WRITE) return true; - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) + if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid())) return true; if (!inode_permission(mnt_userns, inode, MAY_WRITE)) return true; diff --git a/fs/seq_file.c b/fs/seq_file.c index 9456a2032224..f5fdaf3b1572 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) ssize_t ret; init_sync_kiocb(&kiocb, file); - iov_iter_init(&iter, READ, &iov, 1, size); + iov_iter_init(&iter, ITER_DEST, &iov, 1, size); kiocb.ki_pos = *ppos; ret = seq_read_iter(&kiocb, &iter); diff --git a/fs/splice.c b/fs/splice.c index 0878b852b355..5969b7a1d353 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int ret; - iov_iter_pipe(&to, READ, pipe, len); + iov_iter_pipe(&to, ITER_DEST, pipe, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = 
call_read_iter(in, &kiocb, &to); @@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, n++; } - iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); + iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; @@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type) if (!f.file) return -EBADF; if (f.file->f_mode & FMODE_WRITE) { - *type = WRITE; + *type = ITER_SOURCE; } else if (f.file->f_mode & FMODE_READ) { - *type = READ; + *type = ITER_DEST; } else { fdput(f); return -EBADF; @@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, if (!iov_iter_count(&iter)) error = 0; - else if (iov_iter_rw(&iter) == WRITE) + else if (type == ITER_SOURCE) error = vmsplice_to_pipe(f.file, &iter, flags); else error = vmsplice_to_user(f.file, &iter, flags); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 916e78fabcaa..60fc98bdf421 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -54,9 +54,35 @@ config SQUASHFS_FILE_DIRECT endchoice +config SQUASHFS_DECOMP_SINGLE + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI_PERCPU + depends on SQUASHFS + def_bool n + +config SQUASHFS_CHOICE_DECOMP_BY_MOUNT + bool "Select the parallel decompression mode during mount" + depends on SQUASHFS + default n + select SQUASHFS_DECOMP_SINGLE + select SQUASHFS_DECOMP_MULTI + select SQUASHFS_DECOMP_MULTI_PERCPU + select SQUASHFS_MOUNT_DECOMP_THREADS + help + Compile all parallel decompression modes and specify the + decompression mode by setting "threads=" during mount. + default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE + choice - prompt "Decompressor parallelisation options" + prompt "Select decompression parallel mode at compile time" depends on SQUASHFS + depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT help Squashfs now supports three parallelisation options for decompression. Each one exhibits various trade-offs between @@ -64,15 +90,17 @@ choice If in doubt, select "Single threaded compression" -config SQUASHFS_DECOMP_SINGLE +config SQUASHFS_COMPILE_DECOMP_SINGLE bool "Single threaded compression" + select SQUASHFS_DECOMP_SINGLE help Traditionally Squashfs has used single-threaded decompression. Only one block (data or metadata) can be decompressed at any one time. This limits CPU and memory usage to a minimum. -config SQUASHFS_DECOMP_MULTI +config SQUASHFS_COMPILE_DECOMP_MULTI bool "Use multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -85,8 +113,9 @@ config SQUASHFS_DECOMP_MULTI decompressors per core. It dynamically allocates decompressors on a demand basis. -config SQUASHFS_DECOMP_MULTI_PERCPU +config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU bool "Use percpu multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI_PERCPU help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -95,9 +124,21 @@ config SQUASHFS_DECOMP_MULTI_PERCPU This decompressor implementation uses a maximum of one decompressor per core. It uses percpu variables to ensure decompression is load-balanced across the cores. 
- endchoice +config SQUASHFS_MOUNT_DECOMP_THREADS + bool "Add the mount parameter 'threads=' for squashfs" + depends on SQUASHFS + depends on SQUASHFS_DECOMP_MULTI + default n + help + Use threads= to set the decompression parallel mode and the number of threads. + If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y + threads=<single|multi|percpu|1|2|3|...> + else + threads=<2|3|...> + The upper limit is num_online_cpus() * 2. + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 833aca92301f..bed3bb8b27fa 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - res = squashfs_decompress(msblk, bio, offset, length, output); + res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d57bef91ab08..8893cb9b4198 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) if (IS_ERR(comp_opts)) return comp_opts; - stream = squashfs_decompressor_create(msblk, comp_opts); + stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index db9f12a3ea05..416c53eedbd1 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -29,12 +29,11 @@ #define MAX_DECOMPRESSOR (num_online_cpus() * 2) -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return MAX_DECOMPRESSOR; } - struct squashfs_stream { void *comp_opts; struct list_head strm_list; @@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm, wake_up(&stream->wait); } -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -103,7 +102,7 @@ out: } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { @@ -145,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, * If there is no available decomp and already full, * let's wait for releasing decomp from other users. 
*/ - if (stream->avail_decomp >= MAX_DECOMPRESSOR) + if (stream->avail_decomp >= msblk->max_thread_num) goto wait; /* Let's allocate new decomp */ @@ -161,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, } stream->avail_decomp++; - WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + WARN_ON(stream->avail_decomp > msblk->max_thread_num); mutex_unlock(&stream->mutex); break; @@ -180,7 +179,7 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, msblk->decompressor->name); return res; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index b881b9283b7f..1dfadf76ed9a 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -25,7 +25,7 @@ struct squashfs_stream { local_lock_t lock; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -59,7 +59,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream __percpu *percpu = (struct squashfs_stream __percpu *) msblk->stream; @@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; int res; - local_lock(&msblk->stream->lock); - stream = this_cpu_ptr(msblk->stream); + local_lock(&percpu->lock); + stream = this_cpu_ptr(percpu); res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); - local_unlock(&msblk->stream->lock); + local_unlock(&percpu->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", @@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return num_possible_cpus(); } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 4eb3d083d45e..6f161887710b 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -24,7 +24,7 @@ struct squashfs_stream { struct mutex mutex; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { 
struct squashfs_stream *stream; @@ -49,7 +49,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; @@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return 1; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9783e01c8100..a6164fdf9435 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ -extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); -extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, - int, int, struct squashfs_page_actor *); -extern int squashfs_max_decompressors(void); + +struct squashfs_decompressor_thread_ops { + void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts); + void (*destroy)(struct squashfs_sb_info *msblk); + int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output); + int (*max_decompressors)(void); +}; + +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu; +#endif /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1e90c2575f9b..659082e9e51d 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -53,7 +53,7 @@ struct squashfs_sb_info { __le64 *xattr_id_table; struct mutex meta_index_mutex; struct meta_index *meta_index; - struct squashfs_stream *stream; + void *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; @@ -66,5 +66,7 @@ struct squashfs_sb_info { int xattr_ids; unsigned int ids; bool panic_on_errors; + const struct squashfs_decompressor_thread_ops *thread_ops; + int max_thread_num; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..e090fae48e68 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -47,10 +47,13 @@ enum Opt_errors { enum squashfs_param { Opt_errors, + Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; + const struct squashfs_decompressor_thread_ops *thread_ops; + int thread_num; }; static const struct 
constant_table squashfs_param_errors[] = { @@ -61,9 +64,66 @@ static const struct constant_table squashfs_param_errors[] = { static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), + fsparam_string("threads", Opt_threads), {} }; + +static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (strcmp(str, "single") == 0) { + opts->thread_ops = &squashfs_decompressor_single; + return 0; + } + if (strcmp(str, "multi") == 0) { + opts->thread_ops = &squashfs_decompressor_multi; + return 0; + } + if (strcmp(str, "percpu") == 0) { + opts->thread_ops = &squashfs_decompressor_percpu; + return 0; + } +#endif + return -EINVAL; +} + +static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + int ret; + unsigned long num; + + ret = kstrtoul(str, 0, &num); + if (ret != 0) + return -EINVAL; + if (num > 1) { + opts->thread_ops = &squashfs_decompressor_multi; + if (num > opts->thread_ops->max_decompressors()) + return -EINVAL; + opts->thread_num = (int)num; + return 0; + } +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + if (num == 1) { + opts->thread_ops = &squashfs_decompressor_single; + opts->thread_num = 1; + return 0; + } +#endif +#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ + return -EINVAL; +} + +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ + int ret = squashfs_parse_param_threads_str(str, opts); + + if (ret == 0) + return ret; + return squashfs_parse_param_threads_num(str, opts); +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -78,6 +138,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para case Opt_errors: opts->errors = result.uint_32; break; + case Opt_threads: + if (squashfs_parse_param_threads(param->string, opts) != 0) + return -EINVAL; + break; default: return -EINVAL; } @@ -133,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; } msblk = sb->s_fs_info; + msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); @@ -168,6 +233,12 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } + if (opts->thread_num == 0) { + msblk->max_thread_num = msblk->thread_ops->max_decompressors(); + } else { + msblk->max_thread_num = opts->thread_num; + } + /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( fc, @@ -252,7 +323,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - squashfs_max_decompressors(), msblk->block_size); + msblk->max_thread_num, msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -383,7 +454,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_destroy(msblk); + msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -435,6 +506,19 @@ static int squashfs_show_options(struct seq_file *s, struct dentry 
*root) else seq_puts(s, ",errors=continue"); +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (msblk->thread_ops == &squashfs_decompressor_single) { + seq_puts(s, ",threads=single"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_percpu) { + seq_puts(s, ",threads=percpu"); + return 0; + } +#endif +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + seq_printf(s, ",threads=%d", msblk->max_thread_num); +#endif return 0; } @@ -446,6 +530,16 @@ static int squashfs_init_fs_context(struct fs_context *fc) if (!opts) return -ENOMEM; +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + opts->thread_ops = &squashfs_decompressor_single; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) + opts->thread_ops = &squashfs_decompressor_multi; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) + opts->thread_ops = &squashfs_decompressor_percpu; +#else +#error "fail: unknown squashfs decompression thread mode?" +#endif + opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; @@ -478,7 +572,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_destroy(sbi); + sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); @@ -568,7 +662,7 @@ static struct file_system_type squashfs_fs_type = { .init_fs_context = squashfs_init_fs_context, .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("squashfs"); diff --git a/fs/stat.c b/fs/stat.c index ef50573c72a2..d6cc74ca8486 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -44,12 +44,15 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, struct kstat *stat) { + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; diff --git a/fs/super.c b/fs/super.c index 8d39e4f11cfa..12c08cb20405 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1112,55 +1112,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -/** - * vfs_get_super - Get a superblock with a search key set in s_fs_info. - * @fc: The filesystem context holding the parameters - * @keying: How to distinguish superblocks - * @fill_super: Helper to initialise a new superblock - * - * Search for a superblock and create a new one if not found. The search - * criterion is controlled by @keying. If the search fails, a new superblock - * is created and @fill_super() is called to initialise it. - * - * @keying can take one of a number of values: - * - * (1) vfs_get_single_super - Only one superblock of this type may exist on the - * system. This is typically used for special system filesystems. - * - * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have - * distinct keys (where the key is in s_fs_info). Searching for the same - * key again will turn up the superblock for that key. - * - * (3) vfs_get_independent_super - Multiple superblocks may exist and are - * unkeyed. 
Each call will get a new superblock. - * - * A permissions check is made by sget_fc() unless we're getting a superblock - * for a kernel-internal mount or a submount. - */ -int vfs_get_super(struct fs_context *fc, - enum vfs_get_super_keying keying, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) +static int vfs_get_super(struct fs_context *fc, bool reconf, + int (*test)(struct super_block *, struct fs_context *), + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) { - int (*test)(struct super_block *, struct fs_context *); struct super_block *sb; int err; - switch (keying) { - case vfs_get_single_super: - case vfs_get_single_reconf_super: - test = test_single_super; - break; - case vfs_get_keyed_super: - test = test_keyed_super; - break; - case vfs_get_independent_super: - test = NULL; - break; - default: - BUG(); - } - sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -1174,7 +1133,7 @@ int vfs_get_super(struct fs_context *fc, fc->root = dget(sb->s_root); } else { fc->root = dget(sb->s_root); - if (keying == vfs_get_single_reconf_super) { + if (reconf) { err = reconfigure_super(fc); if (err < 0) { dput(fc->root); @@ -1190,13 +1149,12 @@ error: deactivate_locked_super(sb); return err; } -EXPORT_SYMBOL(vfs_get_super); int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, vfs_get_independent_super, fill_super); + return vfs_get_super(fc, false, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); @@ -1204,7 +1162,7 @@ int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, vfs_get_single_super, fill_super); + return vfs_get_super(fc, false, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); @@ -1212,7 +1170,7 @@ int get_tree_single_reconf(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super); + return vfs_get_super(fc, true, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single_reconf); @@ -1222,7 +1180,7 @@ int get_tree_keyed(struct fs_context *fc, void *key) { fc->s_fs_info = key; - return vfs_get_super(fc, vfs_get_keyed_super, fill_super); + return vfs_get_super(fc, false, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index d4ec9bb97de9..3b8567564e7e 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) res += blocks; direct = 1; } - return blocks; + return res; } int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 3f128b9fdfbb..9c9d3f0e36a4 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!(prandom_u32_max(out_of) + 1 <= n); + return !!(get_random_u32_below(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32_max(60000); + delay = get_random_u32_below(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32_max(10000); + delay = 
get_random_u32_below(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32_max(len); + from = get_random_u32_below(len); /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index cfbc31f709f4..c4d079328b92 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (prandom_u32_max(4)) + if (get_random_u32_below(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; + c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 01362ad5f804..a55e04822d16 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !prandom_u32_max(8)) + if (dbg_is_chk_index(c) && !get_random_u32_below(8)) return -ENOSPC; return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index dce6ae9ae306..1d7c2a812fc1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, udf_get_block, wbc); -} - static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, - .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = generic_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, + .migrate_folio = buffer_migrate_folio, }; /* @@ -439,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } + /* + * Block beyond EOF and prealloc extents? Just discard preallocation + * as it is not useful and complicates things. 
+ */ + if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + udf_discard_prealloc(inode); udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) @@ -488,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode, uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - struct kernel_lb_addr prealloc_loc = {}; - uint32_t prealloc_len = 0; struct udf_inode_info *iinfo; int err; @@ -510,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } - /* Last extent are just preallocated blocks? */ - if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == - EXT_NOT_RECORDED_ALLOCATED) { - /* Save the extent so that we can reattach it to the end */ - prealloc_loc = last_ext->extLocation; - prealloc_len = last_ext->extLength; - /* Mark the extent as a hole */ - last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); - last_ext->extLocation.logicalBlockNum = 0; - last_ext->extLocation.partitionReferenceNum = 0; - } - /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -550,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode, * more extents, we may need to enter possible following * empty indirect extent. */ - if (new_block_bytes || prealloc_len) + if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } @@ -584,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode, } out: - /* Do we have some preallocated blocks saved? */ - if (prealloc_len) { - err = udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1); - if (err) - return err; - last_ext->extLocation = prealloc_loc; - last_ext->extLength = prealloc_len; - count++; - } - /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); @@ -610,13 +585,17 @@ out: static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, - uint32_t final_block_len) + uint32_t new_elen) { - struct super_block *sb = inode->i_sb; uint32_t added_bytes; - added_bytes = final_block_len - - (last_ext->extLength & (sb->s_blocksize - 1)); + /* + * Extent already large enough? It may be already rounded up to block + * size... + */ + if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) + return; + added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; @@ -633,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; - unsigned long partial_final_block; + loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; - int within_final_block; + bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -647,8 +626,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + /* + * When creating hole in file, just don't bother with preserving + * preallocation. It likely won't be very useful anyway. 
+ */ + udf_discard_prealloc(inode); + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); - within_final_block = (etype != -1); + within_last_ext = (etype != -1); + /* We don't expect extents past EOF... */ + WARN_ON_ONCE(within_last_ext && + elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { @@ -664,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) extent.extLength |= etype << 30; } - partial_final_block = newsize & (sb->s_blocksize - 1); + new_elen = ((loff_t)offset << inode->i_blkbits) | + (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ - if (within_final_block) { + if (within_last_ext) { /* Extending file within the last file block */ - udf_do_extend_final_block(inode, &epos, &extent, - partial_final_block); + udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { - loff_t add = ((loff_t)offset << sb->s_blocksize_bits) | - partial_final_block; - err = udf_do_extend_file(inode, &epos, &extent, add); + err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) @@ -777,10 +763,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goto out_free; } - /* Are we beyond EOF? */ + /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { int ret; loff_t hole_len; + isBeyondEOF = true; if (count) { if (c) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ae7bc13a5298..7c95c549dd64 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, return -EINVAL; ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (IS_ERR(ofi)) { - retval = PTR_ERR(ofi); + if (!ofi || IS_ERR(ofi)) { + if (IS_ERR(ofi)) + retval = PTR_ERR(ofi); goto end_rename; } @@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, brelse(ofibh.sbh); tloc = lelb_to_cpu(ocfi.icb.extLocation); - if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) - != old_inode->i_ino) + if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); diff --git a/fs/udf/super.c b/fs/udf/super.c index 4042d9739fb7..06eda8177b5f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode) static void init_once(void *foo) { - struct udf_inode_info *ei = (struct udf_inode_info *)foo; + struct udf_inode_info *ei = foo; ei->i_data = NULL; inode_init_once(&ei->vfs_inode); @@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb, struct kernel_lb_addr *fileset, struct kernel_lb_addr *root) { - struct buffer_head *bh = NULL; + struct buffer_head *bh; uint16_t ident; int ret; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 532cda99644e..036ebd892b85 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -120,60 +120,42 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { - struct extent_position epos = { NULL, 0, {0, 0} }; + struct extent_position epos = {}; + struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; - int adsize; struct udf_inode_info *iinfo = UDF_I(inode); + int bsize = 1 << 
inode->i_blkbits; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || - inode->i_size == iinfo->i_lenExtents) + ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(struct short_ad); - else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(struct long_ad); - else - adsize = 0; - epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - etype = netype; + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + brelse(prev_epos.bh); + prev_epos = epos; + if (prev_epos.bh) + get_bh(prev_epos.bh); + + etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { - epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, &eloc, etype, elen, 0); - if (!epos.bh) { - iinfo->i_lenAlloc = - epos.offset - - udf_file_entry_alloc_offset(inode); - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *)(epos.bh->b_data); - aed->lengthAllocDescs = - cpu_to_le32(epos.offset - - sizeof(struct allocExtDesc)); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos.bh->b_data, epos.offset); - else - udf_update_tag(epos.bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos.bh, inode); - } + udf_delete_aext(inode, prev_epos); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, + DIV_ROUND_UP(elen, 1 << inode->i_blkbits)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); + brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4fa620543d30..291b56dd011e 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -6,7 +6,11 @@ #include <linux/bitops.h> #include <linux/magic.h> -#define UDF_MAX_READ_VERSION 0x0250 +/* + * Even UDF 2.6 media should have version <= 0x250 but apparently there are + * some broken filesystems with version set to 0x260. Accommodate those. + */ +#define UDF_MAX_READ_VERSION 0x0260 #define UDF_MAX_WRITE_VERSION 0x0201 #define UDF_FLAG_USE_EXTENDED_FE 0 diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index dbe1ce5b450a..c7fcb855e068 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -32,6 +32,11 @@ struct fsverity_hash_alg { unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ mempool_t req_pool; /* mempool with a preallocated hash request */ + /* + * The HASH_ALGO_* constant for this algorithm. This is different from + * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. 
+ */ + enum hash_algo algo_id; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 71d0fccb6d4c..6f8170cf4ae7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA512, }, }; @@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void) */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); + + /* Verify that there is a valid mapping to HASH_ALGO_*. */ + BUG_ON(alg->algo_id == 0); + BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } } diff --git a/fs/verity/measure.c b/fs/verity/measure.c index e99c00350c28..5c79ea1b2468 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); * @alg: (out) pointer to the hash algorithm enumeration * * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling fsverity_get_digest(), the file must have been - * opened. + * Assumption: before calling this, the file must have been opened. * * Return: 0 on success, -errno on failure */ @@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode, { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; - int i; vi = fsverity_get_info(inode); if (!vi) return -ENODATA; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE); - - /* convert the verity hash algorithm name to a hash_algo_name enum */ - i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name); - if (i < 0) - return -EINVAL; - *alg = i; - - if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg])) - return -EINVAL; memcpy(digest, vi->file_digest, hash_alg->digest_size); - - pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg], - hash_digest_size[*alg], digest); - + *alg = hash_alg->algo_id; return 0; } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index bde8c9b7d25f..961ba248021f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -200,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. Pages - * that fail verification are set to the Error state. Verification is skipped - * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * must be pagecache pages that are still locked and not yet uptodate. If a + * page fails verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. 
Filesystems that @@ -244,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio) unsigned long level0_ra_pages = min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && - !verify_page(inode, vi, req, page, level0_ra_pages)) - SetPageError(page); + if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } fsverity_free_hash_request(params->hash_alg, req); diff --git a/fs/xattr.c b/fs/xattr.c index 61107b6bbed2..86668d2ce268 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name) return ERR_PTR(-EOPNOTSUPP); } +/** + * may_write_xattr - check whether inode allows writing xattr + * @mnt_userns: User namespace of the mount the inode was found from + * @inode: the inode on which to set an xattr + * + * Check whether the inode allows writing xattrs. Specifically, we can never + * set or remove an extended attribute on a read-only filesystem or on an + * immutable / append-only inode. + * + * We also need to ensure that the inode has a mapping in the mount to + * not risk writing back invalid i_{g,u}id values. + * + * Return: On success zero is returned. On error a negative errno is returned. + */ +int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode) +{ + if (IS_IMMUTABLE(inode)) + return -EPERM; + if (IS_APPEND(inode)) + return -EPERM; + if (HAS_UNMAPPED_ID(mnt_userns, inode)) + return -EPERM; + return 0; +} + /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. @@ -88,20 +113,12 @@ static int xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, const char *name, int mask) { - /* - * We can never set or remove an extended attribute on a read-only - * filesystem or on an immutable / append-only inode. - */ if (mask & MAY_WRITE) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - /* - * Updating an xattr will likely cause i_uid and i_gid - * to be writen back improperly if their true value is - * unknown to the vfs. 
- */ - if (HAS_UNMAPPED_ID(mnt_userns, inode)) - return -EPERM; + int ret; + + ret = may_write_xattr(mnt_userns, inode); + if (ret) + return ret; } /* @@ -172,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -282,12 +302,6 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); -static inline bool is_posix_acl_xattr(const char *name) -{ - return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); -} - int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) @@ -399,6 +413,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, { const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -437,10 +454,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - error = __vfs_getxattr(dentry, inode, name, value, size); - if (error > 0 && is_posix_acl_xattr(name)) - posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); - return error; + return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -471,6 +485,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; + if (is_posix_acl_xattr(name)) + return -EOPNOTSUPP; + handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); @@ -580,23 +597,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -static void setxattr_convert(struct user_namespace *mnt_userns, - struct dentry *d, struct xattr_ctx *ctx) -{ - if (ctx->size && is_posix_acl_xattr(ctx->kname->name)) - posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); -} - -int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_ctx *ctx) { - setxattr_convert(mnt_userns, dentry, ctx); - return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + if (is_posix_acl_xattr(ctx->kname->name)) + return do_set_acl(idmap, dentry, ctx->kname->name, + ctx->kvalue, ctx->size); + + return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, +setxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { @@ -614,7 +627,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error) return error; - error = do_setxattr(mnt_userns, d, &ctx); + error = do_setxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -633,7 +646,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, + error = setxattr(mnt_idmap(path.mnt), path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } @@ -670,7 +683,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(file_mnt_user_ns(f.file), + error = 
setxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); @@ -683,7 +696,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, * Extended attribute GET operations */ ssize_t -do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, +do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct xattr_ctx *ctx) { ssize_t error; @@ -697,10 +710,12 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); + if (is_posix_acl_xattr(ctx->kname->name)) + error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); + else + error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname, + ctx->kvalue, ctx->size); if (error > 0) { - if (is_posix_acl_xattr(kname)) - posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { @@ -713,7 +728,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, } static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, +getxattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; @@ -732,7 +747,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; - error = do_getxattr(mnt_userns, d, &ctx); + error = do_getxattr(idmap, d, &ctx); kvfree(ctx.kvalue); return error; @@ -748,7 +763,7 @@ retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); + error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -778,7 +793,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, + error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name, value, size); fdput(f); return error; @@ -863,7 +878,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct user_namespace *mnt_userns, struct dentry *d, +removexattr(struct mnt_idmap *idmap, struct dentry *d, const char __user *name) { int error; @@ -875,7 +890,10 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d, if (error < 0) return error; - return vfs_removexattr(mnt_userns, d, kname); + if (is_posix_acl_xattr(kname)) + return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname); + + return vfs_removexattr(mnt_idmap_owner(idmap), d, kname); } static int path_removexattr(const char __user *pathname, @@ -889,7 +907,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(mnt_user_ns(path.mnt), path.dentry, name); + error = removexattr(mnt_idmap(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); @@ -922,7 +940,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(file_mnt_user_ns(f.file), + error = removexattr(file_mnt_idmap(f.file), f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } @@ -1140,7 +1158,7 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size, ssize_t 
simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); + bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index de79f5d07f65..989cf341779b 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1516,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32_max(2)) + if (get_random_u32_below(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 94db50eb706a..5118dedf9267 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32_max(2); + do_sparse = get_random_u32_below(2); #endif /* diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b744c62052b6..a05f44eb8178 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -242,12 +242,13 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; int error = 0; + struct inode *inode = d_inode(dentry); if (!acl) goto set_acl; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 263404d0bfda..dcd176149c7a 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,7 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); -extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, +extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index c6b2aabd6f18..822e6a0e9d1a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -279,7 +279,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32_max(randfactor)) + if (!randfactor || get_random_u32_below(randfactor)) return false; xfs_warn_ratelimited(mp, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..712238305bc3 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -651,6 +651,7 @@ xfs_vn_change_ok( static int xfs_setattr_nonsize( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -757,7 +758,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(mnt_userns, inode, inode->i_mode); + error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); if (error) return error; } @@ -779,6 +780,7 @@ out_dqrele: STATIC int xfs_setattr_size( struct user_namespace *mnt_userns, + struct dentry *dentry, struct xfs_inode *ip, struct iattr *iattr) { @@ -810,7 +812,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. 
*/ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(mnt_userns, ip, iattr); + return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } /* @@ -987,7 +989,7 @@ xfs_vn_setattr_size( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (error) return error; - return xfs_setattr_size(mnt_userns, ip, iattr); + return xfs_setattr_size(mnt_userns, dentry, ip, iattr); } STATIC int @@ -1019,7 +1021,7 @@ xfs_vn_setattr( error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(mnt_userns, ip, iattr); + error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr); } return error; @@ -1101,7 +1103,7 @@ xfs_vn_tmpfile( } static const struct inode_operations xfs_inode_operations = { - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1128,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1155,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .get_acl = xfs_get_acl, + .get_inode_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr,
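
The squashfs hunks above replace direct calls to squashfs_decompressor_create()/squashfs_decompress() with a per-mount ops table (msblk->thread_ops) picked at mount time. A minimal userspace C sketch of the same dispatch pattern follows; every type and stub body here is an illustrative stand-in, not the kernel API:

	#include <stdio.h>

	struct sb_info;	/* stand-in for struct squashfs_sb_info */

	/* Mirrors struct squashfs_decompressor_thread_ops from the patch. */
	struct thread_ops {
		void *(*create)(struct sb_info *sbi);
		void (*destroy)(struct sb_info *sbi);
		int (*max_decompressors)(void);
	};

	struct sb_info {
		const struct thread_ops *thread_ops;	/* chosen at mount time */
		int max_thread_num;
		void *stream;
	};

	/* "single" mode stubs, standing in for decompressor_single.c. */
	static void *single_create(struct sb_info *sbi) { (void)sbi; return "stream"; }
	static void single_destroy(struct sb_info *sbi) { sbi->stream = NULL; }
	static int single_max(void) { return 1; }

	static const struct thread_ops decompressor_single = {
		.create = single_create,
		.destroy = single_destroy,
		.max_decompressors = single_max,
	};

	int main(void)
	{
		struct sb_info sbi = { .thread_ops = &decompressor_single };

		/* As in squashfs_fill_super(): thread_num == 0 means "use the
		 * selected mode's own maximum". */
		sbi.max_thread_num = sbi.thread_ops->max_decompressors();
		sbi.stream = sbi.thread_ops->create(&sbi);
		printf("threads=%d\n", sbi.max_thread_num);
		sbi.thread_ops->destroy(&sbi);
		return 0;
	}

Keeping the function pointers const and per-superblock is what lets a CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT kernel carry all three modes and still resolve one of them per mount with no further branching on the I/O path.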
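
The new threads= parser in fs/squashfs/super.c tries the mode names first and only then falls back to a numeric thread count; the kernel additionally rejects counts above num_online_cpus() * 2 through max_decompressors(). A hedged userspace model of that order of checks (the upper-limit check is omitted, and all names are illustrative):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	enum mode { SINGLE, MULTI, PERCPU };

	/* Models squashfs_parse_param_threads(): name match first, number second. */
	static int parse_threads(const char *str, enum mode *mode, int *num)
	{
		if (!strcmp(str, "single")) { *mode = SINGLE; return 0; }
		if (!strcmp(str, "multi"))  { *mode = MULTI;  return 0; }
		if (!strcmp(str, "percpu")) { *mode = PERCPU; return 0; }

		char *end;
		unsigned long n = strtoul(str, &end, 0);
		if (*end != '\0')
			return -EINVAL;
		if (n > 1) {		/* multi mode with an explicit thread cap */
			*mode = MULTI;
			*num = (int)n;
			return 0;
		}
		if (n == 1) {		/* degenerate case: single-threaded */
			*mode = SINGLE;
			*num = 1;
			return 0;
		}
		return -EINVAL;		/* threads=0 is rejected, as in the patch */
	}

	int main(void)
	{
		enum mode m; int n = 0;
		printf("percpu -> %d\n", parse_threads("percpu", &m, &n));
		printf("4      -> %d (num=%d)\n", parse_threads("4", &m, &n), n);
		printf("0      -> %d\n", parse_threads("0", &m, &n));
		return 0;
	}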
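
Finally, the prandom_u32_max() -> get_random_u32_below() renames scattered through the ubifs and xfs hunks are mechanical: both helpers return a uniform u32 in the half-open range [0, ceil). A small userspace analogue, just to pin the interval down (rand()-based and biased for large ceilings; purely illustrative, since the kernel helper draws from the CRNG):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	/* Userspace stand-in for get_random_u32_below(ceil): uniform in [0, ceil). */
	static uint32_t random_u32_below(uint32_t ceil)
	{
		return (uint32_t)rand() % ceil;
	}

	int main(void)
	{
		srand((unsigned)time(NULL));

		/* Mirrors ubifs's chance(n, out_of): true with probability n/out_of. */
		uint32_t n = 1, out_of = 2;
		int hit = random_u32_below(out_of) + 1 <= n;

		printf("chance(1, 2) fired: %d\n", hit);
		return 0;
	}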