From ae91c92565494a37c30ce9a691c87890f800d826 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 22 Nov 2019 11:44:53 +0100 Subject: debugfs: remove return value of debugfs_create_regset32() No one checks the return value of debugfs_create_regset32(), as it's not needed, so make the return value void, so that no one tries to do so in the future. Link: https://lore.kernel.org/r/20191122104453.GA2017837@kroah.com Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 634b09d18b77..db987b5110a9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1090,21 +1090,12 @@ static const struct file_operations fops_regset32 = { * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_regset32(const char *name, umode_t mode, - struct dentry *parent, - struct debugfs_regset32 *regset) +void debugfs_create_regset32(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset) { - return debugfs_create_file(name, mode, parent, regset, &fops_regset32); + debugfs_create_file(name, mode, parent, regset, &fops_regset32); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); -- cgit v1.2.3 From 3e8cb8b2eaeb22f540f1cbc00cbb594047b7ba89 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 13 Feb 2020 09:16:07 +0100 Subject: fuse: fix stack use after return Normal, synchronous requests will have their args allocated on the stack. After the FR_FINISHED bit is set by receiving the reply from the userspace fuse server, the originating task may return and reuse the stack frame, resulting in an Oops if the args structure is dereferenced. Fix by setting a flag in the request itself upon initializing, indicating whether it has an asynchronous ->end() callback. Reported-by: Kyle Sanderson Reported-by: Michael Stapelberg Fixes: 2b319d1f6f92 ("fuse: don't dereference req->args on finished request") Cc: # v5.4 Tested-by: Michael Stapelberg Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 6 +++--- fs/fuse/fuse_i.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8e02d76fe104..97eec7522bf2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,12 +276,10 @@ static void flush_bg_queue(struct fuse_conn *fc) void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; - bool async; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; - async = req->args->end; /* * test_and_set_bit() implies smp_mb() between bit * changing and below intr_entry check. 
Pairs with @@ -324,7 +322,7 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); } - if (async) + if (test_bit(FR_ASYNC, &req->flags)) req->args->end(fc, req->args, req->out.h.error); put_request: fuse_put_request(fc, req); @@ -471,6 +469,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; req->args = args; + if (args->end) + __set_bit(FR_ASYNC, &req->flags); } ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index aa75e2305b75..ca344bf71404 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -301,6 +301,7 @@ struct fuse_io_priv { * FR_SENT: request is in userspace, waiting for an answer * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list + * FR_ASYNC: request is asynchronous */ enum fuse_req_flag { FR_ISREPLY, @@ -314,6 +315,7 @@ enum fuse_req_flag { FR_SENT, FR_FINISHED, FR_PRIVATE, + FR_ASYNC, }; /** -- cgit v1.2.3 From 1cef21842ff3b6043c459b6462183e70295b5b19 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 21 Feb 2020 15:21:38 -0500 Subject: NFS: Ensure the fs_context has the correct fs_type before mounting This is necessary because unless userspace explicitly requests fstype "nfs4" (either via "mount -t nfs4" or by calling the "mount.nfs4" helper directly), the fstype will default to "nfs". This was fine on older kernels because the super_block->s_type was set via mount_info->nfs_mod->nfs_fs, which was set when parsing the mount options and subsequently passed in the "type" argument of sget(). After commit f2aedb713c28 ("NFS: Add fs_context support."), sget_fc(), which has no "type" argument, is called instead. In sget_fc(), the super_block->s_type is set via fs_context->fs_type, which was set when the filesystem context was initially created. Reported-by: Patrick Steinhardt Fixes: f2aedb713c28 ("NFS: Add fs_context support.") Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/fs_context.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e1b938457ab9..b616263b0eb6 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1240,6 +1240,13 @@ static int nfs_fs_context_validate(struct fs_context *fc) } ctx->nfs_mod = nfs_mod; } + + /* Ensure the filesystem context has the correct fs_type */ + if (fc->fs_type != ctx->nfs_mod->nfs_fs) { + module_put(fc->fs_type->owner); + __module_get(ctx->nfs_mod->nfs_fs->owner); + fc->fs_type = ctx->nfs_mod->nfs_fs; + } return 0; out_no_device_name: -- cgit v1.2.3 From 193155c8c9429f57400daf1f2ef0075016767112 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 22 Feb 2020 23:22:19 -0700 Subject: io_uring: handle multiple personalities in link chains If we have a chain of requests and they don't all use the same credentials, then the head of the chain will be issued with the credentials of the tail of the chain. Ensure __io_queue_sqe() overrides the credentials, if they are different. Once we do that, we can clean up the creds handling as well, by only having io_submit_sqe() do the lookup of a personality. It doesn't need to assign it, since __io_queue_sqe() now always does the right thing.
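To make the save/override/revert bookkeeping concrete, here is a small userspace sketch of the same pattern (the struct and both helpers are toy stand-ins written for this note; in the kernel the real APIs are override_creds() and revert_creds() from <linux/cred.h>, and the real code additionally handles re-overriding with the originally saved creds):

  #include <stdio.h>

  /* Toy stand-ins for kernel credential objects and APIs. */
  struct cred { int uid; };

  static const struct cred *current_creds;

  static const struct cred *override_creds(const struct cred *new)
  {
      const struct cred *old = current_creds;

      current_creds = new;
      return old;
  }

  static void revert_creds(const struct cred *old)
  {
      current_creds = old;
  }

  int main(void)
  {
      struct cred root = { 0 }, user = { 1000 };
      /* per-request creds in a link chain, as a personality would set them */
      const struct cred *reqs[] = { &user, &root, &user };
      const struct cred *old_creds = NULL;
      int i;

      current_creds = &root;
      for (i = 0; i < 3; i++) {
          /* override only when the request's creds differ from the active ones */
          if (reqs[i] != current_creds) {
              if (old_creds)
                  revert_creds(old_creds);
              old_creds = override_creds(reqs[i]);
          }
          printf("issue req %d as uid %d\n", i, current_creds->uid);
      }
      if (old_creds)
          revert_creds(old_creds); /* restore the submitter's creds */
      return 0;
  }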
Fixes: 75c6a03904e0 ("io_uring: support using a registered personality for commands") Reported-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index de650df9ac53..7d0be264527d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4705,11 +4705,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; + const struct cred *old_creds = NULL; int ret; again: linked_timeout = io_prep_linked_timeout(req); + if (req->work.creds && req->work.creds != current_cred()) { + if (old_creds) + revert_creds(old_creds); + if (old_creds == req->work.creds) + old_creds = NULL; /* restored original creds */ + else + old_creds = override_creds(req->work.creds); + } + ret = io_issue_sqe(req, sqe, &nxt, true); /* @@ -4759,6 +4769,8 @@ done_req: goto punt; goto again; } + if (old_creds) + revert_creds(old_creds); } static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4803,7 +4815,6 @@ static inline void io_queue_link_head(struct io_kiocb *req) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { - const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; int ret, id; @@ -4818,14 +4829,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, id = READ_ONCE(sqe->personality); if (id) { - const struct cred *personality_creds; - - personality_creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!personality_creds)) { + req->work.creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.creds)) { ret = -EINVAL; goto err_req; } - old_creds = override_creds(personality_creds); + get_cred(req->work.creds); } /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -4837,8 +4846,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); - if (old_creds) - revert_creds(old_creds); return false; } @@ -4899,8 +4906,6 @@ err_req: } } - if (old_creds) - revert_creds(old_creds); return true; } -- cgit v1.2.3 From 41726c9a50e7464beca7112d0aebf3a0090c62d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 23 Feb 2020 13:11:42 -0700 Subject: io_uring: fix personality idr leak We somehow never free the idr, even though we init it for every ctx. Free it when the rest of the ring data is freed. 
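For context, the lifecycle this fix completes looks roughly like the kernel-side sketch below (not a standalone program; idr_init(), idr_alloc(), idr_remove() and idr_destroy() are the real <linux/idr.h> APIs, the rest is illustrative):

  struct idr personality_idr;
  const struct cred *creds;  /* looked up or created elsewhere */
  int id;

  idr_init(&personality_idr);

  /* register: stash the creds and hand back an id >= 1 */
  id = idr_alloc(&personality_idr, (void *)creds, 1, 0, GFP_KERNEL);

  /* unregister: drops one entry, but not the idr's own memory */
  idr_remove(&personality_idr, id);

  /* teardown: releases the idr's internal nodes -- the missing call */
  idr_destroy(&personality_idr);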
Fixes: 071698e13ac6 ("io_uring: allow registering credentials") Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7d0be264527d..d961945cb332 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6339,6 +6339,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); + idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { -- cgit v1.2.3 From fc513fac56e1b626ae48a74d7551d9c35c50129e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 19 Feb 2020 06:01:03 +1000 Subject: cifs: don't leak -EAGAIN for stat() during reconnect If the SMB2/QUERY_INFO call made from cifs_revalidate_dentry_attr() fails with an error such as STATUS_SESSION_EXPIRED, causing the session to be reconnected, it is possible we will leak -EAGAIN back to the application even for system calls such as stat() where this is not a valid error. Fix this by re-trying the operation from within cifs_revalidate_dentry_attr() if cifs_get_inode_info*() returns -EAGAIN. This fixes stat() and possibly also other system calls that use cifs_revalidate_dentry*(). Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky Reviewed-by: Aurelien Aptel CC: Stable --- fs/cifs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b5e6635c578e..1c6f659110d0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2073,6 +2073,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; char *full_path = NULL; + int count = 0; if (inode == NULL) return -ENOENT; @@ -2094,15 +2095,18 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) full_path, inode, inode->i_count.counter, dentry, cifs_get_time(dentry), jiffies); +again: if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); else rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); - + if (rc == -EAGAIN && count++ < 10) + goto again; out: kfree(full_path); free_xid(xid); + return rc; } -- cgit v1.2.3 From 154255233830e1e4dd0d99ac929a5dce588c0b81 Mon Sep 17 00:00:00 2001 From: "Paulo Alcantara (SUSE)" Date: Thu, 20 Feb 2020 19:49:35 -0300 Subject: cifs: fix potential mismatch of UNC paths Ensure that full_path is a UNC path that contains '\\' as delimiter, which is required by cifs_build_devname(). The build_path_from_dentry_optional_prefix() function may return a path with '/' as delimiter when using SMB1 UNIX extensions, for example.
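As a rough userspace model of what such a delimiter-conversion helper does (the in-kernel convert_delimiter() differs in detail; this only illustrates the transformation):

  #include <stdio.h>
  #include <string.h>

  /* Rewrite every occurrence of the "other" delimiter to the requested one. */
  static void convert_delimiter(char *path, char delim)
  {
      char old_delim = (delim == '/') ? '\\' : '/';
      char *pos = path;

      while ((pos = strchr(pos, old_delim)) != NULL)
          *pos++ = delim;
  }

  int main(void)
  {
      char path[] = "/srv/share/dir/file";

      convert_delimiter(path, '\\');
      printf("%s\n", path); /* prints \srv\share\dir\file */
      return 0;
  }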
Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Acked-by: Ronnie Sahlberg --- fs/cifs/cifs_dfs_ref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 606f26d862dc..cc3ada12848d 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -324,6 +324,8 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; + convert_delimiter(full_path, '\\'); + cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); if (!cifs_sb_master_tlink(cifs_sb)) { -- cgit v1.2.3 From ec57010acd03428a749d2600bf09bd537eaae993 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Feb 2020 23:59:32 -0600 Subject: cifs: add missing mount option to /proc/mounts We were not displaying the mount option "signloosely" in /proc/mounts for cifs mounts which some users found confusing recently Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 46ebaf3f0824..fa77fe5258b0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -530,6 +530,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (tcon->seal) seq_puts(s, ",seal"); + else if (tcon->ses->server->ignore_signature) + seq_puts(s, ",signloosely"); if (tcon->nocase) seq_puts(s, ",nocase"); if (tcon->local_lease) -- cgit v1.2.3 From 86f740f2aed5ea7fe1aa86dc2df0fb4ab0f71088 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Fri, 21 Feb 2020 11:19:06 +0100 Subject: cifs: fix rename() by ensuring source handle opened with DELETE bit To rename a file in SMB2 we open it with the DELETE access and do a special SetInfo on it. If the handle is missing the DELETE bit the server will fail the SetInfo with STATUS_ACCESS_DENIED. We currently try to reuse any existing opened handle we have with cifs_get_writable_path(). That function looks for handles with WRITE access but doesn't check for DELETE, making rename() fail if it finds a handle to reuse. Simple reproducer below. To select handles with the DELETE bit, this patch adds a flag argument to cifs_get_writable_path() and find_writable_file() and the existing 'bool fsuid_only' argument is converted to a flag. The cifsFileInfo struct only stores the UNIX open mode but not the original SMB access flags. Since the DELETE bit is not mapped in that mode, this patch stores the access mask in cifs_fid on file open, which is accessible from cifsFileInfo. 
Simple reproducer: #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #define E(s) perror(s), exit(1) int main(int argc, char *argv[]) { int fd, ret; if (argc != 3) { fprintf(stderr, "Usage: %s A B\n" "create&open A in write mode, " "rename A to B, close A\n", argv[0]); return 0; } fd = openat(AT_FDCWD, argv[1], O_WRONLY|O_CREAT|O_SYNC, 0666); if (fd == -1) E("openat()"); ret = rename(argv[1], argv[2]); if (ret) E("rename()"); ret = close(fd); if (ret) E("close()"); return ret; } $ gcc -o bugrename bugrename.c $ ./bugrename /mnt/a /mnt/b rename(): Permission denied Fixes: 8de9e86c67ba ("cifs: create a helper to find a writeable handle by path name") CC: Stable Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky Reviewed-by: Paulo Alcantara (SUSE) --- fs/cifs/cifsglob.h | 7 +++++++ fs/cifs/cifsproto.h | 5 +++-- fs/cifs/cifssmb.c | 3 ++- fs/cifs/file.c | 19 ++++++++++++------- fs/cifs/inode.c | 6 +++--- fs/cifs/smb1ops.c | 2 +- fs/cifs/smb2inode.c | 4 ++-- fs/cifs/smb2ops.c | 3 ++- fs/cifs/smb2pdu.c | 1 + 9 files changed, 33 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index de82cfa44b1a..0d956360e984 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1281,6 +1281,7 @@ struct cifs_fid { __u64 volatile_fid; /* volatile file id for smb2 */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ __u8 create_guid[16]; + __u32 access; struct cifs_pending_open *pending_open; unsigned int epoch; #ifdef CONFIG_CIFS_DEBUG2 @@ -1741,6 +1742,12 @@ static inline bool is_retryable_error(int error) return false; } + +/* cifs_get_writable_file() flags */ +#define FIND_WR_ANY 0 +#define FIND_WR_FSUID_ONLY 1 +#define FIND_WR_WITH_DELETE 2 + #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 #define MID_REQUEST_SUBMITTED 2 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 89eaaf46d1ca..e5cb681ec138 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -134,11 +134,12 @@ extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); -extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int); extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, - bool fsuid_only, + int flags, struct cifsFileInfo **ret_file); extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, + int flags, struct cifsFileInfo **ret_file); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3c89569e7210..6f6fb3606a5d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1492,6 +1492,7 @@ openRetry: *oplock = rsp->OplockLevel; /* cifs fid stays in le */ oparms->fid->netfid = rsp->Fid; + oparms->fid->access = desired_access; /* Let caller know file was created so we can set the mode. */ /* Do we care about the CreateAction in any other cases?
*/ @@ -2115,7 +2116,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) wdata2->tailsz = tailsz; wdata2->bytes = cur_len; - rc = cifs_get_writable_file(CIFS_I(inode), false, + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &wdata2->cfile); if (!wdata2->cfile) { cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n", diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bc9516ab4b34..3b942ecdd4be 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1958,7 +1958,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, /* Return -EBADF if no handle is found and general rc otherwise */ int -cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, +cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, struct cifsFileInfo **ret_file) { struct cifsFileInfo *open_file, *inv_file = NULL; @@ -1966,7 +1966,8 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, bool any_available = false; int rc = -EBADF; unsigned int refind = 0; - + bool fsuid_only = flags & FIND_WR_FSUID_ONLY; + bool with_delete = flags & FIND_WR_WITH_DELETE; *ret_file = NULL; /* @@ -1998,6 +1999,8 @@ refind_writable: continue; if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; + if (with_delete && !(open_file->fid.access & DELETE)) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ @@ -2045,12 +2048,12 @@ refind_writable: } struct cifsFileInfo * -find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) +find_writable_file(struct cifsInodeInfo *cifs_inode, int flags) { struct cifsFileInfo *cfile; int rc; - rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile); + rc = cifs_get_writable_file(cifs_inode, flags, &cfile); if (rc) cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc); @@ -2059,6 +2062,7 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, + int flags, struct cifsFileInfo **ret_file) { struct list_head *tmp; @@ -2085,7 +2089,7 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); - return cifs_get_writable_file(cinode, 0, ret_file); + return cifs_get_writable_file(cinode, flags, ret_file); } spin_unlock(&tcon->open_file_lock); @@ -2162,7 +2166,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (mapping->host->i_size - offset < (loff_t)to) to = (unsigned)(mapping->host->i_size - offset); - rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file); + rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY, + &open_file); if (!rc) { bytes_written = cifs_write(open_file, open_file->pid, write_data, to - from, &offset); @@ -2355,7 +2360,7 @@ retry: if (cfile) cifsFileInfo_put(cfile); - rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile); + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); /* in case of an error store it to return later */ if (rc) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1c6f659110d0..49dbf11e2c3f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2282,7 +2282,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, * writebehind data than the SMB timeout for the SetPathInfo * request would allow */ - open_file = find_writable_file(cifsInode, true); + open_file = find_writable_file(cifsInode, 
FIND_WR_FSUID_ONLY); if (open_file) { tcon = tlink_tcon(open_file->tlink); server = tcon->ses->server; @@ -2432,7 +2432,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->ctime = NO_CHANGE_64; args->device = 0; - open_file = find_writable_file(cifsInode, true); + open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); if (open_file) { u16 nfid = open_file->fid.netfid; u32 npid = open_file->pid; @@ -2535,7 +2535,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = 0; if (attrs->ia_valid & ATTR_MTIME) { - rc = cifs_get_writable_file(cifsInode, false, &wfile); + rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile); if (!rc) { tcon = tlink_tcon(wfile->tlink); rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index eb994e313c6a..b130efaf8feb 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -766,7 +766,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_tcon *tcon; /* if the file is already open for write, just use that fileid */ - open_file = find_writable_file(cinode, true); + open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY); if (open_file) { fid.netfid = open_file->fid.netfid; netpid = open_file->pid; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1cf207564ff9..a8c301ae00ed 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -521,7 +521,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); - cifs_get_writable_path(tcon, name, &cfile); + cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, @@ -577,7 +577,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, { struct cifsFileInfo *cfile; - cifs_get_writable_path(tcon, from_name, &cfile); + cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, DELETE, SMB2_OP_RENAME, cfile); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e47190cae163..c31e84ee3c39 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1364,6 +1364,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) cfile->fid.persistent_fid = fid->persistent_fid; cfile->fid.volatile_fid = fid->volatile_fid; + cfile->fid.access = fid->access; #ifdef CONFIG_CIFS_DEBUG2 cfile->fid.mid = fid->mid; #endif /* CIFS_DEBUG2 */ @@ -3327,7 +3328,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs * some servers (Windows2016) will not reflect recent writes in * QUERY_ALLOCATED_RANGES until SMB2_flush is called. 
*/ - wrcfile = find_writable_file(cifsi, false); + wrcfile = find_writable_file(cifsi, FIND_WR_ANY); if (wrcfile) { filemap_write_and_wait(inode->i_mapping); smb2_flush_file(xid, tcon, &wrcfile->fid); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1234f9ccab03..28c0be5e69b7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2771,6 +2771,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, atomic_inc(&tcon->num_remote_opens); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = rsp->VolatileFileId; + oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId); #endif /* CIFS_DEBUG2 */ -- cgit v1.2.3 From fb4b5f13464c468a9c10ae1ab8ba9aa352d0256a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 21 Feb 2020 05:20:45 -0800 Subject: cifs: Use #define in cifs_dbg All other uses of cifs_dbg use defines so change this one. Signed-off-by: Joe Perches Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 49dbf11e2c3f..1e8a4b1579db 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -653,8 +653,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, */ if ((fattr->cf_nlink < 1) && !tcon->unix_ext && !info->DeletePending) { - cifs_dbg(1, "bogus file nlink value %u\n", - fattr->cf_nlink); + cifs_dbg(VFS, "bogus file nlink value %u\n", + fattr->cf_nlink); fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; } } -- cgit v1.2.3 From bdcd3eab2a9ae0ac93f27275b6895dd95e5bf360 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 25 Feb 2020 22:12:08 +0800 Subject: io_uring: fix poll_list race for SETUP_IOPOLL|SETUP_SQPOLL After making ext4 support the iopoll method (setting ext4_file_operations's iopoll method to iomap_dio_iopoll()), we found that fio can easily hang in fio_ioring_getevents() with the fio job below: rm -f testfile; sync; sudo fio -name=fiotest -filename=testfile -iodepth=128 -thread -rw=write -ioengine=io_uring -hipri=1 -sqthread_poll=1 -direct=1 -bs=4k -size=10G -numjobs=8 -runtime=2000 -group_reporting with IORING_SETUP_SQPOLL and IORING_SETUP_IOPOLL enabled. Two issues result in this hang. One is that when IORING_SETUP_SQPOLL and IORING_SETUP_IOPOLL are enabled, fio does not use io_uring_enter to get completed events; it relies on the kernel io_sq_thread to poll for completed events. The other is a race: when io_submit_sqes() in io_sq_thread() submits a batch of sqes, the variable 'inflight' records the number of submitted reqs, and io_sq_thread then polls for reqs which have been added to poll_list. But note: if some previous reqs have been punted to an io worker, those reqs won't show up in poll_list in time. io_sq_thread() will only poll for a portion of the previously submitted reqs, then find poll_list empty and reset the 'inflight' variable to zero. If the app just waits on these deferred reqs and does not wake up io_sq_thread again, the hang happens. For an app that relies entirely on io_sq_thread to poll completed requests, let io_iopoll_req_issued() wake up io_sq_thread properly when adding a new element to poll_list, and when io_sq_thread prepares to sleep, check whether poll_list is empty again; if it is not empty, continue to poll.
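The fix follows the classic "re-check the condition after preparing to sleep" idiom. A userspace analogue with pthreads (illustrative names only; the kernel side uses prepare_to_wait()/finish_wait() and wq_has_sleeper() instead of a condition variable):

  #include <pthread.h>
  #include <stdbool.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static bool poll_list_empty = true; /* stand-in for list_empty(&ctx->poll_list) */

  /* Consumer (io_sq_thread role): re-checking the predicate under the lock
   * after deciding to sleep closes the window where work is queued right
   * before we block -- the lost-wakeup race described above. */
  static void wait_for_work(void)
  {
      pthread_mutex_lock(&lock);
      while (poll_list_empty)
          pthread_cond_wait(&cond, &lock);
      poll_list_empty = true; /* consume the work */
      pthread_mutex_unlock(&lock);
  }

  /* Producer (io_iopoll_req_issued() role): queue work, then wake the poller. */
  static void *producer(void *arg)
  {
      (void)arg;
      pthread_mutex_lock(&lock);
      poll_list_empty = false;
      pthread_cond_signal(&cond);
      pthread_mutex_unlock(&lock);
      return NULL;
  }

  int main(void)
  {
      pthread_t t;

      pthread_create(&t, NULL, producer, NULL);
      wait_for_work();
      pthread_join(t, NULL);
      return 0;
  }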
Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 59 +++++++++++++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d961945cb332..ffd9bfa84d86 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1821,6 +1821,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add(&req->list, &ctx->poll_list); else list_add_tail(&req->list, &ctx->poll_list); + + if ((ctx->flags & IORING_SETUP_SQPOLL) && + wq_has_sleeper(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); } static void io_file_put(struct io_submit_state *state) @@ -5086,9 +5090,8 @@ static int io_sq_thread(void *data) const struct cred *old_cred; mm_segment_t old_fs; DEFINE_WAIT(wait); - unsigned inflight; unsigned long timeout; - int ret; + int ret = 0; complete(&ctx->completions[1]); @@ -5096,39 +5099,19 @@ static int io_sq_thread(void *data) set_fs(USER_DS); old_cred = override_creds(ctx->creds); - ret = timeout = inflight = 0; + timeout = jiffies + ctx->sq_thread_idle; while (!kthread_should_park()) { unsigned int to_submit; - if (inflight) { + if (!list_empty(&ctx->poll_list)) { unsigned nr_events = 0; - if (ctx->flags & IORING_SETUP_IOPOLL) { - /* - * inflight is the count of the maximum possible - * entries we submitted, but it can be smaller - * if we dropped some of them. If we don't have - * poll entries available, then we know that we - * have nothing left to poll for. Reset the - * inflight count to zero in that case. - */ - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); - else - inflight = 0; - mutex_unlock(&ctx->uring_lock); - } else { - /* - * Normal IO, just pretend everything completed. - * We don't have to poll completions for that. - */ - nr_events = inflight; - } - - inflight -= nr_events; - if (!inflight) + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + io_iopoll_getevents(ctx, &nr_events, 0); + else timeout = jiffies + ctx->sq_thread_idle; + mutex_unlock(&ctx->uring_lock); } to_submit = io_sqring_entries(ctx); @@ -5157,7 +5140,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (inflight || + if (!list_empty(&ctx->poll_list) || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { cond_resched(); @@ -5167,6 +5150,19 @@ static int io_sq_thread(void *data) prepare_to_wait(&ctx->sqo_wait, &wait, TASK_INTERRUPTIBLE); + /* + * While doing polled IO, before going to sleep, we need + * to check if there are new reqs added to poll_list, it + * is because reqs may have been punted to io worker and + * will be added to poll_list later, hence check the + * poll_list again. 
+ */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->poll_list)) { + finish_wait(&ctx->sqo_wait, &wait); + continue; + } + /* Tell userspace we may need a wakeup call */ ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; /* make sure to read SQ tail after writing flags */ @@ -5194,8 +5190,7 @@ static int io_sq_thread(void *data) mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); - if (ret > 0) - inflight += ret; + timeout = jiffies + ctx->sq_thread_idle; } set_fs(old_fs); -- cgit v1.2.3 From 3030fd4cb783377eca0e2a3eee63724a5c66ee15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Feb 2020 08:47:30 -0700 Subject: io-wq: remove spin-for-work optimization Andres reports that buffered IO seems to suck up more cycles than we would like, and he narrowed it down to the fact that the io-wq workers will briefly spin for more work on completion of a work item. This was a win on the networking side, but apparently some other cases take a hit because of it. Remove the optimization to avoid burning more CPU than we have to for disk IO. Reported-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io-wq.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 0a5ab1a8f69a..bf8ed1b0b90a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -535,42 +535,23 @@ next: } while (1); } -static inline void io_worker_spin_for_work(struct io_wqe *wqe) -{ - int i = 0; - - while (++i < 1000) { - if (io_wqe_run_queue(wqe)) - break; - if (need_resched()) - break; - cpu_relax(); - } -} - static int io_wqe_worker(void *data) { struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - bool did_work; io_worker_start(wqe, worker); - did_work = false; while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - if (did_work) - io_worker_spin_for_work(wqe); spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); - did_work = true; goto loop; } - did_work = false; /* drops the lock on success, retry */ if (__io_worker_idle(wqe, worker)) { __release(&wqe->lock); -- cgit v1.2.3 From 1821b26a1fed8fca57a96ef87bac7a6a48e78815 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 20 Feb 2020 08:06:20 -0500 Subject: NFS: Don't hard-code the fs_type when submounting Hard-coding the fstype causes "nfs4" mounts to appear as "nfs", which breaks scripts that do "umount -at nfs4". Reported-by: Patrick Steinhardt Fixes: f2aedb713c28 ("NFS: Add fs_context support.") Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index ad6077404947..f3ece8ed3203 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -153,7 +153,7 @@ struct vfsmount *nfs_d_automount(struct path *path) /* Open a new filesystem context, transferring parameters from the * parent superblock, including the network namespace. 
*/ - fc = fs_context_for_submount(&nfs_fs_type, path->dentry); + fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); if (IS_ERR(fc)) return ERR_CAST(fc); -- cgit v1.2.3 From 75a9b9176157f3095d3099adf512b5a233addbc7 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 25 Feb 2020 11:05:22 -0500 Subject: NFS: Fix leak of ctx->nfs_server.hostname If userspace passes an nfs_mount_data struct in the data argument of mount(2), then nfs23_parse_monolithic() or nfs4_parse_monolithic() will allocate memory for ctx->nfs_server.hostname. This needs to be freed in nfs_parse_source(), which also allocates memory for ctx->nfs_server.hostname, otherwise a leak will occur. Reported-by: syzbot+193c375dcddb4f345091@syzkaller.appspotmail.com Fixes: f2aedb713c28 ("NFS: Add fs_context support.") Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/fs_context.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index b616263b0eb6..e113fcb4bb4c 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -832,6 +832,8 @@ static int nfs_parse_source(struct fs_context *fc, if (len > maxnamlen) goto out_hostname; + kfree(ctx->nfs_server.hostname); + /* N.B. caller will free nfs_server.hostname in all cases */ ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL); if (!ctx->nfs_server.hostname) -- cgit v1.2.3 From 55dee1bc0d72877b99805e42e0205087e98b9edd Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Mon, 24 Feb 2020 16:29:32 -0500 Subject: nfs: add minor version to nfs_server_key for fscache An NFS client that mounts multiple exports from the same NFS server with higher NFSv4 versions disabled (i.e. 4.2) and without forcing a specific NFS version results in fscache index cookie collisions and the following messages: [ 570.004348] FS-Cache: Duplicate cookie detected Each nfs_client structure should have its own fscache index cookie, so add the minorversion to nfs_server_key. 
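A small userspace illustration of the collision (layout simplified from struct nfs_server_key in fs/nfs/fscache.c; the kernel uses __be16 for the port and also appends the server address to the key):

  #include <stdio.h>
  #include <string.h>
  #include <stdint.h>

  struct server_key {
      uint16_t nfsversion;
      uint32_t minorversion; /* the field this patch adds */
      uint16_t family;
      uint16_t port;
  };

  int main(void)
  {
      struct server_key a, b;

      memset(&a, 0, sizeof(a));
      memset(&b, 0, sizeof(b));
      a.nfsversion = b.nfsversion = 4;
      a.family = b.family = 2; /* AF_INET */
      a.port = b.port = 2049;
      a.minorversion = 1;
      b.minorversion = 2; /* e.g. one vers=4.1 and one vers=4.2 mount */

      /* Without the minorversion field these two keys are byte-identical,
       * which is exactly the "Duplicate cookie detected" case. */
      printf("keys %s\n", memcmp(&a, &b, sizeof(a)) ? "differ" : "collide");
      return 0;
  }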
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200145 Signed-off-by: Scott Mayhew Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + fs/nfs/fscache.c | 2 ++ fs/nfs/nfs4client.c | 1 - 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 989c30c98511..f1ff3076e4a4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -153,6 +153,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; + clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!try_module_get(clp->cl_nfs_mod->owner)) goto error_dealloc; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 52270bfac120..1abf126c2df4 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -31,6 +31,7 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock); struct nfs_server_key { struct { uint16_t nfsversion; /* NFS protocol version */ + uint32_t minorversion; /* NFSv4 minor version */ uint16_t family; /* address family */ __be16 port; /* IP port */ } hdr; @@ -55,6 +56,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp) memset(&key, 0, sizeof(key)); key.hdr.nfsversion = clp->rpc_ops->version; + key.hdr.minorversion = clp->cl_minorversion; key.hdr.family = clp->cl_addr.ss_family; switch (clp->cl_addr.ss_family) { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 0cd767e5c977..0bd77cc1f639 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -216,7 +216,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_ds_clients); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; - clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; #if IS_ENABLED(CONFIG_NFS_V4_1) -- cgit v1.2.3 From 2d141dd2caa78fbaf87b57c27769bdc14975ab3d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Feb 2020 11:52:56 -0700 Subject: io-wq: ensure work->task_pid is cleared on init We use ->task_pid for exit cancellation, but we need to ensure it's cleared to zero for io_req_work_grab_env() to do the right thing. Take a suggestion from Bart and clear the whole thing, just setting the function passed in. This makes it more future proof as well. Fixes: 36282881a795 ("io-wq: add io_wq_cancel_pid() to cancel based on a specific pid") Signed-off-by: Jens Axboe --- fs/io-wq.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.h b/fs/io-wq.h index ccc7d84af57d..33baba4370c5 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -79,16 +79,10 @@ struct io_wq_work { pid_t task_pid; }; -#define INIT_IO_WORK(work, _func) \ - do { \ - (work)->list.next = NULL; \ - (work)->func = _func; \ - (work)->files = NULL; \ - (work)->mm = NULL; \ - (work)->creds = NULL; \ - (work)->fs = NULL; \ - (work)->flags = 0; \ - } while (0) \ +#define INIT_IO_WORK(work, _func) \ + do { \ + *(work) = (struct io_wq_work){ .func = _func }; \ + } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); -- cgit v1.2.3 From 7c69eb84d98a28c428f902318c20c53cf29c9084 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Feb 2020 06:37:23 -0800 Subject: zonefs: fix IOCB_NOWAIT handling IOCB_NOWAIT can't just be ignored as it breaks applications expecting it not to block. 
Just refuse the operation as applications must handle that (e.g. by falling back to a thread pool). Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system") Signed-off-by: Christoph Hellwig Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8bc6ef82d693..69aee3dfb660 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -601,13 +601,13 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; /* - * For async direct IOs to sequential zone files, ignore IOCB_NOWAIT + * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned). */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) - && (iocb->ki_flags & IOCB_NOWAIT)) - iocb->ki_flags &= ~IOCB_NOWAIT; + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) && + (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) -- cgit v1.2.3 From 0dda2ddb7ded1395189e95d43106469687c07795 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 25 Feb 2020 22:03:33 +0100 Subject: zonefs: select FS_IOMAP Zonefs makes use of iomap internally, so it should also select iomap in Kconfig. Signed-off-by: Johannes Thumshirn Signed-off-by: Damien Le Moal --- fs/zonefs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig index fb87ad372e29..ef2697b78820 100644 --- a/fs/zonefs/Kconfig +++ b/fs/zonefs/Kconfig @@ -2,6 +2,7 @@ config ZONEFS_FS tristate "zonefs filesystem support" depends on BLOCK depends on BLK_DEV_ZONED + select FS_IOMAP help zonefs is a simple file system which exposes zones of a zoned block device (e.g. host-managed or host-aware SMR disk drives) as files. -- cgit v1.2.3 From 2a44f46781617c5040372b59da33553a02b1f46d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Feb 2020 13:25:41 -0700 Subject: io_uring: pick up link work on submit reference drop If work completes inline, then we should pick up a dependent link item in __io_queue_sqe() as well. If we don't do so, we're forced to go async with that item, which is suboptimal. This also fixes an issue with io_put_req_find_next(), which always looks up the next work item. That should only be done if we're dropping the last reference to the request, to prevent multiple lookups of the same work item. Outside of being a fix, this also enables a good cleanup series for 5.7, where we never have to pass 'nxt' around or into the work handlers. 
Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ffd9bfa84d86..f79ca494bb56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1483,10 +1483,10 @@ static void io_free_req(struct io_kiocb *req) __attribute__((nonnull)) static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { - io_req_find_next(req, nxtptr); - - if (refcount_dec_and_test(&req->refs)) + if (refcount_dec_and_test(&req->refs)) { + io_req_find_next(req, nxtptr); __io_free_req(req); + } } static void io_put_req(struct io_kiocb *req) @@ -4749,7 +4749,7 @@ punt: err: /* drop submission reference */ - io_put_req(req); + io_put_req_find_next(req, &nxt); if (linked_timeout) { if (!ret) -- cgit v1.2.3 From 3a9015988b3d41027cda61f4fdeaaeee73be8b24 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Feb 2020 17:48:55 -0700 Subject: io_uring: import_single_range() returns 0/-ERROR Unlike the other core import helpers, import_single_range() returns 0 on success, not the length imported. This means that links that depend on the result of non-vec based IORING_OP_{READ,WRITE} that were added for 5.5 get errored when they should not be. Fixes: 3a6820f2bb8a ("io_uring: add non-vectored read/write commands") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f79ca494bb56..36917c0101fd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2075,7 +2075,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; - return ret; + return ret < 0 ? 
ret : sqe_len; } if (req->io) { -- cgit v1.2.3 From dd3db2a34cff14e152da7c8e320297719a35abf9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Feb 2020 10:23:43 -0700 Subject: io_uring: drop file set ref put/get on switch Dan reports that he triggered a warning on ring exit doing some testing: percpu ref (io_file_data_ref_zero) <= 0 (0) after switching to atomic WARNING: CPU: 3 PID: 0 at lib/percpu-refcount.c:160 percpu_ref_switch_to_atomic_rcu+0xe8/0xf0 Modules linked in: CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-rc3+ #5648 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:percpu_ref_switch_to_atomic_rcu+0xe8/0xf0 Code: e7 ff 55 e8 eb d2 80 3d bd 02 d2 00 00 75 8b 48 8b 55 d8 48 c7 c7 e8 70 e6 81 c6 05 a9 02 d2 00 01 48 8b 75 e8 e8 3a d0 c5 ff <0f> 0b e9 69 ff ff ff 90 55 48 89 fd 53 48 89 f3 48 83 ec 28 48 83 RSP: 0018:ffffc90000110ef8 EFLAGS: 00010292 RAX: 0000000000000045 RBX: 7fffffffffffffff RCX: 0000000000000000 RDX: 0000000000000045 RSI: ffffffff825be7a5 RDI: ffffffff825bc32c RBP: ffff8881b75eac38 R08: 000000042364b941 R09: 0000000000000045 R10: ffffffff825beb40 R11: ffffffff825be78a R12: 0000607e46005aa0 R13: ffff888107dcdd00 R14: 0000000000000000 R15: 0000000000000009 FS: 0000000000000000(0000) GS:ffff8881b9d80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f49e6a5ea20 CR3: 00000001b747c004 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rcu_core+0x1e4/0x4d0 __do_softirq+0xdb/0x2f1 irq_exit+0xa0/0xb0 smp_apic_timer_interrupt+0x60/0x140 apic_timer_interrupt+0xf/0x20 RIP: 0010:default_idle+0x23/0x170 Code: ff eb ab cc cc cc cc 0f 1f 44 00 00 41 54 55 53 65 8b 2d 10 96 92 7e 0f 1f 44 00 00 e9 07 00 00 00 0f 00 2d 21 d0 51 00 fb f4 <65> 8b 2d f6 95 92 7e 0f 1f 44 00 00 5b 5d 41 5c c3 65 8b 05 e5 95 Turns out that this is due to percpu_ref_switch_to_atomic() only grabbing a reference to the percpu refcount if it's not already in atomic mode. io_uring drops a ref and re-gets it when switching back to percpu mode. We attempt to protect against this with the FFD_F_ATOMIC bit, but that isn't reliable. We don't actually need to juggle these refcounts between atomic and percpu switch, we can just do them when we've switched to atomic mode. This removes the need for FFD_F_ATOMIC, which wasn't reliable. 
Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update") Reported-by: Dan Melnic Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 36917c0101fd..e412a1761d93 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -183,17 +183,12 @@ struct fixed_file_table { struct file **files; }; -enum { - FFD_F_ATOMIC, -}; - struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; struct percpu_ref refs; struct llist_head put_llist; - unsigned long state; struct work_struct ref_work; struct completion done; }; @@ -5595,7 +5590,6 @@ static void io_ring_file_ref_switch(struct work_struct *work) data = container_of(work, struct fixed_file_data, ref_work); io_ring_file_ref_flush(data); - percpu_ref_get(&data->refs); percpu_ref_switch_to_percpu(&data->refs); } @@ -5771,8 +5765,13 @@ static void io_atomic_switch(struct percpu_ref *ref) { struct fixed_file_data *data; + /* + * Juggle reference to ensure we hit zero, if needed, so we can + * switch back to percpu mode + */ data = container_of(ref, struct fixed_file_data, refs); - clear_bit(FFD_F_ATOMIC, &data->state); + percpu_ref_put(&data->refs); + percpu_ref_get(&data->refs); } static bool io_queue_file_removal(struct fixed_file_data *data, @@ -5795,11 +5794,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data, llist_add(&pfile->llist, &data->put_llist); if (pfile == &pfile_stack) { - if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { - percpu_ref_put(&data->refs); - percpu_ref_switch_to_atomic(&data->refs, - io_atomic_switch); - } + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); wait_for_completion(&done); flush_work(&data->ref_work); return false; @@ -5873,10 +5868,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, up->offset++; } - if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { - percpu_ref_put(&data->refs); + if (ref_switch) percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); - } return done ? done : err; } -- cgit v1.2.3 From bebdb65e077267957f48e43d205d4a16cc7b8161 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 26 Feb 2020 18:38:32 +0100 Subject: io_uring: define and set show_fdinfo only if procfs is enabled Follow the pattern used with other *_show_fdinfo functions and only define and use io_uring_show_fdinfo and its helper functions if CONFIG_PROC_FS is set. Fixes: 87ce955b24c9 ("io_uring: add ->show_fdinfo() for the io_uring file descriptor") Signed-off-by: Tobias Klauser Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e412a1761d93..05eea06f5421 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6641,6 +6641,7 @@ out_fput: return submitted ? 
submitted : ret; } +#ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { const struct cred *cred = p; @@ -6714,6 +6715,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) percpu_ref_put(&ctx->refs); } } +#endif static const struct file_operations io_uring_fops = { .release = io_uring_release, @@ -6725,7 +6727,9 @@ static const struct file_operations io_uring_fops = { #endif .poll = io_uring_poll, .fasync = io_uring_fasync, +#ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, +#endif }; static int io_allocate_scq_urings(struct io_ring_ctx *ctx, -- cgit v1.2.3 From d876836204897b6d7d911f942084f69a1e9d5c4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Feb 2020 14:17:49 -0700 Subject: io_uring: fix 32-bit compatibility with sendmsg/recvmsg We must set MSG_CMSG_COMPAT if we're in compatibility mode, otherwise the iovec import for these commands will not do the right thing and fail the command with -EINVAL. Found by running the test suite compiled as 32-bit. Cc: stable@vger.kernel.org Fixes: aa1fa28fc73e ("io_uring: add support for recvmsg()") Fixes: 0fa03c624d8f ("io_uring: add support for sendmsg()") Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 05eea06f5421..6a595c13e108 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3001,6 +3001,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + if (!io || req->opcode == IORING_OP_SEND) return 0; /* iovec is already imported */ @@ -3153,6 +3158,11 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + if (!io || req->opcode == IORING_OP_RECV) return 0; /* iovec is already imported */ -- cgit v1.2.3 From 6c5d911249290f41f7b50b43344a7520605b1acb Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 21 Feb 2020 23:31:11 -0500 Subject: jbd2: fix data races at struct journal_head journal_head::b_transaction and journal_head::b_next_transaction could be accessed concurrently as noticed by KCSAN, LTP: starting fsync04 /dev/zero: Can't open blockdev EXT4-fs (loop0): mounting ext3 file system using the ext4 subsystem EXT4-fs (loop0): mounted filesystem with ordered data mode.
Opts: (null) ================================================================== BUG: KCSAN: data-race in __jbd2_journal_refile_buffer [jbd2] / jbd2_write_access_granted [jbd2] write to 0xffff99f9b1bd0e30 of 8 bytes by task 25721 on cpu 70: __jbd2_journal_refile_buffer+0xdd/0x210 [jbd2] __jbd2_journal_refile_buffer at fs/jbd2/transaction.c:2569 jbd2_journal_commit_transaction+0x2d15/0x3f20 [jbd2] (inlined by) jbd2_journal_commit_transaction at fs/jbd2/commit.c:1034 kjournald2+0x13b/0x450 [jbd2] kthread+0x1cd/0x1f0 ret_from_fork+0x27/0x50 read to 0xffff99f9b1bd0e30 of 8 bytes by task 25724 on cpu 68: jbd2_write_access_granted+0x1b2/0x250 [jbd2] jbd2_write_access_granted at fs/jbd2/transaction.c:1155 jbd2_journal_get_write_access+0x2c/0x60 [jbd2] __ext4_journal_get_write_access+0x50/0x90 [ext4] ext4_mb_mark_diskspace_used+0x158/0x620 [ext4] ext4_mb_new_blocks+0x54f/0xca0 [ext4] ext4_ind_map_blocks+0xc79/0x1b40 [ext4] ext4_map_blocks+0x3b4/0x950 [ext4] _ext4_get_block+0xfc/0x270 [ext4] ext4_get_block+0x3b/0x50 [ext4] __block_write_begin_int+0x22e/0xae0 __block_write_begin+0x39/0x50 ext4_write_begin+0x388/0xb50 [ext4] generic_perform_write+0x15d/0x290 ext4_buffered_write_iter+0x11f/0x210 [ext4] ext4_file_write_iter+0xce/0x9e0 [ext4] new_sync_write+0x29c/0x3b0 __vfs_write+0x92/0xa0 vfs_write+0x103/0x260 ksys_write+0x9d/0x130 __x64_sys_write+0x4c/0x60 do_syscall_64+0x91/0xb05 entry_SYSCALL_64_after_hwframe+0x49/0xbe 5 locks held by fsync04/25724: #0: ffff99f9911093f8 (sb_writers#13){.+.+}, at: vfs_write+0x21c/0x260 #1: ffff99f9db4c0348 (&sb->s_type->i_mutex_key#15){+.+.}, at: ext4_buffered_write_iter+0x65/0x210 [ext4] #2: ffff99f5e7dfcf58 (jbd2_handle){++++}, at: start_this_handle+0x1c1/0x9d0 [jbd2] #3: ffff99f9db4c0168 (&ei->i_data_sem){++++}, at: ext4_map_blocks+0x176/0x950 [ext4] #4: ffffffff99086b40 (rcu_read_lock){....}, at: jbd2_write_access_granted+0x4e/0x250 [jbd2] irq event stamp: 1407125 hardirqs last enabled at (1407125): [] __find_get_block+0x107/0x790 hardirqs last disabled at (1407124): [] __find_get_block+0x49/0x790 softirqs last enabled at (1405528): [] __do_softirq+0x34c/0x57c softirqs last disabled at (1405521): [] irq_exit+0xa2/0xc0 Reported by Kernel Concurrency Sanitizer on: CPU: 68 PID: 25724 Comm: fsync04 Tainted: G L 5.6.0-rc2-next-20200221+ #7 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 The plain reads are outside of jh->b_state_lock critical section which result in data races. Fix them by adding pairs of READ|WRITE_ONCE(). 
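For readers unfamiliar with the annotations, a minimal userspace model of what READ_ONCE()/WRITE_ONCE() buy (the kernel macros are more elaborate, but the essence is a single volatile access that keeps the compiler from tearing, fusing or re-loading the value; typeof is a gcc/clang extension):

  #include <stdio.h>

  #define READ_ONCE(x) (*(const volatile typeof(x) *)&(x))
  #define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&(x) = (v))

  static long b_transaction;

  int main(void)
  {
      /* writer side (in jbd2: done under jh->b_state_lock) */
      WRITE_ONCE(b_transaction, 42);

      /* lockless reader side: exactly one untorn load, never re-read */
      long seen = READ_ONCE(b_transaction);

      printf("%ld\n", seen);
      return 0;
  }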
Reviewed-by: Jan Kara Signed-off-by: Qian Cai Link: https://lore.kernel.org/r/20200222043111.2227-1-cai@lca.pw Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d181948c0390..3dccc23cf010 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1150,8 +1150,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, /* For undo access buffer must have data copied */ if (undo && !jh->b_committed_data) goto out; - if (jh->b_transaction != handle->h_transaction && - jh->b_next_transaction != handle->h_transaction) + if (READ_ONCE(jh->b_transaction) != handle->h_transaction && + READ_ONCE(jh->b_next_transaction) != handle->h_transaction) goto out; /* * There are two reasons for the barrier here: @@ -2569,8 +2569,8 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh) * our jh reference and thus __jbd2_journal_file_buffer() must not * take a new one. */ - jh->b_transaction = jh->b_next_transaction; - jh->b_next_transaction = NULL; + WRITE_ONCE(jh->b_transaction, jh->b_next_transaction); + WRITE_ONCE(jh->b_next_transaction, NULL); if (buffer_freed(bh)) jlist = BJ_Forget; else if (jh->b_modified) -- cgit v1.2.3 From 37b0b6b8b99c0e1c1f11abbe7cf49b6d03795b3f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Feb 2020 12:22:56 +0300 Subject: ext4: potential crash on allocation error in ext4_alloc_flex_bg_array() If sbi->s_flex_groups_allocated is zero and the first allocation fails then this code will crash. The problem is that "i--" will set "i" to -1 but when we compare "i >= sbi->s_flex_groups_allocated" then the -1 is type promoted to unsigned and becomes UINT_MAX. Since UINT_MAX is more than zero, the condition is true so we call kvfree(new_groups[-1]). The loop will carry on freeing invalid memory until it crashes. Fixes: 7c990728b99e ("ext4: fix potential race between s_flex_groups online resizing and access") Reviewed-by: Suraj Jitindar Singh Signed-off-by: Dan Carpenter Cc: stable@kernel.org Link: https://lore.kernel.org/r/20200228092142.7irbc44yaz3by7nb@kili.mountain Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ff1b764b0c0e..0c7c4adb664e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2391,7 +2391,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; - int size, i; + int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; @@ -2412,8 +2412,8 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { - for (i--; i >= sbi->s_flex_groups_allocated; i--) - kvfree(new_groups[i]); + for (j = sbi->s_flex_groups_allocated; j < i; j++) + kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); -- cgit v1.2.3 From fc04c39bae01a607454f7619665309870c60937a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 1 Mar 2020 19:18:19 +0300 Subject: io-wq: fix IO_WQ_WORK_NO_CANCEL cancellation To cancel a work, io-wq sets IO_WQ_WORK_CANCEL and executes the callback. However, IO_WQ_WORK_NO_CANCEL works will just execute and may return next work, which will be ignored and lost. Cancel the whole link. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index bf8ed1b0b90a..9a7aacc96d84 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -747,6 +747,17 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, return true; } +static void io_run_cancel(struct io_wq_work *work) +{ + do { + struct io_wq_work *old_work = work; + + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + work = (work == old_work) ? NULL : work; + } while (work); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -760,8 +771,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * It's close enough to not be an issue, fork() has the same delay. */ if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + io_run_cancel(work); return; } @@ -900,8 +910,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, spin_unlock_irqrestore(&wqe->lock, flags); if (found) { - work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + io_run_cancel(work); return IO_WQ_CANCEL_OK; } @@ -976,8 +985,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, spin_unlock_irqrestore(&wqe->lock, flags); if (found) { - work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + io_run_cancel(work); return IO_WQ_CANCEL_OK; } -- cgit v1.2.3 From 80ad894382bf1d73eb688c29714fa10c0afcf2e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 2 Mar 2020 23:46:10 +0300 Subject: io-wq: remove io_wq_flush and IO_WQ_WORK_INTERNAL io_wq_flush() is buggy, during cancelation of a flush, the associated work may be passed to the caller's (i.e. io_uring) @match callback. That callback is expecting it to be embedded in struct io_kiocb. Cancelation of internal work probably doesn't make a lot of sense to begin with. As the flush helper is no longer used, just delete it and the associated work flag. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 38 +------------------------------------- fs/io-wq.h | 2 -- 2 files changed, 1 insertion(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 9a7aacc96d84..5cef075c0b37 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -502,7 +502,7 @@ next: if (worker->mm) work->flags |= IO_WQ_WORK_HAS_MM; - if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) { + if (wq->get_work) { put_work = work; wq->get_work(work); } @@ -1057,42 +1057,6 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) return ret; } -struct io_wq_flush_data { - struct io_wq_work work; - struct completion done; -}; - -static void io_wq_flush_func(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_wq_flush_data *data; - - data = container_of(work, struct io_wq_flush_data, work); - complete(&data->done); -} - -/* - * Doesn't wait for previously queued work to finish. When this completes, - * it just means that previously queued work was started. 
- */ -void io_wq_flush(struct io_wq *wq) -{ - struct io_wq_flush_data data; - int node; - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - if (!node_online(node)) - continue; - init_completion(&data.done); - INIT_IO_WORK(&data.work, io_wq_flush_func); - data.work.flags |= IO_WQ_WORK_INTERNAL; - io_wqe_enqueue(wqe, &data.work); - wait_for_completion(&data.done); - } -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index 33baba4370c5..e5e15f2c93ec 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -8,7 +8,6 @@ enum { IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_INTERNAL = 64, IO_WQ_WORK_CB = 128, IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_CONCURRENT = 512, @@ -100,7 +99,6 @@ void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); -void io_wq_flush(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); -- cgit v1.2.3 From e7a04894c766daa4248cb736efee93550f2d5872 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Mar 2020 14:02:49 -0800 Subject: btrfs: fix RAID direct I/O reads with alternate csums btrfs_lookup_and_bind_dio_csum() does pointer arithmetic which assumes 32-bit checksums. If using a larger checksum, this leads to spurious failures when a direct I/O read crosses a stripe. This is easy to reproduce: # mkfs.btrfs -f --checksum blake2 -d raid0 /dev/vdc /dev/vdd ... # mount /dev/vdc /mnt # cd /mnt # dd if=/dev/urandom of=foo bs=1M count=1 status=none # dd if=foo of=/dev/null bs=1M iflag=direct status=none dd: error reading 'foo': Input/output error # dmesg | tail -1 [ 135.821568] BTRFS warning (device vdc): csum failed root 5 ino 257 off 421888 ... Fix it by using the actual checksum size. Fixes: 1e25a2e3ca0d ("btrfs: don't assume ordered sums to be 4 bytes") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ccb3f8d528d..27076ebadb36 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7783,6 +7783,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + u16 csum_size; blk_status_t ret; /* @@ -7802,7 +7803,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, file_offset -= dip->logical_offset; file_offset >>= inode->i_sb->s_blocksize_bits; - io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy); + io_bio->csum = orig_io_bio->csum + csum_size * file_offset; return 0; } -- cgit v1.2.3 From 0a68ff5e2e7cf2263674b7f0418b31e10b2a497f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:22:43 -0800 Subject: fcntl: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). 
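A minimal standalone reduction of the problem class (hypothetical values, not the fcntl.c code):

#include <stdio.h>

int main(void)
{
	int signum = 1;

	switch (signum) {
		int si = 42;	/* in scope below, but the initializer
				 * is not reachable from any case label,
				 * so it never executes */
	default:
		/* si is indeterminate here -- reading it is undefined
		 * behaviour, which is the bug class being defended
		 * against */
		printf("si = %d\n", si);
	}
	return 0;
}

GCC's -Wswitch-unreachable produces the same diagnostic for this as the one quoted in the commit message below.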
With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain.

To avoid these problems, move such variables into the "case" where they're used, or lift them up into the main function body.

fs/fcntl.c: In function ‘send_sigio_to_task’:
fs/fcntl.c:738:20: warning: statement will never be executed [-Wswitch-unreachable]
  738 | kernel_siginfo_t si;
      | ^~

[1] https://bugs.llvm.org/show_bug.cgi?id=44916

Signed-off-by: Kees Cook
Signed-off-by: Jeff Layton

--- fs/fcntl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 9bc167562ee8..2e4c0fa2074b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -735,8 +735,9 @@ static void send_sigio_to_task(struct task_struct *p, return; switch (signum) { - kernel_siginfo_t si; - default: + default: { + kernel_siginfo_t si; + /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get @@ -769,6 +770,7 @@ static void send_sigio_to_task(struct task_struct *p, si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; + } /* fall-through - fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
-- cgit v1.2.3

From 8019ad13ef7f64be44d4f892af9c840179009254 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 4 Mar 2020 11:28:31 +0100
Subject: futex: Fix inode life-time issue

As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier.

This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue.

Reported-by: Jann Horn
Suggested-by: Linus Torvalds
Signed-off-by: Peter Zijlstra (Intel)

--- fs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 7d57068b6b7a..93d9252a00ab 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops;
-- cgit v1.2.3

From bc87302a093f0eab45cd4e250c2021299f712ec6 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi
Date: Thu, 5 Mar 2020 22:28:36 -0800
Subject: fat: fix uninit-memory access for partially initialized inode

If we get an error in the middle of reading an inode, some fields in the inode might still be uninitialized, and the evict_inode path may then access those fields via iput(). To fix this, make sure the inode fields are always initialized.
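Reduced to a userspace sketch, the shape of the fix looks like this (illustrative names mirroring the fields in the patch below, not the real fat driver structures):

#include <stdio.h>
#include <stdlib.h>

struct fat_inode_like {
	long mmu_private, i_start, i_logstart, i_attrs, i_pos;
};

static struct fat_inode_like *alloc_inode(void)
{
	struct fat_inode_like *ei = malloc(sizeof(*ei));

	if (!ei)
		return NULL;
	/* Zero unconditionally at allocation time, so a later error
	 * path can tear the object down even if filling it in from
	 * disk never happened. */
	ei->mmu_private = 0;
	ei->i_start = 0;
	ei->i_logstart = 0;
	ei->i_attrs = 0;
	ei->i_pos = 0;
	return ei;
}

static void evict_inode(struct fat_inode_like *ei)
{
	/* Safe even on a partially initialized object. */
	printf("evicting, i_start=%ld\n", ei->i_start);
	free(ei);
}

int main(void)
{
	struct fat_inode_like *ei = alloc_inode();

	if (ei)
		evict_inode(ei);	/* e.g. reading the inode failed */
	return 0;
}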
Reported-by: syzbot+9d82b8de2992579da5d0@syzkaller.appspotmail.com
Signed-off-by: Andrew Morton
Signed-off-by: OGAWA Hirofumi
Cc:
Link: http://lkml.kernel.org/r/871rqnreqx.fsf@mail.parknet.co.jp
Signed-off-by: Linus Torvalds

--- fs/fat/inode.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 594b05ae16c9..71946da84388 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -750,6 +750,13 @@ static struct inode *fat_alloc_inode(struct super_block *sb) return NULL; init_rwsem(&ei->truncate_lock); + /* Zeroing to allow iput() even if partial initialized inode. */ + ei->mmu_private = 0; + ei->i_start = 0; + ei->i_logstart = 0; + ei->i_attrs = 0; + ei->i_pos = 0; + return &ei->vfs_inode; } @@ -1374,16 +1381,6 @@ out: return 0; } -static void fat_dummy_inode_init(struct inode *inode) -{ - /* Initialize this dummy inode to work as no-op. */ - MSDOS_I(inode)->mmu_private = 0; - MSDOS_I(inode)->i_start = 0; - MSDOS_I(inode)->i_logstart = 0; - MSDOS_I(inode)->i_attrs = 0; - MSDOS_I(inode)->i_pos = 0; -} - static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); @@ -1844,13 +1841,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; - fat_dummy_inode_init(fat_inode); sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; - fat_dummy_inode_init(fsinfo_inode); fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode);
-- cgit v1.2.3

From 6d390e4b5d48ec03bb87e63cf0a2bff5f4e116da Mon Sep 17 00:00:00 2001
From: yangerkun
Date: Wed, 4 Mar 2020 15:25:56 +0800
Subject: locks: fix a potential use-after-free problem when waking up a waiter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 16306a61d3b7 ("fs/locks: always delete_block after waiting.") added logic to check waiter->fl_blocker without taking blocked_lock_lock, which can trigger a use-after-free when we wake up a waiter:

Thread 1 has created a write flock 'a' on a file; now thread 2 tries to unlock and delete flock 'a', while thread 3 tries to add flock 'b' on the same file.

Thread2                            Thread3
                                   flock syscall(create flock b)
                                   ...flock_lock_inode_wait
                                      flock_lock_inode(will insert
                                      our fl_blocked_member list
                                      to flock a's fl_blocked_requests)
                                      sleep
flock syscall(unlock)
...flock_lock_inode_wait
   locks_delete_lock_ctx
   ...__locks_wake_up_blocks
      __locks_delete_blocks(
      b->fl_blocker = NULL)
      ...
                                   break by a signal
                                   locks_delete_block
                                      b->fl_blocker == NULL &&
                                      list_empty(&b->fl_blocked_requests)
                                      success, return directly
                                   locks_free_lock b
wake_up(&b->fl_waiter)
trigger UAF

Fix it by removing this logic; this patch may also fix CVE-2019-19769.

Cc: stable@vger.kernel.org
Fixes: 16306a61d3b7 ("fs/locks: always delete_block after waiting.")
Signed-off-by: yangerkun
Signed-off-by: Jeff Layton

--- fs/locks.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 44b6da032842..426b55d333d5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -753,20 +753,6 @@ int locks_delete_block(struct file_lock *waiter) { int status = -ENOENT; - /* - * If fl_blocker is NULL, it won't be set again as this thread - * "owns" the lock and is the only one that might try to claim - * the lock. So it is safe to test fl_blocker locklessly.
- * Also if fl_blocker is NULL, this waiter is not listed on - * fl_blocked_requests for some lock, so no other request can - * be added to the list of fl_blocked_requests for this - * request. So if fl_blocker is NULL, it is safe to - * locklessly check if fl_blocked_requests is empty. If both - * of these checks succeed, there is no need to take the lock. - */ - if (waiter->fl_blocker == NULL && - list_empty(&waiter->fl_blocked_requests)) - return status; spin_lock(&blocked_lock_lock); if (waiter->fl_blocker) status = 0; -- cgit v1.2.3 From c1e2148f8ecb26863b899d402a823dab8e26efd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Mar 2020 07:25:50 -0700 Subject: io_uring: free fixed_file_data after RCU grace period The percpu refcount protects this structure, and we can have an atomic switch in progress when exiting. This makes it unsafe to just free the struct normally, and can trigger the following KASAN warning: BUG: KASAN: use-after-free in percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0 Read of size 1 at addr ffff888181a19a30 by task swapper/0/0 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.6.0-rc4+ #5747 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x76/0xa0 print_address_description.constprop.0+0x3b/0x60 ? percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0 ? percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0 __kasan_report.cold+0x1a/0x3d ? percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0 percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0 rcu_core+0x370/0x830 ? percpu_ref_exit+0x50/0x50 ? rcu_note_context_switch+0x7b0/0x7b0 ? run_rebalance_domains+0x11d/0x140 __do_softirq+0x10a/0x3e9 irq_exit+0xd5/0xe0 smp_apic_timer_interrupt+0x86/0x200 apic_timer_interrupt+0xf/0x20 RIP: 0010:default_idle+0x26/0x1f0 Fix this by punting the final exit and free of the struct to RCU, then we know that it's safe to do so. Jann suggested the approach of using a double rcu callback to achieve this. It's important that we do a nested call_rcu() callback, as otherwise the free could be ordered before the atomic switch, even if the latter was already queued. Reported-by: syzbot+e017e49c39ab484ac87a@syzkaller.appspotmail.com Suggested-by: Jann Horn Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a595c13e108..68050b61ad0e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -191,6 +191,7 @@ struct fixed_file_data { struct llist_head put_llist; struct work_struct ref_work; struct completion done; + struct rcu_head rcu; }; struct io_ring_ctx { @@ -5329,6 +5330,26 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } +static void __io_file_ref_exit_and_free(struct rcu_head *rcu) +{ + struct fixed_file_data *data = container_of(rcu, struct fixed_file_data, + rcu); + percpu_ref_exit(&data->refs); + kfree(data); +} + +static void io_file_ref_exit_and_free(struct rcu_head *rcu) +{ + /* + * We need to order our exit+free call against the potentially + * existing call_rcu() for switching to atomic. One way to do that + * is to have this rcu callback queue the final put and free, as we + * could otherwise have a pre-existing atomic switch complete _after_ + * the free callback we queued. 
+ */ + call_rcu(rcu, __io_file_ref_exit_and_free); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; @@ -5341,14 +5362,13 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) flush_work(&data->ref_work); wait_for_completion(&data->done); io_ring_file_ref_flush(data); - percpu_ref_exit(&data->refs); __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); kfree(data->table); - kfree(data); + call_rcu(&data->rcu, io_file_ref_exit_and_free); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; -- cgit v1.2.3 From f0e20b8943509d81200cef5e30af2adfddba0f5c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 Mar 2020 01:15:22 +0300 Subject: io_uring: fix lockup with timeouts There is a recipe to deadlock the kernel: submit a timeout sqe with a linked_timeout (e.g. test_single_link_timeout_ception() from liburing), and SIGKILL the process. Then, io_kill_timeouts() takes @ctx->completion_lock, but the timeout isn't flagged with REQ_F_COMP_LOCKED, and will try to double grab it during io_put_free() to cancel the linked timeout. Probably, the same can happen with another io_kill_timeout() call site, that is io_commit_cqring(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 68050b61ad0e..c06082bb039a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1000,6 +1000,7 @@ static void io_kill_timeout(struct io_kiocb *req) if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); } -- cgit v1.2.3 From 2b4eae95c7361e0a147b838715c8baa1380a428f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2020 00:41:38 -0800 Subject: fscrypt: don't evict dirty inodes after removing key After FS_IOC_REMOVE_ENCRYPTION_KEY removes a key, it syncs the filesystem and tries to get and put all inodes that were unlocked by the key so that unused inodes get evicted via fscrypt_drop_inode(). Normally, the inodes are all clean due to the sync. However, after the filesystem is sync'ed, userspace can modify and close one of the files. (Userspace is *supposed* to close the files before removing the key. But it doesn't always happen, and the kernel can't assume it.) This causes the inode to be dirtied and have i_count == 0. Then, fscrypt_drop_inode() failed to consider this case and indicated that the inode can be dropped, causing the write to be lost. On f2fs, other problems such as a filesystem freeze could occur due to the inode being freed while still on f2fs's dirty inode list. Fix this bug by making fscrypt_drop_inode() only drop clean inodes. I've written an xfstest which detects this bug on ext4, f2fs, and ubifs. 
Fixes: b1c0ec3599f4 ("fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY ioctl")
Cc: # v5.4+
Link: https://lore.kernel.org/r/20200305084138.653498-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers

--- fs/crypto/keysetup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 65cb09fa6ead..08c9f216a54d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -538,6 +538,15 @@ int fscrypt_drop_inode(struct inode *inode) return 0; mk = ci->ci_master_key->payload.data[0]; + /* + * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes + * protected by the key were cleaned by sync_filesystem(). But if + * userspace is still using the files, inodes can be dirtied between + * then and now. We mustn't lose any writes, so skip dirty inodes here. + */ + if (inode->i_state & I_DIRTY_ALL) + return 0; + /* * Note: since we aren't holding ->mk_secret_sem, the result here can * immediately become outdated. But there's no correctness problem with
-- cgit v1.2.3

From 805b13adde3964c78cba125a15527e88c19f87b3 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 8 Mar 2020 20:07:28 -0600
Subject: io_uring: ensure RCU callback ordering with rcu_barrier()

After more careful study, Paul informs me that we cannot rely on ordering of RCU callbacks in the way that the tagged commit did. The current construct looks like this:

	void C(struct rcu_head *rhp)
	{
		do_something(rhp);
		call_rcu(&p->rh, B);
	}

	call_rcu(&p->rh, A);
	call_rcu(&p->rh, C);

and we're relying on ordering between A and B, which isn't guaranteed. Make this explicit instead, and have a work item issue the rcu_barrier() to ensure that A has run before we manually execute B.

While thorough testing never showed this issue, it's dependent on the per-cpu load in terms of RCU callbacks. The updated method simplifies the code as well, and eliminates the need to maintain an rcu_head in the fileset data.

Fixes: c1e2148f8ecb ("io_uring: free fixed_file_data after RCU grace period")
Reported-by: Paul E. McKenney
Signed-off-by: Jens Axboe

--- fs/io_uring.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index c06082bb039a..1b2517291b78 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -191,7 +191,6 @@ struct fixed_file_data { struct llist_head put_llist; struct work_struct ref_work; struct completion done; - struct rcu_head rcu; }; struct io_ring_ctx { @@ -5331,24 +5330,21 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } -static void __io_file_ref_exit_and_free(struct rcu_head *rcu) +static void io_file_ref_exit_and_free(struct work_struct *work) { - struct fixed_file_data *data = container_of(rcu, struct fixed_file_data, - rcu); - percpu_ref_exit(&data->refs); - kfree(data); -} + struct fixed_file_data *data; + + data = container_of(work, struct fixed_file_data, ref_work); -static void io_file_ref_exit_and_free(struct rcu_head *rcu) -{ /* - * We need to order our exit+free call against the potentially - * existing call_rcu() for switching to atomic. One way to do that - * is to have this rcu callback queue the final put and free, as we - * could otherwise have a pre-existing atomic switch complete _after_ - * the free callback we queued. + * Ensure any percpu-ref atomic switch callback has run, it could have + * been in progress when the files were being unregistered.
Once + * that's done, we can safely exit and free the ref and containing + * data structure. */ - call_rcu(rcu, __io_file_ref_exit_and_free); + rcu_barrier(); + percpu_ref_exit(&data->refs); + kfree(data); } static int io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -5369,7 +5365,8 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); kfree(data->table); - call_rcu(&data->rcu, io_file_ref_exit_and_free); + INIT_WORK(&data->ref_work, io_file_ref_exit_and_free); + queue_work(system_wq, &data->ref_work); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; -- cgit v1.2.3 From 531d3040bc5cf37dea01b118608347cca9325f9d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 2 Mar 2020 15:03:35 +0200 Subject: ovl: fix lock in ovl_llseek() ovl_inode_lock() is interruptible. When inode_lock() in ovl_llseek() was replaced with ovl_inode_lock(), we did not add a check for error. Fix this by making ovl_inode_lock() uninterruptible and change the existing call sites to use an _interruptible variant. Reported-by: syzbot+66a9752fa927f745385e@syzkaller.appspotmail.com Fixes: b1f9d3858f72 ("ovl: use ovl_inode_lock in ovl_llseek()") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 7 ++++++- fs/overlayfs/util.c | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 3623d28aa4fa..3d3f2b8bdae5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -318,7 +318,12 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb) return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; } -static inline int ovl_inode_lock(struct inode *inode) +static inline void ovl_inode_lock(struct inode *inode) +{ + mutex_lock(&OVL_I(inode)->lock); +} + +static inline int ovl_inode_lock_interruptible(struct inode *inode) { return mutex_lock_interruptible(&OVL_I(inode)->lock); } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index ea005085803f..042f7eb4f7f4 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -509,7 +509,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags) struct inode *inode = d_inode(dentry); int err; - err = ovl_inode_lock(inode); + err = ovl_inode_lock_interruptible(inode); if (!err && ovl_already_copied_up_locked(dentry, flags)) { err = 1; /* Already copied up */ ovl_inode_unlock(inode); @@ -764,7 +764,7 @@ int ovl_nlink_start(struct dentry *dentry) return err; } - err = ovl_inode_lock(inode); + err = ovl_inode_lock_interruptible(inode); if (err) return err; -- cgit v1.2.3 From 21039132650281de06a169cbe8a0f7e5c578fd8b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Mar 2020 09:31:41 -0400 Subject: gfs2_atomic_open(): fix O_EXCL|O_CREAT handling on cold dcache with the way fs/namei.c:do_last() had been done, ->atomic_open() instances needed to recognize the case when existing file got found with O_EXCL|O_CREAT, either by falling back to finish_no_open() or failing themselves. gfs2 one didn't. 
Fixes: 6d4ade986f9c (GFS2: Add atomic_open support)
Cc: stable@kernel.org # v3.11
Signed-off-by: Al Viro

--- fs/gfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2716d56ed0a0..8294851a9dd9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1248,7 +1248,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, if (!(file->f_mode & FMODE_OPENED)) return finish_no_open(file, d); dput(d); - return 0; + return excl && (flags & O_CREAT) ? -EEXIST : 0; } BUG_ON(d != NULL);
-- cgit v1.2.3

From d9a9f4849fe0c9d560851ab22a85a666cddfdd24 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 12 Mar 2020 18:25:20 -0400
Subject: cifs_atomic_open(): fix double-put on late allocation failure

several iterations of ->atomic_open() calling conventions ago, we used to need fput() if ->atomic_open() failed at some point after successful finish_open(). Now (since 2016) it's not needed - struct file carries enough state to make fput() work regardless of the point in struct file lifecycle, and discarding it on failure exits in open() got unified. Unfortunately, I'd missed the fact that we had an instance of ->atomic_open() (the cifs one) that used to need that fput(), as well as the stale comment in finish_open() demanding such late failure handling. Trivially fixed...

Fixes: fe9ec8291fca "do_last(): take fput() on error after opening to out:"
Cc: stable@kernel.org # v4.7+
Signed-off-by: Al Viro

--- fs/cifs/dir.c | 1 - fs/open.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 0ef099442f20..36e7b2fd2190 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -555,7 +555,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, if (server->ops->close) server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); - fput(file); rc = -ENOMEM; } diff --git a/fs/open.c b/fs/open.c index 0788b3715731..b69d6eed67e6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -860,9 +860,6 @@ cleanup_file: * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * - * On successful return @file is a fully instantiated open file. After this, if - * an error occurs in ->atomic_open(), it needs to clean up with fput(). - * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry,
-- cgit v1.2.3

From 53afcd310e867d25e394718558783c476301205c Mon Sep 17 00:00:00 2001
From: Amir Goldstein
Date: Fri, 21 Feb 2020 16:34:42 +0200
Subject: ovl: fix some xino configurations

Fix up two bugs in the conversion to xino_mode:

1. xino=off does not always end up in disabled mode
2. xino=auto on 32bit arch should end up in disabled mode

Take a proactive approach to disabling xino on a 32bit kernel:

1. Disable the XINO_AUTO config at build time
2. Disable xino with a warning at mount time

As a by-product, xino=on on a 32bit arch also ends up in disabled mode. We never intended to enable xino on 32bit arch, and this will make the rest of the logic simpler.
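A simplified userspace model of the xino idea (this is not overlayfs's exact encoding; the shift and field names are illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int bits_per_long = (int)(sizeof(long) * 8);
	int spare_bits = bits_per_long - 32;	/* 32 on 64-bit, 0 on 32-bit */

	if (spare_bits <= 0) {
		printf("no spare high bits, xino must be disabled\n");
		return 0;
	}

	uint64_t real_ino = 12345;		/* underlying fs inode number */
	uint64_t fsid = 2;			/* which layer it came from */
	uint64_t xino = real_ino | (fsid << 32);	/* layer id above the low 32 ino bits */

	printf("xino = 0x%016llx\n", (unsigned long long)xino);
	return 0;
}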
Fixes: 0f831ec85eda ("ovl: simplify ovl_same_sb() helper") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/Kconfig | 1 + fs/overlayfs/super.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 444e2da4f60e..714c14c47ca5 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -93,6 +93,7 @@ config OVERLAY_FS_XINO_AUTO bool "Overlayfs: auto enable inode number mapping" default n depends on OVERLAY_FS + depends on 64BIT help If this config option is enabled then overlay filesystems will use unused high bits in undelying filesystem inode numbers to map all diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 319fe0d355b0..ac967f1cb6e5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1411,6 +1411,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, if (ofs->config.xino == OVL_XINO_ON) pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); ofs->xino_mode = 0; + } else if (ofs->config.xino == OVL_XINO_OFF) { + ofs->xino_mode = -1; } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { /* * This is a roundup of number of bits needed for encoding @@ -1623,8 +1625,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ - if (ofs->config.xino != OVL_XINO_OFF) + if (ofs->config.xino != OVL_XINO_OFF) { ofs->xino_mode = BITS_PER_LONG - 32; + if (!ofs->xino_mode) { + pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n"); + ofs->config.xino = OVL_XINO_OFF; + } + } /* alloc/destroy_inode needed for setting up traps in inode cache */ sb->s_op = &ovl_super_operations; -- cgit v1.2.3 From c853680453ac235e9010987a8bdaaba0e116d3c8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 13 Mar 2020 15:42:20 +0100 Subject: ovl: fix lockdep warning for async write Lockdep reports "WARNING: lock held when returning to user space!" due to async write holding freeze lock over the write. Apparently aio.c already deals with this by lying to lockdep about the state of the lock. Do the same here. No need to check for S_IFREG() here since these file ops are regular-only. 
Reported-by: syzbot+9331a354f4f624a52a55@syzkaller.appspotmail.com
Fixes: 2406a307ac7d ("ovl: implement async IO routines")
Signed-off-by: Miklos Szeredi

--- fs/overlayfs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a5317216de73..87c362f65448 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -244,6 +244,9 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) if (iocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(orig_iocb->ki_filp); + /* Actually acquired in ovl_write_iter() */ + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, + SB_FREEZE_WRITE); file_end_write(iocb->ki_filp); ovl_copyattr(ovl_inode_real(inode), inode); } @@ -346,6 +349,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; file_start_write(real.file); + /* Pacify lockdep, same trick as done in aio_write() */ + __sb_writers_release(file_inode(real.file)->i_sb, + SB_FREEZE_WRITE); aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb;
-- cgit v1.2.3

From ddd2b85ff73bb60061a9fb08ac1f5a03a2d4bce0 Mon Sep 17 00:00:00 2001
From: Jann Horn
Date: Thu, 12 Mar 2020 21:36:53 +0000
Subject: afs: Use kfree_rcu() instead of casting kfree() to rcu_callback_t

afs_put_addrlist() casts kfree() to rcu_callback_t. Apart from being wrong in theory, this might also blow up when people start enforcing function types via compiler instrumentation, and it means the rcu_head has to be first in struct afs_addr_list.

Use kfree_rcu() instead; it's simpler and more correct.

Signed-off-by: Jann Horn
Signed-off-by: David Howells
Signed-off-by: Linus Torvalds

--- fs/afs/addr_list.c | 2 +- fs/afs/internal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index df415c05939e..de1ae0bead3b 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -19,7 +19,7 @@ void afs_put_addrlist(struct afs_addr_list *alist) { if (alist && refcount_dec_and_test(&alist->usage)) - call_rcu(&alist->rcu, (rcu_callback_t)kfree); + kfree_rcu(alist, rcu); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1d81fc4c3058..35f951ac296f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -81,7 +81,7 @@ enum afs_call_state { * List of server addresses. */ struct afs_addr_list { - struct rcu_head rcu; /* Must be first */ + struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ unsigned char max_addrs;
-- cgit v1.2.3

From 39946886fc865a4c26f1b3ea0805936db2d8986d Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Fri, 28 Feb 2020 12:22:59 +0300
Subject: cifs: potential uninitialized error code in cifs_getattr()

Smatch complains that "rc" could be uninitialized:

fs/cifs/inode.c:2206 cifs_getattr() error: uninitialized symbol 'rc'.

Changing it to "return 0;" improves readability as well.
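A userspace reduction of the pattern Smatch flagged (hypothetical function, not the cifs code):

#include <stdio.h>

/* rc is assigned on some paths only, yet returned on all of them */
static int getattr_like(int need_revalidate)
{
	int rc;			/* uninitialized */

	if (need_revalidate)
		rc = -5;	/* the only assignment to rc */

	/* ... fill in attributes ... */
	return rc;		/* indeterminate when need_revalidate == 0 */
}

int main(void)
{
	/* reading the indeterminate value is undefined behaviour */
	printf("rc = %d\n", getattr_like(0));
	return 0;
}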
Fixes: cc1baf98c8f6 ("cifs: do not ignore the SYNC flags in getattr")
Signed-off-by: Dan Carpenter
Signed-off-by: Steve French
Acked-by: Ronnie Sahlberg

--- fs/cifs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1e8a4b1579db..b16f8d23e97b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2191,7 +2191,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat, if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) stat->gid = current_fsgid(); } - return rc; + return 0; } int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
-- cgit v1.2.3

From 1be1fa42ebb73ad8fd67d2c846931361b4e3dd0a Mon Sep 17 00:00:00 2001
From: Shyam Prasad N
Date: Mon, 9 Mar 2020 01:35:09 -0700
Subject: CIFS: Increment num_remote_opens stats counter even in case of smb2_query_dir_first

The num_remote_opens counter keeps track of the number of open files which must be maintained by the server at any point. This is a per-tree-connect counter, and the value of this counter gets displayed in the /proc/fs/cifs/Stats output as follows:

Open files: 0 total (local), 1 open on server
                             ^^^^^^^^^^^^^^^^

As a rule of thumb, we want to increment this counter for each open/create that we successfully execute on the server. Similarly, we should decrement the counter when we successfully execute a close.

In this case, an increment was being missed in smb2_query_dir_first() in the case of a successful open. As a result, we would underflow the counter, and the counter could even go negative after enough smb2_query_dir_first() calls.

I tested the stats counter for a bunch of filesystem operations with the fix, and it looks correct to me. I also checked whether we missed increments or decrements elsewhere; it does not seem so. A few other cases where an open is done and we don't increment the counter are the compound calls, where the corresponding close is also sent in the request.

Signed-off-by: Shyam Prasad N
CC: Stable
Signed-off-by: Steve French
Reviewed-by: Aurelien Aptel
Reviewed-by: Pavel Shilovsky

--- fs/cifs/smb2ops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c31e84ee3c39..3dddd20c5e2b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2222,6 +2222,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, goto qdf_free; } + atomic_inc(&tcon->num_remote_opens); + qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, fid->persistent_fid,
-- cgit v1.2.3

From 979a2665eb6c603ddce0ab374041ab101827b2e7 Mon Sep 17 00:00:00 2001
From: Murphy Zhou
Date: Sat, 14 Mar 2020 11:38:31 +0800
Subject: CIFS: fiemap: do not return EINVAL if we get nothing

If we call fiemap on a truncated file with no blocks allocated, it makes sense that we get nothing from this call. No output means no blocks have been counted, but the call succeeded. It's a valid response.
Simple example reproducer: xfs_io -f 'truncate 2M' -c 'fiemap -v' /cifssch/testfile xfs_io: ioctl(FS_IOC_FIEMAP) ["/cifssch/testfile"]: Invalid argument Signed-off-by: Murphy Zhou Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky CC: Stable --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3dddd20c5e2b..cfe9b800ea8c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3419,7 +3419,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, if (rc) goto out; - if (out_data_len < sizeof(struct file_allocated_range_buffer)) { + if (out_data_len && out_data_len < sizeof(struct file_allocated_range_buffer)) { rc = -EINVAL; goto out; } -- cgit v1.2.3 From dcf23ac3e846ca0cf626c155a0e3fcbbcf4fae8a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Mar 2020 07:52:21 -0400 Subject: locks: reinstate locks_delete_block optimization There is measurable performance impact in some synthetic tests due to commit 6d390e4b5d48 (locks: fix a potential use-after-free problem when wakeup a waiter). Fix the race condition instead by clearing the fl_blocker pointer after the wake_up, using explicit acquire/release semantics. This does mean that we can no longer use the clearing of fl_blocker as the wait condition, so switch the waiters over to checking whether the fl_blocked_member list_head is empty. Reviewed-by: yangerkun Reviewed-by: NeilBrown Fixes: 6d390e4b5d48 (locks: fix a potential use-after-free problem when wakeup a waiter) Signed-off-by: Jeff Layton Signed-off-by: Linus Torvalds --- fs/cifs/file.c | 3 ++- fs/locks.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3b942ecdd4be..8f9d849a0012 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1169,7 +1169,8 @@ try_again: rc = posix_lock_file(file, flock, NULL); up_write(&cinode->lock_sem); if (rc == FILE_LOCK_DEFERRED) { - rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker); + rc = wait_event_interruptible(flock->fl_wait, + list_empty(&flock->fl_blocked_member)); if (!rc) goto try_again; locks_delete_block(flock); diff --git a/fs/locks.c b/fs/locks.c index 426b55d333d5..b8a31c1c4fff 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -725,7 +725,6 @@ static void __locks_delete_block(struct file_lock *waiter) { locks_delete_global_blocked(waiter); list_del_init(&waiter->fl_blocked_member); - waiter->fl_blocker = NULL; } static void __locks_wake_up_blocks(struct file_lock *blocker) @@ -740,6 +739,13 @@ static void __locks_wake_up_blocks(struct file_lock *blocker) waiter->fl_lmops->lm_notify(waiter); else wake_up(&waiter->fl_wait); + + /* + * The setting of fl_blocker to NULL marks the "done" + * point in deleting a block. Paired with acquire at the top + * of locks_delete_block(). + */ + smp_store_release(&waiter->fl_blocker, NULL); } } @@ -753,11 +759,42 @@ int locks_delete_block(struct file_lock *waiter) { int status = -ENOENT; + /* + * If fl_blocker is NULL, it won't be set again as this thread "owns" + * the lock and is the only one that might try to claim the lock. + * + * We use acquire/release to manage fl_blocker so that we can + * optimize away taking the blocked_lock_lock in many cases. + * + * The smp_load_acquire guarantees two things: + * + * 1/ that fl_blocked_requests can be tested locklessly. 
If something + * was recently added to that list it must have been in a locked region + * *before* the locked region when fl_blocker was set to NULL. + * + * 2/ that no other thread is accessing 'waiter', so it is safe to free + * it. __locks_wake_up_blocks is careful not to touch waiter after + * fl_blocker is released. + * + * If a lockless check of fl_blocker shows it to be NULL, we know that + * no new locks can be inserted into its fl_blocked_requests list, and + * can avoid doing anything further if the list is empty. + */ + if (!smp_load_acquire(&waiter->fl_blocker) && + list_empty(&waiter->fl_blocked_requests)) + return status; + spin_lock(&blocked_lock_lock); if (waiter->fl_blocker) status = 0; __locks_wake_up_blocks(waiter); __locks_delete_block(waiter); + + /* + * The setting of fl_blocker to NULL marks the "done" point in deleting + * a block. Paired with acquire at the top of this function. + */ + smp_store_release(&waiter->fl_blocker, NULL); spin_unlock(&blocked_lock_lock); return status; } @@ -1350,7 +1387,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + error = wait_event_interruptible(fl->fl_wait, + list_empty(&fl->fl_blocked_member)); if (error) break; } @@ -1435,7 +1473,8 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, error = posix_lock_inode(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker); + error = wait_event_interruptible(fl.fl_wait, + list_empty(&fl.fl_blocked_member)); if (!error) { /* * If we've been sleeping someone might have @@ -1638,7 +1677,8 @@ restart: locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, - !new_fl->fl_blocker, break_time); + list_empty(&new_fl->fl_blocked_member), + break_time); percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -2122,7 +2162,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + error = wait_event_interruptible(fl->fl_wait, + list_empty(&fl->fl_blocked_member)); if (error) break; } @@ -2399,7 +2440,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + error = wait_event_interruptible(fl->fl_wait, + list_empty(&fl->fl_blocked_member)); if (error) break; } -- cgit v1.2.3