From fcd4904e2f6908d5c255fa5818bcf8ad32a6f0e8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 19:23:03 +0100 Subject: netfs: Remove call to folio_index() Calling folio_index() is pointless overhead; directly dereferencing folio->index is fine. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241005182307.3190401-2-willy@infradead.org Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 1d7c52821e55..72a208fd4496 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -451,7 +451,7 @@ TRACE_EVENT(netfs_folio, struct address_space *__m = READ_ONCE(folio->mapping); __entry->ino = __m ? __m->host->i_ino : 0; __entry->why = why; - __entry->index = folio_index(folio); + __entry->index = folio->index; __entry->nr = folio_nr_pages(folio); ), -- cgit v1.2.3 From c6a90fe7f080d71271b723490454cfda1f81e4b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 19:23:04 +0100 Subject: netfs: Fix a few minor bugs in netfs_page_mkwrite() We can't return with VM_FAULT_SIGBUS | VM_FAULT_LOCKED; the core code will not unlock the folio in this instance. Introduce a new "unlock" error exit to handle this case. Use it to handle the "folio is truncated" check, and change the "writeback interrupted by a fatal signal" to do a NOPAGE exit instead of letting the core code install the folio currently under writeback before killing the process. 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241005182307.3190401-3-willy@infradead.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b3910dfcb56d..ff2814da88b1 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter); /* * Notification that a previously read-only page is about to become writable. - * Note that the caller indicates a single page of a multipage folio. + * The caller indicates the precise page that needs to be written to, but + * we only track group on a per-folio basis, so we block more often than + * we might otherwise. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) { @@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); - vm_fault_t ret = VM_FAULT_RETRY; + vm_fault_t ret = VM_FAULT_NOPAGE; int err; _enter("%lx", folio->index); @@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr if (folio_lock_killable(folio) < 0) goto out; - if (folio->mapping != mapping) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (folio_wait_writeback_killable(folio)) { - ret = VM_FAULT_LOCKED; - goto out; - } + if (folio->mapping != mapping) + goto unlock; + if (folio_wait_writeback_killable(folio) < 0) + goto unlock; /* Can we see a streaming write here? 
*/ if (WARN_ON(!folio_test_uptodate(folio))) { - ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; - goto out; + ret = VM_FAULT_SIGBUS; + goto unlock; } group = netfs_folio_group(folio); @@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr out: sb_end_pagefault(inode->i_sb); return ret; +unlock: + folio_unlock(folio); + goto out; } EXPORT_SYMBOL(netfs_page_mkwrite); -- cgit v1.2.3 From e995e8b600260cff3cfaf2607a62be8bdc4aa9c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 19:23:05 +0100 Subject: netfs: Remove unnecessary references to pages These places should all use folios instead of pages. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241005182307.3190401-4-willy@infradead.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 8 ++++---- fs/netfs/buffered_write.c | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index c40e226053cc..17aaec00002b 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -646,7 +646,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, if (unlikely(always_fill)) { if (pos - offset + len <= i_size) return false; /* Page entirely before EOF */ - zero_user_segment(&folio->page, 0, plen); + folio_zero_segment(folio, 0, plen); folio_mark_uptodate(folio); return true; } @@ -665,7 +665,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, return false; zero_out: - zero_user_segments(&folio->page, 0, offset, offset + len, plen); + folio_zero_segments(folio, 0, offset, offset + len, plen); return true; } @@ -732,7 +732,7 @@ retry: if (folio_test_uptodate(folio)) goto have_folio; - /* If the page is beyond the EOF, we want to clear it - unless it's + /* If the folio is beyond the EOF, we want to clear it - unless it's * within the cache granule containing the EOF, in which 
case we need * to preload the granule. */ @@ -792,7 +792,7 @@ error: EXPORT_SYMBOL(netfs_write_begin); /* - * Preload the data into a page we're proposing to write into. + * Preload the data into a folio we're proposing to write into. */ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index ff2814da88b1..b4826360a411 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, * netfs_perform_write - Copy data into the pagecache. * @iocb: The operation parameters * @iter: The source buffer - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * - * Copy data into pagecache pages attached to the inode specified by @iocb. + * Copy data into pagecache folios attached to the inode specified by @iocb. * The caller must hold appropriate inode locks. * - * Dirty pages are tagged with a netfs_folio struct if they're not up to date - * to indicate the range modified. Dirty pages may also be tagged with a + * Dirty folios are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty folios may also be tagged with a * netfs-specific grouping such that data from an old group gets flushed before * a new one is started. */ @@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * we try to read it. 
*/ if (fpos >= ctx->zero_point) { - zero_user_segment(&folio->page, 0, offset); + folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - zero_user_segment(&folio->page, offset + copied, flen); + folio_zero_segment(folio, offset + copied, flen); __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); trace_netfs_folio(folio, netfs_modify_and_clear); @@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write); * netfs_buffered_write_iter_locked - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates -- cgit v1.2.3 From 10c35abd35aa62c9aac56898ae0c63b4d7d115e5 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 7 Nov 2024 19:06:45 +0800 Subject: cachefiles: Fix incorrect length return value in cachefiles_ondemand_fd_write_iter() cachefiles_ondemand_fd_write_iter() function first aligns "pos" and "len" to block boundaries. When calling __cachefiles_write(), the aligned "pos" is passed in, but "len" is the original unaligned value(iter->count). Additionally, the returned length of the write operation is the modified "len" aligned by block size, which is unreasonable. The alignment of "pos" and "len" is intended only to check whether the cache has enough space. But the modified len should not be used as the return value of cachefiles_ondemand_fd_write_iter() because the length we passed to __cachefiles_write() is the previous "len". Doing so would result in a mismatch in the data written on-demand. 
For example, if the length passed in from user space is not aligned to the block size (e.g. in the read-ahead scenario, with DIO writes that only need 512-byte alignment, or under fault injection), the length returned will differ from the length actually written. To solve this issue, since __cachefiles_prepare_write() modifies "len", we pass "aligned_len" to __cachefiles_prepare_write() to calculate the free blocks and use the original "len" as the return value of cachefiles_ondemand_fd_write_iter(). Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20241107110649.3980193-2-wozizhi@huawei.com Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 470c96658385..bdd321017f1c 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -61,7 +61,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, struct cachefiles_object *object = kiocb->ki_filp->private_data; struct cachefiles_cache *cache = object->volume->cache; struct file *file = object->file; - size_t len = iter->count; + size_t len = iter->count, aligned_len = len; loff_t pos = kiocb->ki_pos; const struct cred *saved_cred; int ret; @@ -70,7 +70,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, return -ENOBUFS; cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) return ret; -- cgit v1.2.3 From 56f4856b425a30e1d8b3e41e6cde8bfba90ba5f8 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 7 Nov 2024 19:06:46 +0800 Subject: cachefiles: Fix missing pos updates in cachefiles_ondemand_fd_write_iter() In the erofs on-demand loading 
scenario, read and write operations are usually delivered through the "off" and "len" contained in the read request in user space. Naturally, pwrite is used to specify a specific offset to complete write operations. However, if the write (not pwrite) syscall is called multiple times in the read-ahead scenario, we need to manually update ki_pos after each write operation to update file->f_pos. This step is currently missing from cachefiles_ondemand_fd_write_iter(); add it to address this issue. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20241107110649.3980193-3-wozizhi@huawei.com Acked-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index bdd321017f1c..38ca6dce8ef2 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -77,8 +77,10 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); - if (!ret) + if (!ret) { ret = len; + kiocb->ki_pos += ret; + } return ret; } -- cgit v1.2.3 From 09ecf8f5505465b5527a39dff4b159af62306eee Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 7 Nov 2024 19:06:47 +0800 Subject: cachefiles: Clean up in cachefiles_commit_tmpfile() Currently, cachefiles_commit_tmpfile() will only be called if CACHEFILES_OBJECT_USING_TMPFILE is set in object->flags. Only cachefiles_create_file() and cachefiles_invalidate_cookie() set this flag. Both of these functions replace object->file with the new tmpfile, and both are called by fscache_cookie_state_machine(), so there are no concurrency issues. 
So the condition "d_backing_inode(dentry) == file_inode(object->file)" in cachefiles_commit_tmpfile() will never hold true according to the above conditions. This patch removes this redundant code and does not involve any other logical changes. Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20241107110649.3980193-4-wozizhi@huawei.com Acked-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 2b3f9935dbb4..7cf59713f0f7 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, } if (!d_is_negative(dentry)) { - if (d_backing_inode(dentry) == file_inode(object->file)) { - success = true; - goto out_dput; - } - ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) -- cgit v1.2.3 From 31ad74b20227ce6b40910ff78b1c604e42975cf1 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 7 Nov 2024 19:06:48 +0800 Subject: cachefiles: Fix NULL pointer dereference in object->file At present, object->file is subject to a NULL pointer dereference in on-demand mode. The root cause is that the lifetimes of the allocated fd and of object->file are inconsistent, and the user-space invocation on the anon_fd uses object->file. Following is the process that triggers the issue: [write fd] [umount] cachefiles_ondemand_fd_write_iter fscache_cookie_state_machine cachefiles_withdraw_cookie if (!file) return -ENOBUFS cachefiles_clean_up_object cachefiles_unmark_inode_in_use fput(object->file) object->file = NULL // file NULL pointer dereference! __cachefiles_write(..., file, ...) Fix this issue by taking an additional reference on object->file before write/llseek and dropping it once the operation has finished. 
Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20241107110649.3980193-5-wozizhi@huawei.com Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/interface.c | 14 ++++++++++---- fs/cachefiles/ondemand.c | 30 ++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 35ba2117a6f6..3e63cfe15874 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -327,6 +327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object, static void cachefiles_clean_up_object(struct cachefiles_object *object, struct cachefiles_cache *cache) { + struct file *file; + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { cachefiles_see_object(object, cachefiles_obj_see_clean_delete); @@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object, } cachefiles_unmark_inode_in_use(object, object->file); - if (object->file) { - fput(object->file); - object->file = NULL; - } + + spin_lock(&object->lock); + file = object->file; + object->file = NULL; + spin_unlock(&object->lock); + + if (file) + fput(file); } /* diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 38ca6dce8ef2..fe3de9ad57bf 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -60,20 +60,26 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, { struct cachefiles_object *object = kiocb->ki_filp->private_data; struct cachefiles_cache *cache = object->volume->cache; - struct file *file = object->file; + struct file *file; size_t len = iter->count, aligned_len = len; loff_t pos = kiocb->ki_pos; const struct cred *saved_cred; int ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + 
spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) - return ret; + goto out; trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); @@ -82,6 +88,8 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, kiocb->ki_pos += ret; } +out: + fput(file); return ret; } @@ -89,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, int whence) { struct cachefiles_object *object = filp->private_data; - struct file *file = object->file; + struct file *file; + loff_t ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); - return vfs_llseek(file, pos, whence); + ret = vfs_llseek(file, pos, whence); + fput(file); + + return ret; } static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, -- cgit v1.2.3 From 22f9400a6f3560629478e0a64247b8fcc811a24d Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 7 Nov 2024 19:06:49 +0800 Subject: netfs/fscache: Add a memory barrier for FSCACHE_VOLUME_CREATING In fscache_create_volume(), there is a missing memory barrier between the bit-clearing operation and the wake-up operation. This may cause a situation where, after a wake-up, the bit-clearing operation hasn't been detected yet, leading to an indefinite wait. 
The triggering process is as follows: [cookie1] [cookie2] [volume_work] fscache_perform_lookup fscache_create_volume fscache_perform_lookup fscache_create_volume fscache_create_volume_work cachefiles_acquire_volume clear_and_wake_up_bit test_and_set_bit test_and_set_bit goto maybe_wait goto no_wait In the above process, cookie1 and cookie2 have the same volume. When cookie1 enters the -no_wait- process, it will clear the bit and wake up the waiting process. If a barrier is missing, it may cause cookie2 to remain in the -wait- process indefinitely. In commit 3288666c7256 ("fscache: Use clear_and_wake_up_bit() in fscache_create_volume_work()"), barriers were added to similar operations in fscache_create_volume_work(), but fscache_create_volume() was missed. Fix this issue by combining the clear and wake operations into clear_and_wake_up_bit(). Fixes: bfa22da3ed65 ("fscache: Provide and use cache methods to lookup/create/free a volume") Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20241107110649.3980193-6-wozizhi@huawei.com Acked-by: David Howells Signed-off-by: Christian Brauner --- fs/netfs/fscache_volume.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cb75c07b5281..ced14ac78cc1 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -322,8 +322,7 @@ maybe_wait: } return; no_wait: - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); } /* -- cgit v1.2.3