From ca82a7ea2299b4586af1f77daee66ee781202320 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 19 Sep 2025 14:42:50 -0700 Subject: iomap: simplify iomap_iter_advance() Most callers of iomap_iter_advance() do not need the remaining length returned. Get rid of the extra iomap_length() call that iomap_iter_advance() does. Signed-off-by: Joanne Koong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 73dceabc21c8..4469b2318b08 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -245,7 +245,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); -int iomap_iter_advance(struct iomap_iter *iter, u64 *count); +int iomap_iter_advance(struct iomap_iter *iter, u64 count); /** * iomap_length_trim - trimmed length of the current iomap iteration @@ -282,9 +282,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter) */ static inline int iomap_iter_advance_full(struct iomap_iter *iter) { - u64 length = iomap_length(iter); - - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, iomap_length(iter)); } /** -- cgit v1.2.3 From b2f35ac4146d32d4424aaa941bbc681f12c1b9e6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:04 -0700 Subject: iomap: add caller-provided callbacks for read and readahead Add caller-provided callbacks for read and readahead so that it can be used generically, especially by filesystems that are not block-based. In particular, this: * Modifies the read and readahead interface to take in a struct iomap_read_folio_ctx that is publicly defined as: struct iomap_read_folio_ctx { const struct iomap_read_ops *ops; struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; }; where struct iomap_read_ops is defined as: struct iomap_read_ops { int (*read_folio_range)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t len); void (*read_submit)(struct iomap_read_folio_ctx *ctx); }; read_folio_range() reads in the folio range and is required by the caller to provide. read_submit() is optional and is used for submitting any pending read requests. * Modifies existing filesystems that use iomap for read and readahead to use the new API, through the new statically inlined helpers iomap_bio_read_folio() and iomap_bio_readahead(). There is no change in functionality for those filesystems. Signed-off-by: Joanne Koong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4469b2318b08..37435b912755 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -16,6 +16,7 @@ struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; +struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; @@ -337,8 +338,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); +int iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -465,6 +468,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error); void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, @@ -473,6 +478,34 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); +struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; +}; + +struct iomap_read_ops { + /* + * Read in a folio range. + * + * The caller is responsible for calling iomap_finish_folio_read() after + * reading in the folio range. This should be done even if an error is + * encountered during the read. + * + * Returns 0 on success or a negative error on failure. + */ + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + + /* + * Submit any pending read requests. + * + * This is optional. + */ + void (*submit_read)(struct iomap_read_folio_ctx *ctx); +}; + /* * Flags for direct I/O ->end_io: */ @@ -538,4 +571,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; +#ifdef CONFIG_BLOCK +extern const struct iomap_read_ops iomap_bio_read_ops; + +static inline void iomap_bio_read_folio(struct folio *folio, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .cur_folio = folio, + }; + + iomap_read_folio(ops, &ctx); +} + +static inline void iomap_bio_readahead(struct readahead_control *rac, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .rac = rac, + }; + + iomap_readahead(ops, &ctx); +} +#endif /* CONFIG_BLOCK */ + #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From d4e88bb08e5f7e6eb4e9c3685894b9b57bfdfb08 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:06 -0700 Subject: iomap: make iomap_read_folio() a void return No errors are propagated in iomap_read_folio(). Change iomap_read_folio() to a void return to make this clearer to callers. Signed-off-by: Joanne Koong Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 37435b912755..6d864b446b6e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -338,7 +338,7 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(const struct iomap_ops *ops, +void iomap_read_folio(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); void iomap_readahead(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); -- cgit v1.2.3 From f8d98072feee32722086ddae4f288b6c45ae4330 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 3 Oct 2025 09:46:35 -0400 Subject: filemap: add helper to look up dirty folios in a range Add a new filemap_get_folios_dirty() helper to look up existing dirty folios in a range and add them to a folio_batch. This is to support optimization of certain iomap operations that only care about dirty folios in a target range. For example, zero range only zeroes the subset of dirty pages over unwritten mappings, seek hole/data may use similar logic in the future, etc. Note that the helper is intended for use under internal fs locks. Therefore it trylocks folios in order to filter out clean folios. This loosely follows the logic from filemap_range_has_writeback(). Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..7274a86b4871 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); +unsigned filemap_get_folios_dirty(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); -- cgit v1.2.3 From 395ed1ef0012e1bb1e4050e84ba0173b3623112a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 3 Oct 2025 09:46:37 -0400 Subject: iomap: optional zero range dirty folio processing The only way zero range can currently process unwritten mappings with dirty pagecache is to check whether the range is dirty before mapping lookup and then flush when at least one underlying mapping is unwritten. This ordering is required to prevent iomap lookup from racing with folio writeback and reclaim. Since zero range can skip ranges of unwritten mappings that are clean in cache, this operation can be improved by allowing the filesystem to provide a set of dirty folios that require zeroing. In turn, rather than flush or iterate file offsets, zero range can iterate on folios in the batch and advance over clean or uncached ranges in between. Add a folio_batch in struct iomap and provide a helper for filesystems to populate the batch at lookup time. Update the folio lookup path to return the next folio in the batch, if provided, and advance the iter if the folio starts beyond the current offset. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6d864b446b6e..65d123114883 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,6 +9,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -242,6 +243,7 @@ struct iomap_iter { unsigned flags; struct iomap iomap; struct iomap srcmap; + struct folio_batch *fbatch; void *private; }; @@ -350,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); +loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, + loff_t length); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -- cgit v1.2.3 From 001397f5ef4908ea46a63059439e8c3bf3552d9f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 31 Oct 2025 14:10:26 +0100 Subject: iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag Btrfs requires all of its bios to be fs block aligned, normally it's totally fine but with the incoming block size larger than page size (bs > ps) support, the requirement is no longer met for direct IOs. Because iomap_dio_bio_iter() calls bio_iov_iter_get_pages(), only requiring alignment to be bdev_logical_block_size(). In the real world that value is either 512 or 4K, on 4K page sized systems it means bio_iov_iter_get_pages() can break the bio at any page boundary, breaking btrfs' requirement for bs > ps cases. To address this problem, introduce a new public iomap dio flag, IOMAP_DIO_FSBLOCK_ALIGNED. When calling __iomap_dio_rw() with that new flag, iomap_dio::flags will inherit that new flag, and iomap_dio_bio_iter() will take fs block size into the calculation of the alignment, and pass the alignment to bio_iov_iter_get_pages(), respecting the fs block size requirement. The initial user of this flag will be btrfs, which needs to calculate the checksum for direct read and thus requires the biovec to be fs block aligned for the incoming bs > ps support. Signed-off-by: Qu Wenruo Reviewed-by: Pankaj Raghav [hch: also align pos/len, incorporate the trace flags from Darrick] Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251031131045.1613229-2-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 65d123114883..8b1ac08c7474 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -553,6 +553,14 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Ensure each bio is aligned to fs block size. + * + * For filesystems which need to calculate/verify the checksum of each fs + * block. Otherwise they may not be able to handle unaligned bios. + */ +#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3 From 7e6cea5ae2f5e62112fce69acc07ee8b694b6dd0 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:52 -0800 Subject: docs: document iomap writeback's iomap_finish_folio_write() requirement Document that iomap_finish_folio_write() must be called after writeback on the range completes. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b1ac08c7474..a5032e456079 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -435,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, -- cgit v1.2.3 From 6b1fd2281fb0873ec56f8791d4e4898302070804 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:53 -0800 Subject: iomap: optimize pending async writeback accounting Pending writebacks must be accounted for to determine when all requests have completed and writeback on the folio should be ended. Currently this is done by atomically incrementing ifs->write_bytes_pending for every range to be written back. Instead, the number of atomic operations can be minimized by setting ifs->write_bytes_pending to the folio size, internally tracking how many bytes are written back asynchronously, and then after sending off all the requests, decrementing ifs->write_bytes_pending by the number of bytes not written back asynchronously. Now, for N ranges written back, only N + 2 atomic operations are required instead of 2N + 2. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a5032e456079..b49e47f069db 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -478,8 +478,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); -- cgit v1.2.3 From f8eaf79406fe9415db0e7a5c175b50cb01265199 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:54 -0800 Subject: iomap: simplify ->read_folio_range() error handling for reads Instead of requiring that the caller calls iomap_finish_folio_read() even if the ->read_folio_range() callback returns an error, account for this internally in iomap instead, which makes the interface simpler and makes it match writeback's ->read_folio_range() error handling expectations. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b49e47f069db..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -495,9 +495,8 @@ struct iomap_read_ops { /* * Read in a folio range. * - * The caller is responsible for calling iomap_finish_folio_read() after - * reading in the folio range. This should be done even if an error is - * encountered during the read. + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. * * Returns 0 on success or a negative error on failure. */ -- cgit v1.2.3 From f9f85149994dbb9db43202ae8fabf68940c0ac0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 18:06:26 +0100 Subject: fs, iomap: remove IOCB_DIO_CALLER_COMP This was added by commit 099ada2c8726 ("io_uring/rw: add write support for IOCB_DIO_CALLER_COMP") and disabled a little later by commit 838b35bb6a89 ("io_uring/rw: disable IOCB_DIO_CALLER_COMP") because it didn't work. Remove all the related code that sat unused for 2 years. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113170633.1453259-2-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 43 +++++++++---------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..e210d2d8af53 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,23 +367,9 @@ struct readahead_control; #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) -/* - * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the - * iocb completion can be passed back to the owner for execution from a safe - * context rather than needing to be punted through a workqueue. If this - * flag is set, the bio completion handling may set iocb->dio_complete to a - * handler function and iocb->private to context information for that handler. - * The issuer should call the handler with that context information from task - * context to complete the processing of the iocb. Note that while this - * provides a task context for the dio_complete() callback, it should only be - * used on the completion side for non-IO generating completions. It's fine to - * call blocking functions from this callback, but they should not wait for - * unrelated IO (like cache flushing, new IO generation, etc). - */ -#define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ -#define IOCB_AIO_RW (1 << 23) -#define IOCB_HAS_METADATA (1 << 24) +#define IOCB_AIO_RW (1 << 22) +#define IOCB_HAS_METADATA (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ @@ -400,7 +386,6 @@ struct readahead_control; { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } @@ -412,23 +397,13 @@ struct kiocb { int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; - union { - /* - * Only used for async buffered reads, where it denotes the - * page waitqueue associated with completing the read. Valid - * IFF IOCB_WAITQ is set. - */ - struct wait_page_queue *ki_waitq; - /* - * Can be used for O_DIRECT IO, where the completion handling - * is punted back to the issuer of the IO. May only be set - * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer - * must then check for presence of this handler when ki_complete - * is invoked. The data passed in to this handler must be - * assigned to ->private when dio_complete is assigned. - */ - ssize_t (*dio_complete)(void *data); - }; + + /* + * Only used for async buffered reads, where it denotes the page + * waitqueue associated with completing the read. + * Valid IFF IOCB_WAITQ is set. + */ + struct wait_page_queue *ki_waitq; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) -- cgit v1.2.3