From c50105933f0c75aacc4f95c9bf36f7fbd9a83884 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:39:59 +0100 Subject: iomap: allow the file system to submit the writeback bios Change ->prepare_ioend to ->submit_ioend and require file systems that implement it to submit the bio. This is needed for file systems that do their own work on the bios before submitting them to the block layer like btrfs or zoned xfs. To make this easier also pass the writeback context to the method. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..dc8df4f779d4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -362,12 +362,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. 
*/ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where -- cgit v1.2.3 From 710273330663241d9ca5fbed51909e65807556ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:00 +0100 Subject: iomap: simplify io_flags and io_type in struct iomap_ioend The ioend fields for distinct types of I/O are a bit complicated. Consolidate them into a single io_flags field with its own flags decoupled from the iomap flags. This also prepares for adding a new flag that is unrelated to both of the iomap namespaces. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-3-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index dc8df4f779d4..9583f6456165 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset, sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); +/* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + /* * Structure for writeback I/O completions. 
*/ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ -- cgit v1.2.3 From 034c29fb3e7c119c42e650986e280f025a1bec7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:01 +0100 Subject: iomap: add a IOMAP_F_ANON_WRITE flag Add a IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not have a target block assigned to it yet at iomap time and the file system will do that in the bio submission handler, splitting the I/O as needed. This is used to implement Zone Append based I/O for zoned XFS, where splitting writes to the hardware limits and assigning a zone to them happens just before sending the I/O off to the block layer, but could also be useful for other things like compressed I/O. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-4-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9583f6456165..eb0764945b42 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. 
*/ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } -- cgit v1.2.3 From 5fcbd555d48390a8c819ba7fdf55fbfcabe05c80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:02 +0100 Subject: iomap: split bios to zone append limits in the submission handlers Provide helpers for file systems to split bios in the direct I/O and writeback I/O submission handlers. The split ioends are chained to the parent ioend so that only the parent ioend originally generated by the iomap layer will be processed after all the chained off children have completed. This is based on the block layer bio chaining that has supported a similar mechanism for a long time. This follows btrfs' lead and doesn't try to build bios to hardware limits for zone append commands, but instead builds them as normal unconstrained bios and splits them to the hardware limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-5-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eb0764945b42..90c27875e39d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -353,12 +353,19 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, /* * Structure for writeback I/O completions. 
+ * + * File systems implementing ->submit_ioend can split a bio generated + * by iomap. In that case the parent ioend it was split from is recorded + * in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ @@ -408,6 +415,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -479,4 +490,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From e523f2d4c974a819730830ce1c38834ee0cd7318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:05 +0100 Subject: iomap: optionally use ioends for direct I/O struct iomap_ioend currently tracks outstanding buffered writes and has some really nice code in core iomap and XFS to merge contiguous I/Os and defer them to userspace for completion in a very efficient way. 
For zoned writes we'll also need a per-bio user context completion to record the written blocks, and the infrastructure for that would look basically like the ioend handling for buffered I/O. So instead of reinventing the wheel, reuse the existing infrastructure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-8-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 90c27875e39d..5768b9f2a1cc 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -343,20 +343,22 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_UNWRITTEN (1U << 1) /* don't merge into previous ioend */ #define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) /* * Structure for writeback I/O completions. * - * File systems implementing ->submit_ioend can split a bio generated - * by iomap. In that case the parent ioend it was split from is recorded - * in ioend->io_parent. + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * (for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. 
*/ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ -- cgit v1.2.3 From d06244c60aec1d5d1589efe6b611a5b91a49465c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:06 +0100 Subject: iomap: add a io_private field to struct iomap_ioend Add a private data field to struct iomap_ioend so that the file system can attach information to it. Zoned XFS will use this for a pointer to the open zone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-9-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5768b9f2a1cc..b4be07e8ec94 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -370,6 +370,7 @@ struct iomap_ioend { struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; -- cgit v1.2.3 From 02b39c4655d52141e07e80e9b2772d96daf67ff6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:07 +0100 Subject: iomap: pass private data to iomap_page_mkwrite Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-10-hch@lst.de Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b4be07e8ec94..d528eb4d5cfe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -316,9 +316,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, -- cgit v1.2.3 From c6d1b8d15450cf061648d4e36d622da9d755654a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:08 +0100 Subject: iomap: pass private data to iomap_zero_range Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-11-hch@lst.de Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d528eb4d5cfe..eddf524ac749 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,7 +313,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, -- cgit v1.2.3 From ddd402bbbf669c4ada106fd2e4c799e2b5745e3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:09 +0100 Subject: iomap: pass private data to iomap_truncate_page Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-12-hch@lst.de Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eddf524ac749..022d7f338c68 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -315,7 +315,7 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, -- cgit v1.2.3 From abb0ea1923a68ec8f45a7615ebad1fc87ea06da6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:44 -0500 Subject: iomap: factor out iomap length helper In preparation to support more granular iomap iter advancing, factor the pos/len values as parameters to length calculation. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-2-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 022d7f338c68..feb748eb6294 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -238,18 +238,33 @@ struct iomap_iter { int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. 
+ * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); } /** -- cgit v1.2.3 From b51d30ff51f9c325b65c8cd66ff6590530b14041 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:49 -0500 Subject: iomap: export iomap_iter_advance() and return remaining length As a final step for generic iter advance, export the helper and update it to return the remaining length of the current iteration after the advance. This will usually be 0 in the iomap_iter() case, but will be useful for the various operations that iterate on their own and will be updated to advance as they progress. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-7-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index feb748eb6294..eed06ffdcfbd 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -236,6 +236,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** * iomap_length_trim - trimmed length of the current iomap iteration -- cgit v1.2.3 From bc264fea0f6f230e56f876cc4266b1982d20f35d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:50 -0500 Subject: iomap: support incremental iomap_iter advances The current iomap_iter iteration model reads the mapping from the filesystem, processes the subrange of the operation associated with the current mapping, and returns the number of bytes processed back to the iteration code. The latter advances the position and remaining length of the iter in preparation for the next iteration. At the _iter() handler level, this tends to produce a processing loop where the local code pulls the current position and remaining length out of the iter, iterates it locally based on file offset, and then breaks out when the associated range has been fully processed. This works well enough for current handlers, but upcoming enhancements require a bit more flexibility in certain situations. Enhancements for zero range will lead to a situation where the processing loop is no longer a pure ascending offset walk, but rather dictated by pagecache state and folio lookup. Since folio lookup and write preparation occur at different levels, it is more difficult to manage position and length outside of the iter. To provide more flexibility to certain iomap operations, introduce support for incremental iomap_iter advances from within the operation itself. 
This allows more granular advances for operations that might not use the typical file offset based walk. Note that the semantics for operations that use incremental advances is slightly different than traditional operations. Operations that advance the iter directly are expected to return success or failure (i.e. 0 or negative error code) in iter.processed rather than the number of bytes processed. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-8-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eed06ffdcfbd..e180dacf434c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -218,8 +218,11 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @processed: The number of bytes the most recent iteration needs iomap_iter() + * to advance the iter, zero if the iter was already advanced, or a + * negative errno for an error during the operation. * @flags: Zero or more of the iomap_begin flags above. 
* @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -228,6 +231,7 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; + loff_t iter_start_pos; s64 processed; unsigned flags; struct iomap iomap; -- cgit v1.2.3 From edd3e3b7d210747dec723edd2b6cb49d140c1256 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Feb 2025 09:47:56 -0500 Subject: iomap: rename iomap_iter processed field to status The iter.processed field name is no longer appropriate now that iomap operations do not return the number of bytes processed. Rename the field to iter.status to reflect that a success or error code is expected. Also change the type to int as there is no longer a need for an s64. This reduces the size of iomap_iter by 8 bytes due to a combination of smaller type and reduction in structure padding. While here, fix up the return types of various _iter() helpers to reflect the type change. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250224144757.237706-12-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e180dacf434c..af9e51fba5f0 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -220,9 +220,8 @@ struct iomap_ops { * It is updated at the same time as @pos. * @iter_start_pos: The original start pos for the current iomap. Used for * incremental iter advance. - * @processed: The number of bytes the most recent iteration needs iomap_iter() - * to advance the iter, zero if the iter was already advanced, or a - * negative errno for an error during the operation. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. 
* @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -232,7 +231,7 @@ struct iomap_iter { loff_t pos; u64 len; loff_t iter_start_pos; - s64 processed; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; -- cgit v1.2.3 From d79c9cc512973ef6583c3bfc0b343f9d312d85b3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Feb 2025 09:47:57 -0500 Subject: iomap: introduce a full map advance helper Various iomap_iter_advance() calls advance by the full mapping length and thus have no need for the current length input or post-advance remaining length output from the standard advance function. Add an iomap_iter_advance_full() helper to clean up these cases. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250224144757.237706-13-bfoster@redhat.com Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index af9e51fba5f0..1fd66bc29cc1 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -271,6 +271,16 @@ static inline u64 iomap_length(const struct iomap_iter *iter) return iomap_length_trim(iter, iter->pos, iter->len); } +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); +} + /** * iomap_iter_srcmap - return the source map for the current iomap iteration * @i: iteration structure -- cgit v1.2.3 From b2cd5ae693a3dc5b70a0f75fba96452c591a2047 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:39:59 -0700 Subject: iomap: make buffered writes work with RWF_DONTCACHE Add iomap buffered write support for RWF_DONTCACHE. If RWF_DONTCACHE is set for a write, mark the folios being written as uncached. 
Then writeback completion will drop the pages. The write_iter handler simply kicks off writeback for the pages, and writeback completion will take care of the rest. Signed-off-by: "Darrick J. Wong" Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-2-axboe@kernel.dk Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..26b0dbe23e62 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -183,6 +183,7 @@ struct iomap_folio_ops { #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ #define IOMAP_ATOMIC (1 << 9) +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* -- cgit v1.2.3 From b4de0e9be963b95c46c4a5426e94059923d236d6 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:10 +0000 Subject: iomap: Rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW In future xfs will support a SW-based atomic write, so rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW to be clear which mode is being used. Also relocate setting of IOMAP_ATOMIC_HW to the write path in __iomap_dio_rw(), to be clear that this flag is only relevant to writes. Reviewed-by: "Darrick J. 
Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-3-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index ea29388b2fba..87cd7079aaf3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,7 +189,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) #define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { -- cgit v1.2.3 From 794ca29dcc924cd3f16d12b6fba61074c992b8fd Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:13 +0000 Subject: iomap: Support SW-based atomic writes Currently atomic write support requires dedicated HW support. This imposes a restriction on the filesystem that disk blocks need to be aligned and contiguously mapped to FS blocks to issue atomic writes. XFS has no method to guarantee FS block alignment for regular, non-RT files. As such, atomic writes are currently limited to 1x FS block there. To deal with the scenario that we are issuing an atomic write over misaligned or discontiguous data blocks - and raise the atomic write size limit - support a SW-based (software-emulated) atomic write mode. For XFS, these SW-based atomic writes would use CoW support to issue emulated untorn writes. It is the responsibility of the FS to detect discontiguous atomic writes and switch to IOMAP_DIO_ATOMIC_SW mode and retry the write. Indeed, SW-based atomic writes could be used always when the mounted bdev does not support HW offload, but this strategy is not initially expected to be used. Reviewed-by: "Darrick J. 
Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-6-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 87cd7079aaf3..9cd93530013c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,8 +189,9 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC_HW (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */ #define IOMAP_DONTCACHE (1 << 10) +#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */ struct iomap_ops { /* @@ -502,6 +503,11 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Use software-based torn-write protection. + */ +#define IOMAP_DIO_ATOMIC_SW (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3 From 370a6de7651b9745b997c32f90685f9e100ccfcd Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Mar 2025 12:02:50 +0000 Subject: iomap: rework IOMAP atomic flags Flag IOMAP_ATOMIC_SW is not really required. The idea of having this flag is that the FS ->iomap_begin callback could check if this flag is set to decide whether to do a SW (FS-based) atomic write. But the FS can set which ->iomap_begin callback it wants when deciding to do a FS-based atomic write. Furthermore, it was thought that IOMAP_ATOMIC_HW is not a proper name, as the block driver can use SW-methods to emulate an atomic write. So change back to IOMAP_ATOMIC. The ->iomap_begin callback needs though to indicate to iomap core that REQ_ATOMIC needs to be set, so add IOMAP_F_ATOMIC_BIO for that. These changes were suggested by Christoph Hellwig and Dave Chinner. 
Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250320120250.4087011-4-john.g.garry@oracle.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9cd93530013c..02fe001feebb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -60,6 +60,9 @@ struct vm_fault; * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block * assigned to it yet and the file system will do that in the bio submission * handler, splitting the I/O as needed. + * + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic + * bio, i.e. set REQ_ATOMIC. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -73,6 +76,7 @@ struct vm_fault; #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) #define IOMAP_F_ANON_WRITE (1U << 7) +#define IOMAP_F_ATOMIC_BIO (1U << 8) /* * Flags set by the core iomap code during operations: @@ -189,9 +193,8 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */ +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */ #define IOMAP_DONTCACHE (1 << 10) -#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */ struct iomap_ops { /* @@ -503,11 +506,6 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) -/* - * Use software-based torn-write protection. - */ -#define IOMAP_DIO_ATOMIC_SW (1 << 3) - ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3