summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 08:14:00 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 08:14:00 -0800
commit1885cdbfbb51ede3637166c895d0b8040c9899cc (patch)
tree257c3f32cff62bff6a3acfe8c76d39981cc52faa /include
parent7d0a66e4bb9081d75c82ec4957c50034cb0ea449 (diff)
parent7fd8720dff2d9c70cf5a1a13b7513af01952ec02 (diff)
Merge tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull iomap updates from Christian Brauner: "FUSE iomap Support for Buffered Reads: This adds iomap support for FUSE buffered reads and readahead. This enables granular uptodate tracking with large folios so only non-uptodate portions need to be read. Also fixes a race condition with large folios + writeback cache that could cause data corruption on partial writes followed by reads. - Refactored iomap read/readahead bio logic into helpers - Added caller-provided callbacks for read operations - Moved buffered IO bio logic into new file - FUSE now uses iomap for read_folio and readahead Zero Range Folio Batch Support: Add folio batch support for iomap_zero_range() to handle dirty folios over unwritten mappings. Fix raciness issues where dirty data could be lost during zero range operations. - filemap_get_folios_tag_range() helper for dirty folio lookup - Optional zero range dirty folio processing - XFS fills dirty folios on zero range of unwritten mappings - Removed old partial EOF zeroing optimization DIO Write Completions from Interrupt Context: Restore pre-iomap behavior where pure overwrite completions run inline rather than being deferred to workqueue. Reduces context switches for high-performance workloads like ScyllaDB. - Removed unused IOCB_DIO_CALLER_COMP code - Error completions always run in user context (fixes zonefs) - Reworked REQ_FUA selection logic - Inverted IOMAP_DIO_INLINE_COMP to IOMAP_DIO_OFFLOAD_COMP Buffered IO Cleanups: Some performance and code clarity improvements: - Replace manual bitmap scanning with find_next_bit() - Simplify read skip logic for writes - Optimize pending async writeback accounting - Better variable naming - Documentation for iomap_finish_folio_write() requirements Misaligned Vectors for Zoned XFS: Enables sub-block aligned vectors in XFS always-COW mode for zoned devices via new IOMAP_DIO_FSBLOCK_ALIGNED flag. Bug Fixes: - Allocate s_dio_done_wq for async reads (fixes syzbot report after error completion changes) - Fix iomap_read_end() for already uptodate folios (regression fix)" * tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (40 commits) iomap: allocate s_dio_done_wq for async reads as well iomap: fix iomap_read_end() for already uptodate folios iomap: invert the polarity of IOMAP_DIO_INLINE_COMP iomap: support write completions from interrupt context iomap: rework REQ_FUA selection iomap: always run error completions in user context fs, iomap: remove IOCB_DIO_CALLER_COMP iomap: use find_next_bit() for uptodate bitmap scanning iomap: use find_next_bit() for dirty bitmap scanning iomap: simplify when reads can be skipped for writes iomap: simplify ->read_folio_range() error handling for reads iomap: optimize pending async writeback accounting docs: document iomap writeback's iomap_finish_folio_write() requirement iomap: account for unaligned end offsets when truncating read range iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted xfs: support sub-block aligned vectors in always COW mode iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag xfs: error tag to force zeroing on debug kernels iomap: remove old partial eof zeroing optimization xfs: fill dirty folios on zero range of unwritten mappings ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/fs.h43
-rw-r--r--include/linux/iomap.h86
-rw-r--r--include/linux/pagemap.h2
3 files changed, 89 insertions, 42 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd3b57cfadee..50751e95c3f3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -367,23 +367,9 @@ struct readahead_control;
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
-/*
- * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
- * iocb completion can be passed back to the owner for execution from a safe
- * context rather than needing to be punted through a workqueue. If this
- * flag is set, the bio completion handling may set iocb->dio_complete to a
- * handler function and iocb->private to context information for that handler.
- * The issuer should call the handler with that context information from task
- * context to complete the processing of the iocb. Note that while this
- * provides a task context for the dio_complete() callback, it should only be
- * used on the completion side for non-IO generating completions. It's fine to
- * call blocking functions from this callback, but they should not wait for
- * unrelated IO (like cache flushing, new IO generation, etc).
- */
-#define IOCB_DIO_CALLER_COMP (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
-#define IOCB_AIO_RW (1 << 23)
-#define IOCB_HAS_METADATA (1 << 24)
+#define IOCB_AIO_RW (1 << 22)
+#define IOCB_HAS_METADATA (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
@@ -400,7 +386,6 @@ struct readahead_control;
{ IOCB_WAITQ, "WAITQ" }, \
{ IOCB_NOIO, "NOIO" }, \
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
- { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \
{ IOCB_AIO_RW, "AIO_RW" }, \
{ IOCB_HAS_METADATA, "AIO_HAS_METADATA" }
@@ -412,23 +397,13 @@ struct kiocb {
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
u8 ki_write_stream;
- union {
- /*
- * Only used for async buffered reads, where it denotes the
- * page waitqueue associated with completing the read. Valid
- * IFF IOCB_WAITQ is set.
- */
- struct wait_page_queue *ki_waitq;
- /*
- * Can be used for O_DIRECT IO, where the completion handling
- * is punted back to the issuer of the IO. May only be set
- * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
- * must then check for presence of this handler when ki_complete
- * is invoked. The data passed in to this handler must be
- * assigned to ->private when dio_complete is assigned.
- */
- ssize_t (*dio_complete)(void *data);
- };
+
+ /*
+ * Only used for async buffered reads, where it denotes the page
+ * waitqueue associated with completing the read.
+ * Valid IFF IOCB_WAITQ is set.
+ */
+ struct wait_page_queue *ki_waitq;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 73dceabc21c8..520e967cb501 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
struct address_space;
struct fiemap_extent_info;
@@ -16,6 +17,7 @@ struct inode;
struct iomap_iter;
struct iomap_dio;
struct iomap_writepage_ctx;
+struct iomap_read_folio_ctx;
struct iov_iter;
struct kiocb;
struct page;
@@ -241,11 +243,12 @@ struct iomap_iter {
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
+ struct folio_batch *fbatch;
void *private;
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
+int iomap_iter_advance(struct iomap_iter *iter, u64 count);
/**
* iomap_length_trim - trimmed length of the current iomap iteration
@@ -282,9 +285,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter)
*/
static inline int iomap_iter_advance_full(struct iomap_iter *iter)
{
- u64 length = iomap_length(iter);
-
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, iomap_length(iter));
}
/**
@@ -339,8 +340,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
-void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+void iomap_read_folio(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx);
+void iomap_readahead(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
@@ -349,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops);
+loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset,
+ loff_t length);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
@@ -430,6 +435,10 @@ struct iomap_writeback_ops {
* An existing mapping from a previous call to this method can be reused
* by the file system if it is still valid.
*
+ * If this succeeds, iomap_finish_folio_write() must be called once
+ * writeback completes for the range, regardless of whether the
+ * writeback succeeded or failed.
+ *
* Returns the number of bytes processed or a negative errno.
*/
ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
@@ -467,14 +476,41 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
loff_t pos, loff_t end_pos, unsigned int dirty_len);
int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len);
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+ int error);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len);
int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio);
int iomap_writepages(struct iomap_writepage_ctx *wpc);
+struct iomap_read_folio_ctx {
+ const struct iomap_read_ops *ops;
+ struct folio *cur_folio;
+ struct readahead_control *rac;
+ void *read_ctx;
+};
+
+struct iomap_read_ops {
+ /*
+ * Read in a folio range.
+ *
+ * If this succeeds, iomap_finish_folio_read() must be called after the
+ * range is read in, regardless of whether the read succeeded or failed.
+ *
+ * Returns 0 on success or a negative error on failure.
+ */
+ int (*read_folio_range)(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t len);
+
+ /*
+ * Submit any pending read requests.
+ *
+ * This is optional.
+ */
+ void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+};
+
/*
* Flags for direct I/O ->end_io:
*/
@@ -518,6 +554,14 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
+/*
+ * Ensure each bio is aligned to fs block size.
+ *
+ * For filesystems which need to calculate/verify the checksum of each fs
+ * block. Otherwise they may not be able to handle unaligned bios.
+ */
+#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3)
+
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
@@ -540,4 +584,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
extern struct bio_set iomap_ioend_bioset;
+#ifdef CONFIG_BLOCK
+extern const struct iomap_read_ops iomap_bio_read_ops;
+
+static inline void iomap_bio_read_folio(struct folio *folio,
+ const struct iomap_ops *ops)
+{
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &iomap_bio_read_ops,
+ .cur_folio = folio,
+ };
+
+ iomap_read_folio(ops, &ctx);
+}
+
+static inline void iomap_bio_readahead(struct readahead_control *rac,
+ const struct iomap_ops *ops)
+{
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &iomap_bio_read_ops,
+ .rac = rac,
+ };
+
+ iomap_readahead(ops, &ctx);
+}
+#endif /* CONFIG_BLOCK */
+
#endif /* LINUX_IOMAP_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..7274a86b4871 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
+unsigned filemap_get_folios_dirty(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);