summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 18:14:52 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 18:14:52 -0800
commit4adc13ed7c281c16152a700e47b65d17de07321a (patch)
tree5cadc2218d2e6be035076b9456d5784ef090e54c /include
parent0c00ed308d0559fc216be0442a3df124e9e13533 (diff)
parent3373503df025ab6c9a8ad2ce6b7febd2eb3c99dc (diff)
Merge tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull bounce buffer dio for stable pages from Jens Axboe: "This adds support for bounce buffering of dio for stable pages. This was all done by Christoph. In his words: This series tries to address the problem that under I/O pages can be modified during direct I/O, even when the device or file system require stable pages during I/O to calculate checksums, parity or data operations. It does so by adding block layer helpers to bounce buffer an iov_iter into a bio, then wires that up in iomap and ultimately XFS. The reason that the file system even needs to know about it, is because reads need a user context to copy the data back, and the infrastructure to defer ioends to a workqueue currently sits in XFS. I'm going to look into moving that into ioend and enabling it for other file systems. Additionally btrfs already has it's own infrastructure for this, and actually an urgent need to bounce buffer, so this should be useful there and could be wire up easily. In fact the idea comes from patches by Qu that did this in btrfs. This patch fixes all but one xfstests failures on T10 PI capable devices (generic/095 seems to have issues with a mix of mmap and splice still, I'm looking into that separately), and make qemu VMs running Windows, or Linux with swap enabled fine on an XFS file on a device using PI. Performance numbers on my (not exactly state of the art) NVMe PI test setup: Sequential reads using io_uring, QD=16. Bandwidth and CPU usage (usr/sys): | size | zero copy | bounce | +------+--------------------------+--------------------------+ | 4k | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) | | 64K | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) | | 1M | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) | +------+--------------------------+--------------------------+ Sequential writes using io_uring, QD=16. Bandwidth and CPU usage (usr/sys): | size | zero copy | bounce | +------+--------------------------+--------------------------+ | 4k | 882MiB/s (11.83/33.88%) | 750MiB/s (10.53/34.08%) | | 64K | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) | | 1M | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) | +------+--------------------------+--------------------------+ Note that the 64k read numbers look really odd to me for the baseline zero copy case, but are reproducible over many repeated runs. The bounce read numbers should further improve when moving the PI validation to the file system and removing the double context switch, which I have patches for that will sent out soon" * tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: xfs: use bounce buffering direct I/O when the device requires stable pages iomap: add a flag to bounce buffer direct I/O iomap: support ioends for direct reads iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED iomap: free the bio before completing the dio iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct iomap: split out the per-bio logic from iomap_dio_bio_iter iomap: simplify iomap_dio_bio_iter iomap: fix submission side handling of completion side errors block: add helpers to bounce buffer an iov_iter into bios block: remove bio_release_page iov_iter: extract a iov_iter_extract_bvecs helper from bio code block: open code bio_add_page and fix handling of mismatching P2P ranges block: refactor get_contig_folio_len block: add a BIO_MAX_SIZE constant and use it
Diffstat (limited to 'include')
-rw-r--r--include/linux/bio.h26
-rw-r--r--include/linux/blk_types.h3
-rw-r--r--include/linux/iomap.h9
-rw-r--r--include/linux/uio.h3
4 files changed, 40 insertions, 1 deletions
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7a2f3fc5be57..36a3f2275ecd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -397,6 +397,29 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
return iov_iter_npages(iter, max_segs);
}
+/**
+ * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio
+ * @iter: iter to bounce from
+ * @op: REQ_OP_* for the bio
+ *
+ * Calculates how many bvecs are needed for the next bio to bounce from/to
+ * @iter.
+ */
+static inline unsigned short
+bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op)
+{
+ /*
+ * We still need to bounce bvec iters, so don't special case them
+ * here unlike in bio_iov_vecs_to_alloc.
+ *
+ * For reads we need to use a vector for the bounce buffer, account
+ * for that here.
+ */
+ if (op_is_write(op))
+ return iov_iter_npages(iter, BIO_MAX_VECS);
+ return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1;
+}
+
struct request_queue;
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
@@ -451,6 +474,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter);
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);
+
extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 19a888a2f104..d59553324a84 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -281,7 +281,8 @@ struct bio {
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
-#define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT)
+#define BIO_MAX_SIZE UINT_MAX /* max value of bi_iter.bi_size */
+#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> SECTOR_SHIFT)
static inline struct bio_vec *bio_inline_vecs(struct bio *bio)
{
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 83cdeff3d41c..99b7209dabd7 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -566,6 +566,15 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3)
+/*
+ * Bounce buffer instead of using zero copy access.
+ *
+ * This is needed if the device needs stable data to checksum or generate
+ * parity. The file system must hook into the I/O submission and offload
+ * completions to user context for reads when this is set.
+ */
+#define IOMAP_DIO_BOUNCE (1 << 4)
+
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 5b127043a151..a9bc5b3067e3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -389,6 +389,9 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
size_t maxsize, unsigned int maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0);
+ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
+ size_t max_size, unsigned short *nr_vecs,
+ unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
/**
* iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained