Merge tag 'for-7.2/block-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull block updates from Jens Axboe: - NVMe pull request via Keith: - Per-controller admin and IO timeout sysfs attributes, and letting the block layer set request timeouts (Maurizio, Maximilian) - Multipath passthrough iostats, and PCI P2PDMA enablement for multipath devices (Keith, Kiran) - A new diag sysfs attribute group exporting per-controller counters (retries, multipath failover, error counters, requeue and failure counts, reset and reconnect events) (Nilay) - FDP configuration validation and bounds check fixes (liuxixin) - Various nvmet fixes, including a pre-auth out-of-bounds read in the Discovery Get Log Page handler, auth payload bounds validation, and tcp error-path leak fixes (Bryam, Tianchu, Geliang) - nvme-tcp lockdep and workqueue fixes (Shin'ichiro, Kuniyuki, Eric) - Assorted other fixes and cleanups (John, Yao, Chao, Mateusz, Achkinazi, Wentao) - MD pull request via Yu Kuai: - raid1/raid10 fixes for a deadlock in the read error recovery path, error-path detection and bio accounting with cloned bios, and an nr_pending leak in the REQ_ATOMIC bad-block error path (Abd-Alrhman) - PCI P2PDMA propagation from member devices to the RAID device (Kiran) - dm-raid bio requeue fix, and various smaller fixes and cleanups (Benjamin, Chen, Li, Thorsten) - Enable Clang lock context analysis for the block layer, with the accompanying annotations across queue limits, the blk_holder_ops callbacks, crypto, cgroup, iocost, kyber and mq-deadline (Bart) - Block status code infrastructure work: a tagged status table, a str_to_blk_op() helper, a bio_endio_status() helper, and on top of that a new configurable block-layer error injection facility (Christoph) - DRBD netlink rework, replacing the genl_magic machinery with explicit netlink serialization and moving the DRBD UAPI headers to include/uapi/linux/ (Christoph Böhmwalder) - bvec improvements: a bvec_folio() helper and making the bvec_iter helpers proper inline functions (Willy, Christoph) - ublk cleanups and a canceling-flag fix for the disk-not-allocated case (Caleb, Ming) - Partition handling fixes: bound the AIX pp_count scan, fix an of_node refcount leak, and replace __get_free_page() with kmalloc() (Bryam, Wentao, Mike) - Convert numa_node to int in blk_mq_hw_ctx and ->init_request, and add WQ_PERCPU to the block workqueue users (Mateusz, Marco) - Block statistics and tracing: propagate in-flight to the whole disk on partition IO, export passthrough stats, and a new block_rq_tag_wait tracepoint (Tang, Keith, Aaron) - A round of removals, unexports and cleanups across bio, direct-io and the bvec helpers (Christoph) - Various driver fixes (mtip32xx use-after-free, rbd snap_count validation and strscpy conversion, nbd socket lockdep reclassify, virtio-blk zone report clamp, floppy) and a batch of MAINTAINERS email/list updates (Coly, Li, Yu, Christoph Böhmwalder) - Other little fixes and cleanups all over * tag 'for-7.2/block-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (117 commits) MAINTAINERS: Update Coly Li's email address block: check bio split for unaligned bvec nbd: Reclassify sockets to avoid lockdep circular dependency block: add configurable error injection block: add a str_to_blk_op helper block: add a "tag" for block status codes block: add a macro to initialize the status table floppy: Drop unused pnp driver data block: propagate in_flight to whole disk on partition I/O virtio-blk: clamp zone report to the report buffer capacity block: optimize I/O merge hot path with unlikely() hints drivers/block/rbd: Use strscpy() to copy strings into arrays partitions: aix: bound the pp_count scan to the ppe array block: Enable lock context analysis block/mq-deadline: Make the lock context annotations compatible with Clang block/Kyber: Make the lock context annotations compatible with Clang block/blk-mq-debugfs: Improve lock context annotations block/blk-iocost: Inline iocg_lock() and iocg_unlock() block/blk-iocost: Split ioc_rqos_throttle() block/crypto: Annotate the crypto functions ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 13:02:47 +0530
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 13:02:47 +0530
commit: ba9c792c824fff732df85119011d399d9b6d9155 (patch)
tree: a6de1eda726e4e9156366b6fd960b81d09297aa3 /block
parent: 9b40ba14edcdf70240af8114092a76f75f070774 (diff)
parent: c7c76f9232bd34835d821f14abdc5fafc17bc938 (diff)
33 files changed, 884 insertions, 429 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 15027963472d..70e4a66d941f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -221,6 +221,14 @@ config BLOCK_HOLDER_DEPRECATED
 config BLK_MQ_STACKING
 	bool
 
+config BLK_ERROR_INJECTION
+	bool "Enable block layer error injection"
+	select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+	help
+	  Enable inserting arbitrary block errors through a debugfs interface.
+
+	  See Documentation/block/error-injection.rst for details.
+
 source "block/Kconfig.iosched"
 
 endif # BLOCK
diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..e7bd320e3d69 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,6 +3,8 @@
 # Makefile for the kernel block layer
 #
 
+CONTEXT_ANALYSIS := y
+
 obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
@@ -11,6 +13,7 @@ obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
 			disk-events.o blk-ia-ranges.o early-lookup.o
 
+obj-$(CONFIG_BLK_ERROR_INJECTION) += error-injection.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
diff --git a/block/bdev.c b/block/bdev.c
index bb0ffa3bb4df..85ce57bd2ae4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -446,15 +446,10 @@ EXPORT_SYMBOL_GPL(blockdev_superblock);
 
 void __init bdev_cache_init(void)
 {
-	int err;
-
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 				SLAB_ACCOUNT|SLAB_PANIC),
 			init_once);
-	err = register_filesystem(&bd_type);
-	if (err)
-		panic("Cannot register bdev pseudo-fs");
 	blockdev_mnt = kern_mount(&bd_type);
 	if (IS_ERR(blockdev_mnt))
 		panic("Cannot create bdev pseudo-fs");
@@ -1250,7 +1245,13 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise)
 		bdev->bd_holder_ops->mark_dead(bdev, surprise);
 	else {
 		mutex_unlock(&bdev->bd_holder_lock);
-		sync_blockdev(bdev);
+		/*
+		 * On surprise removal the device is already gone; syncing is
+		 * futile and can hang forever waiting on I/O that will never
+		 * complete.  Match fs_bdev_mark_dead(), which also skips it.
+		 */
+		if (!surprise)
+			sync_blockdev(bdev);
 	}
 
 	invalidate_bdev(bdev);
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index ac83b0668764..0bd0332b3d78 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -300,6 +300,25 @@ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
 	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
 }
 
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->bytes);
+	blkg_rwstat_exit(&stats->ios);
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	bfq_stat_exit(&stats->time);
+	bfq_stat_exit(&stats->avg_queue_size_sum);
+	bfq_stat_exit(&stats->avg_queue_size_samples);
+	bfq_stat_exit(&stats->dequeue);
+	bfq_stat_exit(&stats->group_wait_time);
+	bfq_stat_exit(&stats->idle_time);
+	bfq_stat_exit(&stats->empty_time);
+#endif
+}
+
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
 {
 	struct bfq_entity *group_entity = bfqq->entity.parent;
@@ -321,8 +340,10 @@ static void bfqg_get(struct bfq_group *bfqg)
 
 static void bfqg_put(struct bfq_group *bfqg)
 {
-	if (refcount_dec_and_test(&bfqg->ref))
+	if (refcount_dec_and_test(&bfqg->ref)) {
+		bfqg_stats_exit(&bfqg->stats);
 		kfree(bfqg);
+	}
 }
 
 static void bfqg_and_blkg_get(struct bfq_group *bfqg)
@@ -433,25 +454,6 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
 	entity->sched_data = &bfqg->sched_data;
 }
 
-static void bfqg_stats_exit(struct bfqg_stats *stats)
-{
-	blkg_rwstat_exit(&stats->bytes);
-	blkg_rwstat_exit(&stats->ios);
-#ifdef CONFIG_BFQ_CGROUP_DEBUG
-	blkg_rwstat_exit(&stats->merged);
-	blkg_rwstat_exit(&stats->service_time);
-	blkg_rwstat_exit(&stats->wait_time);
-	blkg_rwstat_exit(&stats->queued);
-	bfq_stat_exit(&stats->time);
-	bfq_stat_exit(&stats->avg_queue_size_sum);
-	bfq_stat_exit(&stats->avg_queue_size_samples);
-	bfq_stat_exit(&stats->dequeue);
-	bfq_stat_exit(&stats->group_wait_time);
-	bfq_stat_exit(&stats->idle_time);
-	bfq_stat_exit(&stats->empty_time);
-#endif
-}
-
 static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
 	if (blkg_rwstat_init(&stats->bytes, gfp) ||
@@ -552,7 +554,6 @@ static void bfq_pd_free(struct blkg_policy_data *pd)
 {
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
-	bfqg_stats_exit(&bfqg->stats);
 	bfqg_put(bfqg);
 }
 
@@ -1051,9 +1052,13 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
 
 	blkg_conf_init(&ctx, buf);
 
+	ret = blkg_conf_open_bdev(&ctx);
+	if (ret)
+		return ret;
+
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx);
 	if (ret)
-		goto out;
+		goto close_bdev;
 
 	if (sscanf(ctx.body, "%llu", &v) == 1) {
 		/* require "default" on dfl */
@@ -1074,8 +1079,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
 		bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
 		ret = 0;
 	}
+
 out:
-	blkg_conf_exit(&ctx);
+	blkg_conf_unprep(&ctx);
+close_bdev:
+	blkg_conf_close_bdev(&ctx);
 	return ret ?: nbytes;
 }
 
diff --git a/block/bio.c b/block/bio.c
index 5f10900b3f42..811a96796202 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -635,15 +635,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_kmalloc);
 
-void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
+void zero_fill_bio(struct bio *bio)
 {
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
-	__bio_for_each_segment(bv, bio, iter, start)
+	bio_for_each_segment(bv, bio, iter)
 		memzero_bvec(&bv);
 }
-EXPORT_SYMBOL(zero_fill_bio_iter);
+EXPORT_SYMBOL(zero_fill_bio);
 
 /**
  * bio_truncate - truncate the bio to small size of @new_size
@@ -1300,7 +1300,7 @@ static void bio_free_folios(struct bio *bio)
 	int i;
 
 	bio_for_each_bvec_all(bv, bio, i) {
-		struct folio *folio = page_folio(bv->bv_page);
+		struct folio *folio = bvec_folio(bv);
 
 		if (!is_zero_folio(folio))
 			folio_put(folio);
@@ -1409,7 +1409,7 @@ int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen,
 
 static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
 {
-	struct folio *folio = page_folio(bv->bv_page);
+	struct folio *folio = bvec_folio(bv);
 	size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
 			bv->bv_offset / PAGE_SIZE + 1;
 
@@ -1443,7 +1443,7 @@ static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
 			bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
 	}
 
-	folio_put(page_folio(bio->bi_io_vec[0].bv_page));
+	folio_put(bvec_folio(&bio->bi_io_vec[0]));
 }
 
 /**
@@ -1578,26 +1578,6 @@ void __bio_advance(struct bio *bio, unsigned bytes)
 }
 EXPORT_SYMBOL(__bio_advance);
 
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
-			struct bio *src, struct bvec_iter *src_iter)
-{
-	while (src_iter->bi_size && dst_iter->bi_size) {
-		struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
-		struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
-		unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
-		void *src_buf = bvec_kmap_local(&src_bv);
-		void *dst_buf = bvec_kmap_local(&dst_bv);
-
-		memcpy(dst_buf, src_buf, bytes);
-
-		kunmap_local(dst_buf);
-		kunmap_local(src_buf);
-
-		bio_advance_iter_single(src, src_iter, bytes);
-		bio_advance_iter_single(dst, dst_iter, bytes);
-	}
-}
-EXPORT_SYMBOL(bio_copy_data_iter);
 
 /**
  * bio_copy_data - copy contents of data buffers from one bio to another
@@ -1612,7 +1592,21 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 	struct bvec_iter src_iter = src->bi_iter;
 	struct bvec_iter dst_iter = dst->bi_iter;
 
-	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+	while (src_iter.bi_size && dst_iter.bi_size) {
+		struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
+		struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
+		unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+		void *src_buf = bvec_kmap_local(&src_bv);
+		void *dst_buf = bvec_kmap_local(&dst_bv);
+
+		memcpy(dst_buf, src_buf, bytes);
+
+		kunmap_local(dst_buf);
+		kunmap_local(src_buf);
+
+		bio_advance_iter_single(src, &src_iter, bytes);
+		bio_advance_iter_single(dst, &dst_iter, bytes);
+	}
 }
 EXPORT_SYMBOL(bio_copy_data);
 
@@ -1659,7 +1653,6 @@ void bio_set_pages_dirty(struct bio *bio)
 		folio_unlock(fi.folio);
 	}
 }
-EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1718,7 +1711,6 @@ defer:
 	spin_unlock_irqrestore(&bio_dirty_lock, flags);
 	schedule_work(&bio_dirty_work);
 }
-EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 static inline bool bio_remaining_done(struct bio *bio)
 {
@@ -1884,7 +1876,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-int biovec_init_pool(mempool_t *pool, int pool_entries)
+static int biovec_init_pool(mempool_t *pool, int pool_entries)
 {
 	struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bc63bd220865..3093c1c03902 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -755,7 +755,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
  *
  * Initialize @ctx which can be used to parse blkg config input string @input.
  * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
- * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
+ * blkg_conf_prep().
  */
 void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
 {
@@ -771,10 +771,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_init);
  * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
  * set to point past the device node prefix.
  *
- * This function may be called multiple times on @ctx and the extra calls become
- * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
- * explicitly if bdev access is needed without resolving the blkcg / policy part
- * of @ctx->input. Returns -errno on error.
+ * Returns: -errno on error.
  */
 int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
 {
@@ -783,8 +780,8 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
 	struct block_device *bdev;
 	int key_len;
 
-	if (ctx->bdev)
-		return 0;
+	if (WARN_ON_ONCE(ctx->bdev))
+		return -EINVAL;
 
 	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
 		return -EINVAL;
@@ -813,38 +810,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
 	ctx->bdev = bdev;
 	return 0;
 }
-/*
- * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
- * ensures the correct locking order between freeze queue and q->rq_qos_mutex.
- *
- * This function returns negative error on failure. On success it returns
- * memflags which must be saved and later passed to blkg_conf_exit_frozen
- * for restoring the memalloc scope.
- */
-unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
-{
-	int ret;
-	unsigned long memflags;
-
-	if (ctx->bdev)
-		return -EINVAL;
-
-	ret = blkg_conf_open_bdev(ctx);
-	if (ret < 0)
-		return ret;
-	/*
-	 * At this point, we haven’t started protecting anything related to QoS,
-	 * so we release q->rq_qos_mutex here, which was first acquired in blkg_
-	 * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
-	 * the queue to maintain the correct locking order.
-	 */
-	mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
-	memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
-	mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
-	return memflags;
-}
+EXPORT_SYMBOL_GPL(blkg_conf_open_bdev);
 
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
@@ -857,22 +823,20 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
  * following MAJ:MIN, @ctx->bdev points to the target block device and
  * @ctx->blkg to the blkg being configured.
  *
- * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
+ * blkg_conf_open_bdev() must be called on @ctx beforehand. On success, this
  * function returns with queue lock held and must be followed by
- * blkg_conf_exit().
+ * blkg_conf_close_bdev().
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   struct blkg_conf_ctx *ctx)
-	__acquires(&bdev->bd_queue->queue_lock)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
 	struct blkcg_gq *blkg;
 	int ret;
 
-	ret = blkg_conf_open_bdev(ctx);
-	if (ret)
-		return ret;
+	if (WARN_ON_ONCE(!ctx->bdev))
+		return -EINVAL;
 
 	disk = ctx->bdev->bd_disk;
 	q = disk->queue;
@@ -970,43 +934,29 @@ fail_exit:
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
 /**
- * blkg_conf_exit - clean up per-blkg config update
+ * blkg_conf_unprep - counterpart of blkg_conf_prep()
  * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
- *
- * Clean up after per-blkg config update. This function must be called on all
- * blkg_conf_ctx's initialized with blkg_conf_init().
  */
-void blkg_conf_exit(struct blkg_conf_ctx *ctx)
-	__releases(&ctx->bdev->bd_queue->queue_lock)
-	__releases(&ctx->bdev->bd_queue->rq_qos_mutex)
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
 {
-	if (ctx->blkg) {
-		spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
-		ctx->blkg = NULL;
-	}
-
-	if (ctx->bdev) {
-		mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-		blkdev_put_no_open(ctx->bdev);
-		ctx->body = NULL;
-		ctx->bdev = NULL;
-	}
+	WARN_ON_ONCE(!ctx->blkg);
+	spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
+	ctx->blkg = NULL;
 }
-EXPORT_SYMBOL_GPL(blkg_conf_exit);
+EXPORT_SYMBOL_GPL(blkg_conf_unprep);
 
-/*
- * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used
- * when blkg_conf_open_bdev_frozen is used to open the bdev.
+/**
+ * blkg_conf_close_bdev - counterpart of blkg_conf_open_bdev()
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
  */
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
 {
-	if (ctx->bdev) {
-		struct request_queue *q = ctx->bdev->bd_queue;
-
-		blkg_conf_exit(ctx);
-		blk_mq_unfreeze_queue(q, memflags);
-	}
+	mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+	blkdev_put_no_open(ctx->bdev);
+	ctx->body = NULL;
+	ctx->bdev = NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_conf_close_bdev);
 
 static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
 {
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1cce3294634d..f25fecb87c43 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -218,12 +218,15 @@ struct blkg_conf_ctx {
 };
 
 void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
-int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
-unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
+	__cond_acquires(0, &ctx->bdev->bd_queue->rq_qos_mutex);
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   struct blkg_conf_ctx *ctx);
-void blkg_conf_exit(struct blkg_conf_ctx *ctx);
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
+		   struct blkg_conf_ctx *ctx)
+	__cond_acquires(0, &ctx->bdev->bd_disk->queue->queue_lock);
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
+	__releases(ctx->bdev->bd_disk->queue->queue_lock);
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
+	__releases(&ctx->bdev->bd_queue->rq_qos_mutex);
 
 /**
  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-core.c b/block/blk-core.c
index 17450058ea6d..73a41df98c9a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -50,6 +50,7 @@
 #include "blk-cgroup.h"
 #include "blk-throttle.h"
 #include "blk-ioprio.h"
+#include "error-injection.h"
 
 struct dentry *blk_debugfs_root;
 
@@ -132,39 +133,56 @@ inline const char *blk_op_str(enum req_op op)
 }
 EXPORT_SYMBOL_GPL(blk_op_str);
 
+enum req_op str_to_blk_op(const char *op)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(blk_op_name); i++)
+		if (blk_op_name[i] && !strcmp(blk_op_name[i], op))
+			return (enum req_op)i;
+	return REQ_OP_LAST;
+}
+
+#define ENT(_tag, _errno, _desc)	\
+[BLK_STS_##_tag] = {				\
+	.errno		= _errno,		\
+	.tag		= __stringify(_tag),	\
+	.name		= _desc,		\
+}
 static const struct {
 	int		errno;
+	const char	*tag;
 	const char	*name;
 } blk_errors[] = {
-	[BLK_STS_OK]		= { 0,		"" },
-	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
-	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
-	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
-	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
-	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
-	[BLK_STS_RESV_CONFLICT]	= { -EBADE,	"reservation conflict" },
-	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
-	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
-	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
-	[BLK_STS_DEV_RESOURCE]	= { -EBUSY,	"device resource" },
-	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
-	[BLK_STS_OFFLINE]	= { -ENODEV,	"device offline" },
+	ENT(OK,			0,		""),
+	ENT(NOTSUPP,		-EOPNOTSUPP,	"operation not supported"),
+	ENT(TIMEOUT,		-ETIMEDOUT,	"timeout"),
+	ENT(NOSPC,		-ENOSPC,	"critical space allocation"),
+	ENT(TRANSPORT,		-ENOLINK,	"recoverable transport"),
+	ENT(TARGET,		-EREMOTEIO,	"critical target"),
+	ENT(RESV_CONFLICT,	-EBADE,		"reservation conflict"),
+	ENT(MEDIUM,		-ENODATA,	"critical medium"),
+	ENT(PROTECTION,		-EILSEQ,	"protection"),
+	ENT(RESOURCE,		-ENOMEM,	"kernel resource"),
+	ENT(DEV_RESOURCE,	-EBUSY,		"device resource"),
+	ENT(AGAIN,		-EAGAIN,	"nonblocking retry"),
+	ENT(OFFLINE,		-ENODEV,	"device offline"),
 
 	/* device mapper special case, should not leak out: */
-	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },
+	ENT(DM_REQUEUE,		-EREMCHG,	"dm internal retry"),
 
 	/* zone device specific errors */
-	[BLK_STS_ZONE_OPEN_RESOURCE]	= { -ETOOMANYREFS, "open zones exceeded" },
-	[BLK_STS_ZONE_ACTIVE_RESOURCE]	= { -EOVERFLOW, "active zones exceeded" },
+	ENT(ZONE_OPEN_RESOURCE, -ETOOMANYREFS,	"open zones exceeded"),
+	ENT(ZONE_ACTIVE_RESOURCE, -EOVERFLOW,	"active zones exceeded"),
 
 	/* Command duration limit device-side timeout */
-	[BLK_STS_DURATION_LIMIT]	= { -ETIME, "duration limit exceeded" },
-
-	[BLK_STS_INVAL]		= { -EINVAL,	"invalid" },
+	ENT(DURATION_LIMIT,	-ETIME,		"duration limit exceeded"),
+	ENT(INVAL,		-EINVAL,	"invalid"),
 
 	/* everything else not covered above: */
-	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
+	ENT(IOERR,		-EIO,		"I/O"),
 };
+#undef ENT
 
 blk_status_t errno_to_blk_status(int errno)
 {
@@ -197,7 +215,32 @@ const char *blk_status_to_str(blk_status_t status)
 		return "<null>";
 	return blk_errors[idx].name;
 }
-EXPORT_SYMBOL_GPL(blk_status_to_str);
+
+const char *blk_status_to_tag(blk_status_t status)
+{
+	int idx = (__force int)status;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors) || !blk_errors[idx].tag))
+		return "<null>";
+	return blk_errors[idx].tag;
+}
+
+blk_status_t tag_to_blk_status(const char *tag)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
+		if (blk_errors[i].tag &&
+		    !strcmp(blk_errors[i].tag, tag))
+			return (__force blk_status_t)i;
+	}
+
+	/*
+	 * Return BLK_STS_OK for mismatches as this function is intended to
+	 * parse error status values.
+	 */
+	return BLK_STS_OK;
+}
 
 /**
  * blk_sync_queue - cancel any pending callbacks on a queue
@@ -637,12 +680,10 @@ static void __submit_bio(struct bio *bio)
 		struct gendisk *disk = bio->bi_bdev->bd_disk;
 	
 		if ((bio->bi_opf & REQ_POLLED) &&
-		    !(disk->queue->limits.features & BLK_FEAT_POLL)) {
-			bio->bi_status = BLK_STS_NOTSUPP;
-			bio_endio(bio);
-		} else {
+		    !(disk->queue->limits.features & BLK_FEAT_POLL))
+			bio_endio_status(bio, BLK_STS_NOTSUPP);
+		else
 			disk->fops->submit_bio(bio);
-		}
 		blk_queue_exit(disk->queue);
 	}
 
@@ -727,6 +768,9 @@ static void __submit_bio_noacct_mq(struct bio *bio)
 
 void submit_bio_noacct_nocheck(struct bio *bio, bool split)
 {
+	if (unlikely(blk_error_inject(bio)))
+		return;
+
 	blk_cgroup_bio_start(bio);
 
 	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
@@ -887,8 +931,7 @@ void submit_bio_noacct(struct bio *bio)
 not_supported:
 	status = BLK_STS_NOTSUPP;
 end_io:
-	bio->bi_status = status;
-	bio_endio(bio);
+	bio_endio_status(bio, status);
 }
 EXPORT_SYMBOL(submit_bio_noacct);
 
@@ -1042,7 +1085,7 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
 {
 	part_stat_lock();
 	update_io_ticks(bdev, start_time, false);
-	part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
+	bdev_inc_in_flight(bdev, op);
 	part_stat_unlock();
 
 	return start_time;
@@ -1073,7 +1116,7 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
 	part_stat_inc(bdev, ios[sgrp]);
 	part_stat_add(bdev, sectors[sgrp], sectors);
 	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
-	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
+	bdev_dec_in_flight(bdev, op);
 	part_stat_unlock();
 }
 EXPORT_SYMBOL(bdev_end_io_acct);
@@ -1270,7 +1313,6 @@ void blk_io_schedule(void)
 	else
 		io_schedule();
 }
-EXPORT_SYMBOL_GPL(blk_io_schedule);
 
 int __init blk_dev_init(void)
 {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index ab6924fba280..2a5c52ab74b4 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -362,8 +362,7 @@ static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio)
 	status = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
 					bc->bc_key, &slot);
 	if (status != BLK_STS_OK) {
-		src_bio->bi_status = status;
-		bio_endio(src_bio);
+		bio_endio_status(src_bio, status);
 		return;
 	}
 	__blk_crypto_fallback_encrypt_bio(src_bio,
@@ -438,8 +437,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
 	}
 	mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
 
-	bio->bi_status = status;
-	bio_endio(bio);
+	bio_endio_status(bio, status);
 }
 
 /**
@@ -500,8 +498,7 @@ bool blk_crypto_fallback_bio_prep(struct bio *bio)
 
 	if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
 					&bc->bc_key->crypto_cfg)) {
-		bio->bi_status = BLK_STS_NOTSUPP;
-		bio_endio(bio);
+		bio_endio_status(bio, BLK_STS_NOTSUPP);
 		return false;
 	}
 
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 4ac74443687a..cf447ba4a66e 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -43,6 +43,7 @@ struct blk_crypto_keyslot {
 };
 
 static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+	__acquires(&profile->lock)
 {
 	/*
 	 * Calling into the driver requires profile->lock held and the device
@@ -55,6 +56,7 @@ static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
 }
 
 static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+	__releases(&profile->lock)
 {
 	up_write(&profile->lock);
 	if (profile->dev)
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 856d3c5b1fa0..165c9d2cce07 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -267,8 +267,7 @@ bool __blk_crypto_submit_bio(struct bio *bio)
 		if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) {
 			pr_warn_once("%pg: crypto API fallback disabled; failing request.\n",
 				bdev);
-			bio->bi_status = BLK_STS_NOTSUPP;
-			bio_endio(bio);
+			bio_endio_status(bio, BLK_STS_NOTSUPP);
 			return false;
 		}
 		return blk_crypto_fallback_bio_prep(bio);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 0cca88a366dc..563cc7dcf348 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -727,26 +727,6 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
 	put_cpu_ptr(gcs);
 }
 
-static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
-{
-	if (lock_ioc) {
-		spin_lock_irqsave(&iocg->ioc->lock, *flags);
-		spin_lock(&iocg->waitq.lock);
-	} else {
-		spin_lock_irqsave(&iocg->waitq.lock, *flags);
-	}
-}
-
-static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
-{
-	if (unlock_ioc) {
-		spin_unlock(&iocg->waitq.lock);
-		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
-	} else {
-		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
-	}
-}
-
 #define CREATE_TRACE_POINTS
 #include <trace/events/iocost.h>
 
@@ -1589,9 +1569,17 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
 
 	ioc_now(iocg->ioc, &now);
 
-	iocg_lock(iocg, pay_debt, &flags);
-	iocg_kick_waitq(iocg, pay_debt, &now);
-	iocg_unlock(iocg, pay_debt, &flags);
+	if (pay_debt) {
+		spin_lock_irqsave(&iocg->ioc->lock, flags);
+		spin_lock(&iocg->waitq.lock);
+		iocg_kick_waitq(iocg, pay_debt, &now);
+		spin_unlock(&iocg->waitq.lock);
+		spin_unlock_irqrestore(&iocg->ioc->lock, flags);
+	} else {
+		spin_lock_irqsave(&iocg->waitq.lock, flags);
+		iocg_kick_waitq(iocg, pay_debt, &now);
+		spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+	}
 
 	return HRTIMER_NORESTART;
 }
@@ -2614,6 +2602,88 @@ static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
 	return cost;
 }
 
+enum over_budget_action {
+	action_retry,
+	action_commit,
+	action_wait,
+	action_return,
+};
+
+static enum over_budget_action
+iocg_handle_over_budget(struct rq_qos *rqos, struct ioc_gq *iocg,
+			struct bio *bio, struct ioc_now *now,
+			struct iocg_wait *wait, bool use_debt, bool ioc_locked,
+			u64 abs_cost, u64 cost)
+{
+	lockdep_assert_held(&iocg->waitq.lock);
+
+	/*
+	 * @iocg must stay activated for debt and waitq handling. Deactivation
+	 * is synchronized against both ioc->lock and waitq.lock and we won't
+	 * get deactivated as long as we're waiting or have debt, so we're good
+	 * if we're activated here. In the unlikely cases that we aren't, just
+	 * issue the IO.
+	 */
+	if (unlikely(list_empty(&iocg->active_list)))
+		return action_commit;
+
+	/*
+	 * We're over budget. If @bio has to be issued regardless, remember
+	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
+	 * off the debt before waking more IOs.
+	 *
+	 * This way, the debt is continuously paid off each period with the
+	 * actual budget available to the cgroup. If we just wound vtime, we
+	 * would incorrectly use the current hw_inuse for the entire amount
+	 * which, for example, can lead to the cgroup staying blocked for a
+	 * long time even with substantially raised hw_inuse.
+	 *
+	 * An iocg with vdebt should stay online so that the timer can keep
+	 * deducting its vdebt and [de]activate use_delay mechanism
+	 * accordingly. We don't want to race against the timer trying to
+	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
+	 * penalizing the cgroup and its descendants.
+	 */
+	if (use_debt) {
+		iocg_incur_debt(iocg, abs_cost, now);
+		if (iocg_kick_delay(iocg, now))
+			blkcg_schedule_throttle(rqos->disk,
+						(bio->bi_opf & REQ_SWAP) ==
+							REQ_SWAP);
+		return action_return;
+	}
+
+	/* guarantee that iocgs w/ waiters have maximum inuse */
+	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
+		if (!ioc_locked)
+			return action_retry;
+		lockdep_assert_held(&iocg->ioc->lock);
+		propagate_weights(iocg, iocg->active, iocg->active, true, now);
+	}
+
+	/*
+	 * Append self to the waitq and schedule the wakeup timer if we're
+	 * the first waiter.  The timer duration is calculated based on the
+	 * current vrate.  vtime and hweight changes can make it too short
+	 * or too long.  Each wait entry records the absolute cost it's
+	 * waiting for to allow re-evaluation using a custom wait entry.
+	 *
+	 * If too short, the timer simply reschedules itself.  If too long,
+	 * the period timer will notice and trigger wakeups.
+	 *
+	 * All waiters are on iocg->waitq and the wait states are
+	 * synchronized using waitq.lock.
+	 */
+	init_wait_func(&wait->wait, iocg_wake_fn);
+	wait->bio = bio;
+	wait->abs_cost = abs_cost;
+	wait->committed = false; /* will be set true by waker */
+
+	__add_wait_queue_entry_tail(&iocg->waitq, &wait->wait);
+	iocg_kick_waitq(iocg, ioc_locked, now);
+	return action_wait;
+}
+
 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
 {
 	struct blkcg_gq *blkg = bio->bi_blkg;
@@ -2623,6 +2693,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
 	struct iocg_wait wait;
 	u64 abs_cost, cost, vtime;
 	bool use_debt, ioc_locked;
+	enum over_budget_action action;
 	unsigned long flags;
 
 	/* bypass IOs if disabled, still initializing, or for root cgroup */
@@ -2662,81 +2733,34 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
 	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
 	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
 retry_lock:
-	iocg_lock(iocg, ioc_locked, &flags);
-
-	/*
-	 * @iocg must stay activated for debt and waitq handling. Deactivation
-	 * is synchronized against both ioc->lock and waitq.lock and we won't
-	 * get deactivated as long as we're waiting or has debt, so we're good
-	 * if we're activated here. In the unlikely cases that we aren't, just
-	 * issue the IO.
-	 */
-	if (unlikely(list_empty(&iocg->active_list))) {
-		iocg_unlock(iocg, ioc_locked, &flags);
+	if (ioc_locked) {
+		spin_lock_irqsave(&iocg->ioc->lock, flags);
+		spin_lock(&iocg->waitq.lock);
+		action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait,
+						 use_debt, ioc_locked, abs_cost,
+						 cost);
+		spin_unlock(&iocg->waitq.lock);
+		spin_unlock_irqrestore(&iocg->ioc->lock, flags);
+	} else {
+		spin_lock_irqsave(&iocg->waitq.lock, flags);
+		action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait,
+						 use_debt, ioc_locked, abs_cost,
+						 cost);
+		spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+	}
+	switch (action) {
+	case action_retry:
+		ioc_locked = true;
+		goto retry_lock;
+	case action_commit:
 		iocg_commit_bio(iocg, bio, abs_cost, cost);
 		return;
-	}
-
-	/*
-	 * We're over budget. If @bio has to be issued regardless, remember
-	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
-	 * off the debt before waking more IOs.
-	 *
-	 * This way, the debt is continuously paid off each period with the
-	 * actual budget available to the cgroup. If we just wound vtime, we
-	 * would incorrectly use the current hw_inuse for the entire amount
-	 * which, for example, can lead to the cgroup staying blocked for a
-	 * long time even with substantially raised hw_inuse.
-	 *
-	 * An iocg with vdebt should stay online so that the timer can keep
-	 * deducting its vdebt and [de]activate use_delay mechanism
-	 * accordingly. We don't want to race against the timer trying to
-	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
-	 * penalizing the cgroup and its descendants.
-	 */
-	if (use_debt) {
-		iocg_incur_debt(iocg, abs_cost, &now);
-		if (iocg_kick_delay(iocg, &now))
-			blkcg_schedule_throttle(rqos->disk,
-					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
-		iocg_unlock(iocg, ioc_locked, &flags);
+	case action_return:
 		return;
+	case action_wait:
+		break;
 	}
 
-	/* guarantee that iocgs w/ waiters have maximum inuse */
-	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
-		if (!ioc_locked) {
-			iocg_unlock(iocg, false, &flags);
-			ioc_locked = true;
-			goto retry_lock;
-		}
-		propagate_weights(iocg, iocg->active, iocg->active, true,
-				  &now);
-	}
-
-	/*
-	 * Append self to the waitq and schedule the wakeup timer if we're
-	 * the first waiter.  The timer duration is calculated based on the
-	 * current vrate.  vtime and hweight changes can make it too short
-	 * or too long.  Each wait entry records the absolute cost it's
-	 * waiting for to allow re-evaluation using a custom wait entry.
-	 *
-	 * If too short, the timer simply reschedules itself.  If too long,
-	 * the period timer will notice and trigger wakeups.
-	 *
-	 * All waiters are on iocg->waitq and the wait states are
-	 * synchronized using waitq.lock.
-	 */
-	init_wait_func(&wait.wait, iocg_wake_fn);
-	wait.bio = bio;
-	wait.abs_cost = abs_cost;
-	wait.committed = false;	/* will be set true by waker */
-
-	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
-	iocg_kick_waitq(iocg, ioc_locked, &now);
-
-	iocg_unlock(iocg, ioc_locked, &flags);
-
 	while (true) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (wait.committed)
@@ -3140,19 +3164,25 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
 
 	blkg_conf_init(&ctx, buf);
 
+	ret = blkg_conf_open_bdev(&ctx);
+	if (ret)
+		return ret;
+
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
 	if (ret)
-		goto err;
+		goto close_bdev;
 
 	iocg = blkg_to_iocg(ctx.blkg);
 
+	ret = -EINVAL;
+
 	if (!strncmp(ctx.body, "default", 7)) {
 		v = 0;
 	} else {
 		if (!sscanf(ctx.body, "%u", &v))
-			goto einval;
+			goto unprep;
 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
-			goto einval;
+			goto unprep;
 	}
 
 	spin_lock(&iocg->ioc->lock);
@@ -3161,14 +3191,15 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
 	weight_updated(iocg, &now);
 	spin_unlock(&iocg->ioc->lock);
 
-	blkg_conf_exit(&ctx);
-	return nbytes;
+	ret = 0;
 
-einval:
-	ret = -EINVAL;
-err:
-	blkg_conf_exit(&ctx);
-	return ret;
+unprep:
+	blkg_conf_unprep(&ctx);
+
+close_bdev:
+	blkg_conf_close_bdev(&ctx);
+
+	return ret ?: nbytes;
 }
 
 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
@@ -3226,34 +3257,43 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 			     size_t nbytes, loff_t off)
 {
 	struct blkg_conf_ctx ctx;
+	struct request_queue *q;
 	struct gendisk *disk;
 	struct ioc *ioc;
 	u32 qos[NR_QOS_PARAMS];
 	bool enable, user;
 	char *body, *p;
-	unsigned long memflags;
+	unsigned int memflags;
 	int ret;
 
 	blkg_conf_init(&ctx, input);
 
-	memflags = blkg_conf_open_bdev_frozen(&ctx);
-	if (IS_ERR_VALUE(memflags)) {
-		ret = memflags;
-		goto err;
-	}
+	ret = blkg_conf_open_bdev(&ctx);
+	if (ret)
+		return ret;
+	/*
+	 * At this point, we haven’t started protecting anything related to QoS,
+	 * so we release q->rq_qos_mutex here, which was first acquired in blkg_
+	 * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
+	 * the queue to maintain the correct locking order.
+	 */
+	mutex_unlock(&ctx.bdev->bd_queue->rq_qos_mutex);
+
+	memflags = blk_mq_freeze_queue(ctx.bdev->bd_queue);
+	mutex_lock(&ctx.bdev->bd_queue->rq_qos_mutex);
 
 	body = ctx.body;
 	disk = ctx.bdev->bd_disk;
 	if (!queue_is_mq(disk->queue)) {
 		ret = -EOPNOTSUPP;
-		goto err;
+		goto close_bdev;
 	}
 
 	ioc = q_to_ioc(disk->queue);
 	if (!ioc) {
 		ret = blk_iocost_init(disk);
 		if (ret)
-			goto err;
+			goto close_bdev;
 		ioc = q_to_ioc(disk->queue);
 	}
 
@@ -3357,15 +3397,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
 
 	blk_mq_unquiesce_queue(disk->queue);
 
-	blkg_conf_exit_frozen(&ctx, memflags);
-	return nbytes;
+close_bdev:
+	q = ctx.bdev->bd_queue;
+	blkg_conf_close_bdev(&ctx);
+	blk_mq_unfreeze_queue(q, memflags);
+	return ret ?: nbytes;
+
 einval:
 	spin_unlock_irq(&ioc->lock);
 	blk_mq_unquiesce_queue(disk->queue);
 	ret = -EINVAL;
-err:
-	blkg_conf_exit_frozen(&ctx, memflags);
-	return ret;
+	goto close_bdev;
 }
 
 static u64 ioc_cost_model_prfill(struct seq_file *sf,
@@ -3430,20 +3472,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 
 	ret = blkg_conf_open_bdev(&ctx);
 	if (ret)
-		goto err;
+		return ret;
 
 	body = ctx.body;
 	q = bdev_get_queue(ctx.bdev);
 	if (!queue_is_mq(q)) {
 		ret = -EOPNOTSUPP;
-		goto err;
+		goto close_bdev;
 	}
 
 	ioc = q_to_ioc(q);
 	if (!ioc) {
 		ret = blk_iocost_init(ctx.bdev->bd_disk);
 		if (ret)
-			goto err;
+			goto close_bdev;
 		ioc = q_to_ioc(q);
 	}
 
@@ -3454,6 +3496,8 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
 	user = ioc->user_cost_model;
 
+	ret = -EINVAL;
+
 	while ((p = strsep(&body, " \t\n"))) {
 		substring_t args[MAX_OPT_ARGS];
 		char buf[32];
@@ -3471,20 +3515,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 			else if (!strcmp(buf, "user"))
 				user = true;
 			else
-				goto einval;
+				goto unlock;
 			continue;
 		case COST_MODEL:
 			match_strlcpy(buf, &args[0], sizeof(buf));
 			if (strcmp(buf, "linear"))
-				goto einval;
+				goto unlock;
 			continue;
 		}
 
 		tok = match_token(p, i_lcoef_tokens, args);
 		if (tok == NR_I_LCOEFS)
-			goto einval;
+			goto unlock;
 		if (match_u64(&args[0], &v))
-			goto einval;
+			goto unlock;
 		u[tok] = v;
 		user = true;
 	}
@@ -3496,24 +3540,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		ioc->user_cost_model = false;
 	}
 	ioc_refresh_params(ioc, true);
-	spin_unlock_irq(&ioc->lock);
-
-	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
-	blkg_conf_exit(&ctx);
-	return nbytes;
+	ret = 0;
 
-einval:
+unlock:
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q, memflags);
 
-	ret = -EINVAL;
-err:
-	blkg_conf_exit(&ctx);
-	return ret;
+close_bdev:
+	blkg_conf_close_bdev(&ctx);
+	return ret ?: nbytes;
 }
 
 static struct cftype ioc_files[] = {
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 53e8dd2dfa8a..1aaee6fb0f59 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -840,7 +840,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 
 	ret = blkg_conf_open_bdev(&ctx);
 	if (ret)
-		goto out;
+		return ret;
 
 	/*
 	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -850,11 +850,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	if (!iolat_rq_qos(ctx.bdev->bd_queue))
 		ret = blk_iolatency_init(ctx.bdev->bd_disk);
 	if (ret)
-		goto out;
+		goto close_bdev;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
 	if (ret)
-		goto out;
+		goto close_bdev;
 
 	iolat = blkg_to_lat(ctx.blkg);
 	p = ctx.body;
@@ -865,7 +865,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 		char val[21];	/* 18446744073709551616 */
 
 		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
-			goto out;
+			goto unprep;
 
 		if (!strcmp(key, "target")) {
 			u64 v;
@@ -875,9 +875,9 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 			else if (sscanf(val, "%llu", &v) == 1)
 				lat_val = v * NSEC_PER_USEC;
 			else
-				goto out;
+				goto unprep;
 		} else {
-			goto out;
+			goto unprep;
 		}
 	}
 
@@ -889,8 +889,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	if (oldval != iolat->min_lat_nsec)
 		iolatency_clear_scaling(blkg);
 	ret = 0;
-out:
-	blkg_conf_exit(&ctx);
+
+unprep:
+	blkg_conf_unprep(&ctx);
+close_bdev:
+	blkg_conf_close_bdev(&ctx);
 	return ret ?: nbytes;
 }
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fcf09325b22e..ab1161ca69f1 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -122,8 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
 	struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
 
 	if (IS_ERR(split)) {
-		bio->bi_status = errno_to_blk_status(PTR_ERR(split));
-		bio_endio(bio);
+		bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split)));
 		return NULL;
 	}
 
@@ -143,8 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
 static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 {
 	if (unlikely(split_sectors < 0)) {
-		bio->bi_status = errno_to_blk_status(split_sectors);
-		bio_endio(bio);
+		bio_endio_status(bio, errno_to_blk_status(split_sectors));
 		return NULL;
 	}
 
@@ -547,7 +545,7 @@ static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
 	if (!blk_cgroup_mergeable(req, bio))
 		goto no_merge;
 
-	if (blk_integrity_merge_bio(req->q, req, bio) == false)
+	if (unlikely(!blk_integrity_merge_bio(req->q, req, bio)))
 		goto no_merge;
 
 	/* discard request merge won't add new segment */
@@ -649,7 +647,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (!blk_cgroup_mergeable(req, next->bio))
 		return 0;
 
-	if (blk_integrity_merge_rq(q, req, next) == false)
+	if (unlikely(!blk_integrity_merge_rq(q, req, next)))
 		return 0;
 
 	if (!bio_crypt_ctx_merge_rq(req, next))
@@ -723,8 +721,7 @@ static void blk_account_io_merge_request(struct request *req)
 	if (req->rq_flags & RQF_IO_STAT) {
 		part_stat_lock();
 		part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
-		part_stat_local_dec(req->part,
-				    in_flight[op_is_write(req_op(req))]);
+		bdev_dec_in_flight(req->part, req_op(req));
 		part_stat_unlock();
 	}
 }
@@ -905,7 +902,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 
 	if (!blk_cgroup_mergeable(rq, bio))
 		return false;
-	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
+	if (unlikely(!blk_integrity_merge_bio(rq->q, rq, bio)))
 		return false;
 	if (!bio_crypt_rq_ctx_compatible(rq, bio))
 		return false;
@@ -915,7 +912,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 		return false;
 	if (rq->bio->bi_ioprio != bio->bi_ioprio)
 		return false;
-	if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
+	if (unlikely(!blk_atomic_write_mergeable_rq_bio(rq, bio)))
 		return false;
 
 	return true;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..6754d8f9449c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -19,8 +19,10 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+#define TO_REQUEST_QUEUE(m) ((struct request_queue *)m->private)
+
 static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
-	__acquires(&q->requeue_lock)
+	__acquires(&TO_REQUEST_QUEUE(m)->requeue_lock)
 {
 	struct request_queue *q = m->private;
 
@@ -36,13 +38,15 @@ static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
 }
 
 static void queue_requeue_list_stop(struct seq_file *m, void *v)
-	__releases(&q->requeue_lock)
+	__releases(&TO_REQUEST_QUEUE(m)->requeue_lock)
 {
 	struct request_queue *q = m->private;
 
 	spin_unlock_irq(&q->requeue_lock);
 }
 
+#undef TO_REQUEST_QUEUE
+
 static const struct seq_operations queue_requeue_list_seq_ops = {
 	.start	= queue_requeue_list_start,
 	.next	= queue_requeue_list_next,
@@ -297,8 +301,10 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 }
 EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show);
 
+#define TO_HCTX(m) ((struct blk_mq_hw_ctx *)m->private)
+
 static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
-	__acquires(&hctx->lock)
+	__acquires(&TO_HCTX(m)->lock)
 {
 	struct blk_mq_hw_ctx *hctx = m->private;
 
@@ -314,13 +320,15 @@ static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
 }
 
 static void hctx_dispatch_stop(struct seq_file *m, void *v)
-	__releases(&hctx->lock)
+	__releases(&TO_HCTX(m)->lock)
 {
 	struct blk_mq_hw_ctx *hctx = m->private;
 
 	spin_unlock(&hctx->lock);
 }
 
+#undef TO_HCTX
+
 static const struct seq_operations hctx_dispatch_seq_ops = {
 	.start	= hctx_dispatch_start,
 	.next	= hctx_dispatch_next,
@@ -484,9 +492,11 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+#define TO_CTX(m) ((struct blk_mq_ctx *)m->private)
+
 #define CTX_RQ_SEQ_OPS(name, type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
-	__acquires(&ctx->lock)						\
+	__acquires(&TO_CTX(m)->lock)					\
 {									\
 	struct blk_mq_ctx *ctx = m->private;				\
 									\
@@ -503,7 +513,7 @@ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v,	\
 }									\
 									\
 static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v)	\
-	__releases(&ctx->lock)						\
+	__releases(&TO_CTX(m)->lock)					\
 {									\
 	struct blk_mq_ctx *ctx = m->private;				\
 									\
@@ -521,6 +531,8 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
 CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
 CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
+#undef TO_CTX
+
 static int blk_mq_debugfs_show(struct seq_file *m, void *v)
 {
 	const struct blk_mq_debugfs_attr *attr = m->private;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..35deee5bbc73 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -181,6 +182,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		/* Log the starvation event before altering task state */
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS,
+					data->flags);
+
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a24175441380..88cb5acc4f39 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1082,49 +1082,18 @@ static inline void blk_account_io_done(struct request *req, u64 now)
 		update_io_ticks(req->part, jiffies, true);
 		part_stat_inc(req->part, ios[sgrp]);
 		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
-		part_stat_local_dec(req->part,
-				    in_flight[op_is_write(req_op(req))]);
+		bdev_dec_in_flight(req->part, req_op(req));
 		part_stat_unlock();
 	}
 }
 
-static inline bool blk_rq_passthrough_stats(struct request *req)
-{
-	struct bio *bio = req->bio;
-
-	if (!blk_queue_passthrough_stat(req->q))
-		return false;
-
-	/* Requests without a bio do not transfer data. */
-	if (!bio)
-		return false;
-
-	/*
-	 * Stats are accumulated in the bdev, so must have one attached to a
-	 * bio to track stats. Most drivers do not set the bdev for passthrough
-	 * requests, but nvme is one that will set it.
-	 */
-	if (!bio->bi_bdev)
-		return false;
-
-	/*
-	 * We don't know what a passthrough command does, but we know the
-	 * payload size and data direction. Ensuring the size is aligned to the
-	 * block size filters out most commands with payloads that don't
-	 * represent sector access.
-	 */
-	if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
-		return false;
-	return true;
-}
-
 static inline void blk_account_io_start(struct request *req)
 {
 	trace_block_io_start(req);
 
 	if (!blk_queue_io_stat(req->q))
 		return;
-	if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
+	if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req, req->q))
 		return;
 
 	req->rq_flags |= RQF_IO_STAT;
@@ -1143,7 +1112,7 @@ static inline void blk_account_io_start(struct request *req)
 
 	part_stat_lock();
 	update_io_ticks(req->part, jiffies, false);
-	part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
+	bdev_inc_in_flight(req->part, req_op(req));
 	part_stat_unlock();
 }
 
@@ -3170,8 +3139,7 @@ void blk_mq_submit_bio(struct bio *bio)
 	}
 
 	if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
-		bio->bi_status = BLK_STS_NOTSUPP;
-		bio_endio(bio);
+		bio_endio_status(bio, BLK_STS_NOTSUPP);
 		goto queue_exit;
 	}
 
@@ -3215,8 +3183,7 @@ new_request:
 
 	ret = blk_crypto_rq_get_keyslot(rq);
 	if (ret != BLK_STS_OK) {
-		bio->bi_status = ret;
-		bio_endio(bio);
+		bio_endio_status(bio, ret);
 		blk_mq_free_request(rq);
 		return;
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 78c83817b9d3..8274631290db 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -795,6 +795,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->features &= ~BLK_FEAT_NOWAIT;
 	if (!(b->features & BLK_FEAT_POLL))
 		t->features &= ~BLK_FEAT_POLL;
+	if (!(b->features & BLK_FEAT_PCI_P2PDMA))
+		t->features &= ~BLK_FEAT_PCI_P2PDMA;
 
 	t->flags |= (b->flags & BLK_FLAG_MISALIGNED);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f22c1f253eb3..520972676ab4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -19,6 +19,7 @@
 #include "blk-wbt.h"
 #include "blk-cgroup.h"
 #include "blk-throttle.h"
+#include "error-injection.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -933,6 +934,8 @@ static void blk_debugfs_remove(struct gendisk *disk)
 
 	blk_debugfs_lock_nomemsave(q);
 	blk_trace_shutdown(q);
+	if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION))
+		blk_error_injection_exit(disk);
 	debugfs_remove_recursive(q->debugfs_dir);
 	q->debugfs_dir = NULL;
 	q->sched_debugfs_dir = NULL;
@@ -963,6 +966,8 @@ int blk_register_queue(struct gendisk *disk)
 
 	memflags = blk_debugfs_lock(q);
 	q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
+	if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION))
+		blk_error_injection_init(disk);
 	if (queue_is_mq(q))
 		blk_mq_debugfs_register(q);
 	blk_debugfs_unlock(q, memflags);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cabf91f0d0dc..47052ba21d1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1353,21 +1353,21 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 
 	ret = blkg_conf_open_bdev(&ctx);
 	if (ret)
-		goto out_finish;
+		return ret;
 
 	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
 		ret = blk_throtl_init(ctx.bdev->bd_disk);
 		if (ret)
-			goto out_finish;
+			goto close_bdev;
 	}
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
 	if (ret)
-		goto out_finish;
+		goto close_bdev;
 
 	ret = -EINVAL;
 	if (sscanf(ctx.body, "%llu", &v) != 1)
-		goto out_finish;
+		goto unprep;
 	if (!v)
 		v = U64_MAX;
 
@@ -1381,8 +1381,12 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 
 	tg_conf_updated(tg, false);
 	ret = 0;
-out_finish:
-	blkg_conf_exit(&ctx);
+
+unprep:
+	blkg_conf_unprep(&ctx);
+
+close_bdev:
+	blkg_conf_close_bdev(&ctx);
 	return ret ?: nbytes;
 }
 
@@ -1537,17 +1541,17 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 
 	ret = blkg_conf_open_bdev(&ctx);
 	if (ret)
-		goto out_finish;
+		return ret;
 
 	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
 		ret = blk_throtl_init(ctx.bdev->bd_disk);
 		if (ret)
-			goto out_finish;
+			goto close_bdev;
 	}
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
 	if (ret)
-		goto out_finish;
+		goto close_bdev;
 
 	tg = blkg_to_tg(ctx.blkg);
 	tg_update_carryover(tg);
@@ -1573,11 +1577,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		p = tok;
 		strsep(&p, "=");
 		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
-			goto out_finish;
+			goto unprep;
 
 		ret = -ERANGE;
 		if (!val)
-			goto out_finish;
+			goto unprep;
 
 		ret = -EINVAL;
 		if (!strcmp(tok, "rbps"))
@@ -1589,7 +1593,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		else if (!strcmp(tok, "wiops"))
 			v[3] = min_t(u64, val, UINT_MAX);
 		else
-			goto out_finish;
+			goto unprep;
 	}
 
 	tg->bps[READ] = v[0];
@@ -1599,8 +1603,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 
 	tg_conf_updated(tg, false);
 	ret = 0;
-out_finish:
-	blkg_conf_exit(&ctx);
+unprep:
+	blkg_conf_unprep(&ctx);
+close_bdev:
+	blkg_conf_close_bdev(&ctx);
 	return ret ?: nbytes;
 }
 
@@ -1649,7 +1655,7 @@ static void tg_flush_bios(struct throtl_grp *tg)
 	 */
 	tg_update_disptime(tg);
 
-	throtl_schedule_pending_timer(sq, jiffies + 1);
+	throtl_schedule_next_dispatch(sq->parent_sq, true);
 }
 
 static void throtl_pd_offline(struct blkg_policy_data *pd)
@@ -1668,11 +1674,52 @@ struct blkcg_policy blkcg_policy_throtl = {
 	.pd_free_fn		= throtl_pd_free,
 };
 
+static void tg_cancel_writeback_bios(struct throtl_grp *tg,
+				      struct bio_list *cancel_bios)
+{
+	struct throtl_service_queue *sq = &tg->service_queue;
+	struct throtl_data *td = sq_to_td(sq);
+	int rw;
+
+	if (tg->flags & THROTL_TG_CANCELING)
+		return;
+	tg->flags |= THROTL_TG_CANCELING;
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		struct throtl_qnode *qn, *tmp;
+		unsigned int nr_bios = 0;
+
+		list_for_each_entry_safe(qn, tmp, &sq->queued[rw], node) {
+			struct bio *bio;
+
+			while ((bio = bio_list_pop(&qn->bios_iops))) {
+				sq->nr_queued_iops[rw]--;
+				bio_list_add(&cancel_bios[rw], bio);
+				nr_bios++;
+			}
+			while ((bio = bio_list_pop(&qn->bios_bps))) {
+				sq->nr_queued_bps[rw]--;
+				bio_list_add(&cancel_bios[rw], bio);
+				nr_bios++;
+			}
+
+			list_del_init(&qn->node);
+			blkg_put(tg_to_blkg(qn->tg));
+		}
+
+		td->nr_queued[rw] -= nr_bios;
+	}
+
+	throtl_dequeue_tg(tg);
+}
+
 void blk_throtl_cancel_bios(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct cgroup_subsys_state *pos_css;
 	struct blkcg_gq *blkg;
+	struct bio_list cancel_bios[2] = { };
+	int rw;
 
 	if (!blk_throtl_activated(q))
 		return;
@@ -1693,10 +1740,16 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
 		 * Cancel bios here to ensure no bios are inflight after
 		 * del_gendisk.
 		 */
-		tg_flush_bios(blkg_to_tg(blkg));
+		tg_cancel_writeback_bios(blkg_to_tg(blkg), cancel_bios);
 	}
 	rcu_read_unlock();
 	spin_unlock_irq(&q->queue_lock);
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		struct bio *bio;
+		while ((bio = bio_list_pop(&cancel_bios[rw])))
+			bio_io_error(bio);
+	}
 }
 
 static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6a221c180889..bea817f3de56 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1924,7 +1924,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 		goto free_hash;
 
 	disk->zone_wplugs_wq =
-		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU,
 				pool_size, disk->disk_name);
 	if (!disk->zone_wplugs_wq)
 		goto destroy_pool;
diff --git a/block/blk.h b/block/blk.h
index b998a7761faf..25af8ac5ef0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
 
 #include <linux/bio-integrity.h>
 #include <linux/blk-crypto.h>
+#include <linux/part_stat.h>
 #include <linux/lockdep.h>
 #include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
 #include <linux/sched/sysctl.h>
@@ -49,6 +50,11 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 					      gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
+const char *blk_status_to_str(blk_status_t status);
+const char *blk_status_to_tag(blk_status_t status);
+blk_status_t tag_to_blk_status(const char *tag);
+enum req_op str_to_blk_op(const char *op);
+
 bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
 bool blk_queue_start_drain(struct request_queue *q);
 bool __blk_freeze_queue_start(struct request_queue *q,
@@ -402,6 +408,8 @@ static inline bool bio_may_need_split(struct bio *bio,
 	bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 	if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done)
 		return true;
+	if ((bv->bv_offset | bv->bv_len) & lim->dma_alignment)
+		return true;
 	return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size;
 }
 
@@ -485,6 +493,26 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 		q->last_merge = NULL;
 }
 
+static inline void bdev_inc_in_flight(struct block_device *bdev,
+				      enum req_op op)
+{
+	bool rw = op_is_write(op);
+
+	part_stat_local_inc(bdev, in_flight[rw]);
+	if (bdev_is_partition(bdev))
+		part_stat_local_inc(bdev_whole(bdev), in_flight[rw]);
+}
+
+static inline void bdev_dec_in_flight(struct block_device *bdev,
+				      enum req_op op)
+{
+	bool rw = op_is_write(op);
+
+	part_stat_local_dec(bdev, in_flight[rw]);
+	if (bdev_is_partition(bdev))
+		part_stat_local_dec(bdev_whole(bdev), in_flight[rw]);
+}
+
 /*
  * Internal io_context interface
  */
@@ -754,16 +782,19 @@ static inline void blk_unfreeze_release_lock(struct request_queue *q)
  * reclaim from triggering block I/O.
  */
 static inline void blk_debugfs_lock_nomemsave(struct request_queue *q)
+	__acquires(&q->debugfs_mutex)
 {
 	mutex_lock(&q->debugfs_mutex);
 }
 
 static inline void blk_debugfs_unlock_nomemrestore(struct request_queue *q)
+	__releases(&q->debugfs_mutex)
 {
 	mutex_unlock(&q->debugfs_mutex);
 }
 
 static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q)
+	__acquires(&q->debugfs_mutex)
 {
 	unsigned int memflags = memalloc_noio_save();
 
@@ -773,6 +804,7 @@ static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q
 
 static inline void blk_debugfs_unlock(struct request_queue *q,
 				      unsigned int memflags)
+	__releases(&q->debugfs_mutex)
 {
 	blk_debugfs_unlock_nomemrestore(q);
 	memalloc_noio_restore(memflags);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fdb4b290ca68..895db30a7033 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -299,7 +299,7 @@ out:
 
 /* called right after the request is allocated for the request_queue */
 static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
-		       unsigned int hctx_idx, unsigned int numa_node)
+		       unsigned int hctx_idx, int numa_node)
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 
diff --git a/block/error-injection.c b/block/error-injection.c
new file mode 100644
index 000000000000..d24c90e9a25f
--- /dev/null
+++ b/block/error-injection.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026 Christoph Hellwig.
+ */
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include "blk.h"
+#include "error-injection.h"
+
+struct blk_error_inject {
+	struct list_head		entry;
+	sector_t			start;
+	sector_t			end;
+	enum req_op			op;
+	blk_status_t			status;
+
+	/* only inject every 1 / chance times */
+	unsigned int			chance;
+};
+
+DEFINE_STATIC_KEY_FALSE(blk_error_injection_enabled);
+
+bool __blk_error_inject(struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct blk_error_inject *inj;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inj, &disk->error_injection_list, entry) {
+		if (bio_op(bio) != inj->op)
+			continue;
+		/*
+		 * This never matches 0-sized bios like empty WRITEs with
+		 * REQ_PREFLUSH or ZONE_RESET_ALL.  While adding a special case
+		 * for them would be trivial, that means any WRITE rule would
+		 * trigger for flushes.  So before we can make this work
+		 * properly, we'll need to start using REQ_OP_FLUSH for pure
+		 * flushes at the bio level like we already do in blk-mq.
+		 */
+		if (bio->bi_iter.bi_sector > inj->end ||
+		    bio_end_sector(bio) <= inj->start)
+			continue;
+		if (inj->chance > 1 && (get_random_u32() % inj->chance) != 0)
+			continue;
+
+		pr_info_ratelimited("%pg: injecting %s error for %s at sector %llu:%u\n",
+				disk->part0, blk_status_to_str(inj->status),
+				blk_op_str(inj->op), bio->bi_iter.bi_sector,
+				bio_sectors(bio));
+		bio->bi_status = inj->status;
+		rcu_read_unlock();
+		bio_endio(bio);
+		return true;
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+static int error_inject_add(struct gendisk *disk, enum req_op op,
+		sector_t start, u64 nr_sectors, blk_status_t status,
+		unsigned int chance)
+{
+	struct blk_error_inject *inj;
+	int error = -EINVAL;
+
+	if (op == REQ_OP_LAST)
+		return -EINVAL;
+	if (status == BLK_STS_OK)
+		return -EINVAL;
+
+	inj = kzalloc_obj(*inj);
+	if (!inj)
+		return -ENOMEM;
+
+	if (nr_sectors) {
+		if (U64_MAX - nr_sectors < start)
+			goto out_free_inj;
+		inj->end = start + nr_sectors - 1;
+	} else {
+		inj->end = U64_MAX;
+	}
+
+	inj->op = op;
+	inj->start = start;
+	inj->status = status;
+	inj->chance = chance;
+
+	pr_debug_ratelimited("%pg: adding %s injection for %s at sector %llu:%llu\n",
+			disk->part0, blk_status_to_str(status),
+			blk_op_str(op),
+			start, nr_sectors);
+
+	/*
+	 * Add to the front of the list so that newer entries can partially
+	 * override other entries.  This also intentionally allows duplicate
+	 * entries as there is no real reason to reject them.
+	 */
+	mutex_lock(&disk->error_injection_lock);
+	if (!disk_live(disk)) {
+		mutex_unlock(&disk->error_injection_lock);
+		error = -ENODEV;
+		goto out_free_inj;
+	}
+	if (list_empty(&disk->error_injection_list))
+		static_branch_inc(&blk_error_injection_enabled);
+	list_add_rcu(&inj->entry, &disk->error_injection_list);
+	set_bit(GD_ERROR_INJECT, &disk->state);
+	mutex_unlock(&disk->error_injection_lock);
+	return 0;
+
+out_free_inj:
+	kfree(inj);
+	return error;
+}
+
+static void error_inject_removeall(struct gendisk *disk)
+{
+	struct blk_error_inject *inj;
+
+	mutex_lock(&disk->error_injection_lock);
+	clear_bit(GD_ERROR_INJECT, &disk->state);
+	while ((inj = list_first_entry_or_null(&disk->error_injection_list,
+			struct blk_error_inject, entry))) {
+		list_del_rcu(&inj->entry);
+		kfree_rcu_mightsleep(inj);
+	}
+	static_branch_dec(&blk_error_injection_enabled);
+	mutex_unlock(&disk->error_injection_lock);
+}
+
+enum options {
+	Opt_add			= (1u << 0),
+	Opt_removeall		= (1u << 1),
+
+	Opt_op			= (1u << 16),
+	Opt_start		= (1u << 17),
+	Opt_nr_sectors		= (1u << 18),
+	Opt_status		= (1u << 19),
+	Opt_chance		= (1u << 20),
+
+	Opt_invalid,
+};
+
+static const match_table_t opt_tokens = {
+	{ Opt_add,			"add",			},
+	{ Opt_removeall,		"removeall",		},
+	{ Opt_op,			"op=%s",		},
+	{ Opt_start,			"start=%u"		},
+	{ Opt_nr_sectors,		"nr_sectors=%u"		},
+	{ Opt_status,			"status=%s"		},
+	{ Opt_chance,			"chance=%u"		},
+	{ Opt_invalid,			NULL,			},
+};
+
+static int match_op(substring_t *args, enum req_op *op)
+{
+	const char *tag;
+
+	tag = match_strdup(args);
+	if (!tag)
+		return -ENOMEM;
+	*op = str_to_blk_op(tag);
+	if (*op == REQ_OP_LAST)
+		pr_warn("invalid op '%s'\n", tag);
+	kfree(tag);
+	return 0;
+}
+
+static int match_status(substring_t *args, blk_status_t *status)
+{
+	const char *tag;
+
+	tag = match_strdup(args);
+	if (!tag)
+		return -ENOMEM;
+	*status = tag_to_blk_status(tag);
+	if (!*status)
+		pr_warn("invalid status '%s'\n", tag);
+	kfree(tag);
+	return 0;
+}
+
+static ssize_t blk_error_injection_parse_options(struct gendisk *disk,
+		char *options)
+{
+	enum { Unset, Add, Removeall } action = Unset;
+	unsigned int option_mask = 0, chance = 1;
+	enum req_op op = REQ_OP_LAST;
+	u64 start = 0, nr_sectors = 0;
+	blk_status_t status = BLK_STS_OK;
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+
+	while ((p = strsep(&options, ",\n")) != NULL) {
+		int error = 0;
+		ssize_t token;
+
+		if (!*p)
+			continue;
+		token = match_token(p, opt_tokens, args);
+		option_mask |= token;
+		switch (token) {
+		case Opt_add:
+			if (action != Unset)
+				return -EINVAL;
+			action = Add;
+			break;
+		case Opt_removeall:
+			if (action != Unset)
+				return -EINVAL;
+			action = Removeall;
+			break;
+		case Opt_op:
+			error = match_op(args, &op);
+			break;
+		case Opt_start:
+			error = match_u64(args, &start);
+			break;
+		case Opt_nr_sectors:
+			error = match_u64(args, &nr_sectors);
+			break;
+		case Opt_status:
+			error = match_status(args, &status);
+			break;
+		case Opt_chance:
+			error = match_uint(args, &chance);
+			if (!error && chance == 0)
+				error = -EINVAL;
+			break;
+		default:
+			pr_warn("unknown parameter or missing value '%s'\n", p);
+			error = -EINVAL;
+		}
+		if (error)
+			return error;
+	}
+
+	switch (action) {
+	case Add:
+		return error_inject_add(disk, op, start, nr_sectors, status,
+				chance);
+	case Removeall:
+		if (option_mask & ~Opt_removeall)
+			return -EINVAL;
+		error_inject_removeall(disk);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static ssize_t blk_error_injection_write(struct file *file,
+		const char __user *ubuf, size_t count, loff_t *pos)
+{
+	struct gendisk *disk = file_inode(file)->i_private;
+	char *options;
+	int error;
+
+	options = memdup_user_nul(ubuf, count);
+	if (IS_ERR(options))
+		return PTR_ERR(options);
+	error = blk_error_injection_parse_options(disk, options);
+	kfree(options);
+
+	if (error)
+		return error;
+	return count;
+}
+
+static int blk_error_injection_show(struct seq_file *s, void *private)
+{
+	struct gendisk *disk = s->private;
+	struct blk_error_inject *inj;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inj, &disk->error_injection_list, entry) {
+		seq_printf(s, "%llu:%llu status=%s,chance=%u",
+			inj->start, inj->end,
+			blk_status_to_tag(inj->status), inj->chance);
+		seq_putc(s, '\n');
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+static int blk_error_injection_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, blk_error_injection_show, inode->i_private);
+}
+
+static int blk_error_injection_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations blk_error_injection_fops = {
+	.owner		= THIS_MODULE,
+	.write		= blk_error_injection_write,
+	.read		= seq_read,
+	.open		= blk_error_injection_open,
+	.release	= blk_error_injection_release,
+};
+
+void blk_error_injection_init(struct gendisk *disk)
+{
+	debugfs_create_file("error_injection", 0600, disk->queue->debugfs_dir,
+			disk, &blk_error_injection_fops);
+}
+
+void blk_error_injection_exit(struct gendisk *disk)
+{
+	error_inject_removeall(disk);
+}
diff --git a/block/error-injection.h b/block/error-injection.h
new file mode 100644
index 000000000000..9821d773abab
--- /dev/null
+++ b/block/error-injection.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_ERROR_INJECTION_H
+#define _BLK_ERROR_INJECTION_H 1
+
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_FALSE(blk_error_injection_enabled);
+
+void blk_error_injection_init(struct gendisk *disk);
+void blk_error_injection_exit(struct gendisk *disk);
+bool __blk_error_inject(struct bio *bio);
+static inline bool blk_error_inject(struct bio *bio)
+{
+	if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION) &&
+	    static_branch_unlikely(&blk_error_injection_enabled) &&
+	    test_bit(GD_ERROR_INJECT, &bio->bi_bdev->bd_disk->state))
+		return __blk_error_inject(bio);
+	return false;
+}
+
+#endif /* _BLK_ERROR_INJECTION_H */
diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..15783a6180de 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -218,8 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
 		if (unlikely(ret)) {
-			bio->bi_status = BLK_STS_IOERR;
-			bio_endio(bio);
+			bio_endio_status(bio, BLK_STS_IOERR);
 			break;
 		}
 		if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -499,36 +498,12 @@ static void blkdev_readahead(struct readahead_control *rac)
 	mpage_readahead(rac, blkdev_get_block);
 }
 
-static int blkdev_write_begin(const struct kiocb *iocb,
-			      struct address_space *mapping, loff_t pos,
-			      unsigned len, struct folio **foliop,
-			      void **fsdata)
-{
-	return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
-}
-
-static int blkdev_write_end(const struct kiocb *iocb,
-			    struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned copied,
-			    struct folio *folio, void *fsdata)
-{
-	int ret;
-	ret = block_write_end(pos, len, copied, folio);
-
-	folio_unlock(folio);
-	folio_put(folio);
-
-	return ret;
-}
-
 const struct address_space_operations def_blk_aops = {
 	.dirty_folio	= block_dirty_folio,
 	.invalidate_folio = block_invalidate_folio,
 	.read_folio	= blkdev_read_folio,
 	.readahead	= blkdev_readahead,
 	.writepages	= blkdev_writepages,
-	.write_begin	= blkdev_write_begin,
-	.write_end	= blkdev_write_end,
 	.migrate_folio	= buffer_migrate_folio_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e9..f84b6a355b57 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1486,6 +1486,10 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
 	INIT_LIST_HEAD(&disk->slave_bdevs);
 #endif
+#ifdef CONFIG_BLK_ERROR_INJECTION
+	mutex_init(&disk->error_injection_lock);
+	INIT_LIST_HEAD(&disk->error_injection_list);
+#endif
 	mutex_init(&disk->rqos_state_mutex);
 	kobject_init(&disk->queue_kobj, &blk_queue_ktype);
 	return disk;
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b84163d1f851..971818bcdc9d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -882,6 +882,9 @@ static const struct elv_fs_entry kyber_sched_attrs[] = {
 };
 #undef KYBER_LAT_ATTR
 
+#define HCTX_FROM_SEQ_FILE(m) ((struct blk_mq_hw_ctx *)(m)->private)
+#define KYBER_HCTX_DATA(hctx) ((struct kyber_hctx_data *)(hctx)->sched_data)
+
 #ifdef CONFIG_BLK_DEBUG_FS
 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
 static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
@@ -894,7 +897,7 @@ static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
 }									\
 									\
 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
-	__acquires(&khd->lock)						\
+	__acquires(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock)	\
 {									\
 	struct blk_mq_hw_ctx *hctx = m->private;			\
 	struct kyber_hctx_data *khd = hctx->sched_data;			\
@@ -913,7 +916,7 @@ static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
 }									\
 									\
 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
-	__releases(&khd->lock)						\
+	__releases(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock)	\
 {									\
 	struct blk_mq_hw_ctx *hctx = m->private;			\
 	struct kyber_hctx_data *khd = hctx->sched_data;			\
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 95917a88976f..824bfc17b2c6 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -794,11 +794,15 @@ static const struct elv_fs_entry deadline_attrs[] = {
 	__ATTR_NULL
 };
 
+#define RQ_FROM_SEQ_FILE(m) ((struct request_queue *)(m)->private)
+#define DD_DATA_FROM_RQ(rq)					\
+	((struct deadline_data *)(rq)->elevator->elevator_data)
+
 #ifdef CONFIG_BLK_DEBUG_FS
 #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)		\
 static void *deadline_##name##_fifo_start(struct seq_file *m,		\
 					  loff_t *pos)			\
-	__acquires(&dd->lock)						\
+	__acquires(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)		\
 {									\
 	struct request_queue *q = m->private;				\
 	struct deadline_data *dd = q->elevator->elevator_data;		\
@@ -819,7 +823,7 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,	\
 }									\
 									\
 static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)	\
-	__releases(&dd->lock)						\
+	__releases(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)		\
 {									\
 	struct request_queue *q = m->private;				\
 	struct deadline_data *dd = q->elevator->elevator_data;		\
@@ -921,7 +925,7 @@ static int dd_owned_by_driver_show(void *data, struct seq_file *m)
 }
 
 static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
-	__acquires(&dd->lock)
+	__acquires(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)
 {
 	struct request_queue *q = m->private;
 	struct deadline_data *dd = q->elevator->elevator_data;
@@ -939,7 +943,7 @@ static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
 }
 
 static void deadline_dispatch_stop(struct seq_file *m, void *v)
-	__releases(&dd->lock)
+	__releases(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)
 {
 	struct request_queue *q = m->private;
 	struct deadline_data *dd = q->elevator->elevator_data;
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
index 9f7389f174d0..067d6a27a3bd 100644
--- a/block/partitions/acorn.c
+++ b/block/partitions/acorn.c
@@ -9,6 +9,7 @@
  */
 #include <linux/buffer_head.h>
 #include <linux/adfs_fs.h>
+#include <linux/minmax.h>
 
 #include "check.h"
 
@@ -80,7 +81,7 @@ static int riscix_partition(struct parsed_partitions *state,
 
 
 	if (rr->magic == RISCIX_MAGIC) {
-		unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+		unsigned long size = min(nr_sects, 2);
 		int part;
 
 		seq_buf_puts(&state->pp_buf, " <");
@@ -124,7 +125,7 @@ static int linux_partition(struct parsed_partitions *state,
 {
 	Sector sect;
 	struct linux_part *linuxp;
-	unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+	unsigned long size = min(nr_sects, 2);
 
 	seq_buf_puts(&state->pp_buf, " [Linux]");
 
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 29b8f4cebb63..f3c4174e003e 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -226,6 +226,15 @@ int aix_partition(struct parsed_partitions *state)
 		int next_lp_ix = 1;
 		int lp_ix;
 
+		/*
+		 * pvd was read into a fixed-size struct pvd whose ppe[] array
+		 * holds ARRAY_SIZE(pvd->ppe) entries.  pp_count is an
+		 * unvalidated on-disk __be16, so clamp the scan to the array
+		 * size to avoid walking past the allocation.
+		 */
+		if (numpps > ARRAY_SIZE(pvd->ppe))
+			numpps = ARRAY_SIZE(pvd->ppe);
+
 		for (i = 0; i < numpps; i += 1) {
 			struct ppe *p = pvd->ppe + i;
 			unsigned int lv_ix;
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 5d5332ce586b..b5c59b79ca7c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -124,7 +124,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 	state = allocate_partitions(hd);
 	if (!state)
 		return NULL;
-	state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL);
+	state->pp_buf.buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!state->pp_buf.buffer) {
 		free_partitions(state);
 		return NULL;
@@ -154,7 +154,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 	if (res > 0) {
 		printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
 
-		free_page((unsigned long)state->pp_buf.buffer);
+		kfree(state->pp_buf.buffer);
 		return state;
 	}
 	if (state->access_beyond_eod)
@@ -170,7 +170,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 		printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
 	}
 
-	free_page((unsigned long)state->pp_buf.buffer);
+	kfree(state->pp_buf.buffer);
 	free_partitions(state);
 	return ERR_PTR(res);
 }
diff --git a/block/partitions/of.c b/block/partitions/of.c
index c22b60661098..53664ea06b65 100644
--- a/block/partitions/of.c
+++ b/block/partitions/of.c
@@ -74,8 +74,10 @@ int of_partition(struct parsed_partitions *state)
 	struct device_node *partitions_np = of_node_get(ddev->of_node);
 
 	if (!partitions_np ||
-	    !of_device_is_compatible(partitions_np, "fixed-partitions"))
+	    !of_device_is_compatible(partitions_np, "fixed-partitions")) {
+		of_node_put(partitions_np);
 		return 0;
+	}
 
 	slot = 1;
 	/* Validate parition offset and size */
@@ -104,5 +106,6 @@ int of_partition(struct parsed_partitions *state)
 
 	seq_buf_puts(&state->pp_buf, "\n");
 
+	of_node_put(partitions_np);
 	return 1;
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-16 13:02:47 +0530
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-16 13:02:47 +0530
commit	ba9c792c824fff732df85119011d399d9b6d9155 (patch)
tree	a6de1eda726e4e9156366b6fd960b81d09297aa3 /block
parent	9b40ba14edcdf70240af8114092a76f75f070774 (diff)
parent	c7c76f9232bd34835d821f14abdc5fafc17bc938 (diff)