summary | refs | log | tree | commit | diff
path: root/block
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 13:02:47 +0530
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-16 13:02:47 +0530
commit ba9c792c824fff732df85119011d399d9b6d9155 (patch)
tree a6de1eda726e4e9156366b6fd960b81d09297aa3 /block
parent 9b40ba14edcdf70240af8114092a76f75f070774 (diff)
parent c7c76f9232bd34835d821f14abdc5fafc17bc938 (diff)
Merge tag 'for-7.2/block-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:

 - NVMe pull request via Keith:
     - Per-controller admin and IO timeout sysfs attributes, and letting the block layer set request timeouts (Maurizio, Maximilian)
     - Multipath passthrough iostats, and PCI P2PDMA enablement for multipath devices (Keith, Kiran)
     - A new diag sysfs attribute group exporting per-controller counters (retries, multipath failover, error counters, requeue and failure counts, reset and reconnect events) (Nilay)
     - FDP configuration validation and bounds check fixes (liuxixin)
     - Various nvmet fixes, including a pre-auth out-of-bounds read in the Discovery Get Log Page handler, auth payload bounds validation, and tcp error-path leak fixes (Bryam, Tianchu, Geliang)
     - nvme-tcp lockdep and workqueue fixes (Shin'ichiro, Kuniyuki, Eric)
     - Assorted other fixes and cleanups (John, Yao, Chao, Mateusz, Achkinazi, Wentao)

 - MD pull request via Yu Kuai:
     - raid1/raid10 fixes for a deadlock in the read error recovery path, error-path detection and bio accounting with cloned bios, and an nr_pending leak in the REQ_ATOMIC bad-block error path (Abd-Alrhman)
     - PCI P2PDMA propagation from member devices to the RAID device (Kiran)
     - dm-raid bio requeue fix, and various smaller fixes and cleanups (Benjamin, Chen, Li, Thorsten)

 - Enable Clang lock context analysis for the block layer, with the accompanying annotations across queue limits, the blk_holder_ops callbacks, crypto, cgroup, iocost, kyber and mq-deadline (Bart)

 - Block status code infrastructure work: a tagged status table, a str_to_blk_op() helper, a bio_endio_status() helper, and on top of that a new configurable block-layer error injection facility (Christoph)

 - DRBD netlink rework, replacing the genl_magic machinery with explicit netlink serialization and moving the DRBD UAPI headers to include/uapi/linux/ (Christoph Böhmwalder)

 - bvec improvements: a bvec_folio() helper and making the bvec_iter helpers proper inline functions (Willy, Christoph)

 - ublk cleanups and a canceling-flag fix for the disk-not-allocated case (Caleb, Ming)

 - Partition handling fixes: bound the AIX pp_count scan, fix an of_node refcount leak, and replace __get_free_page() with kmalloc() (Bryam, Wentao, Mike)

 - Convert numa_node to int in blk_mq_hw_ctx and ->init_request, and add WQ_PERCPU to the block workqueue users (Mateusz, Marco)

 - Block statistics and tracing: propagate in-flight to the whole disk on partition IO, export passthrough stats, and a new block_rq_tag_wait tracepoint (Tang, Keith, Aaron)

 - A round of removals, unexports and cleanups across bio, direct-io and the bvec helpers (Christoph)

 - Various driver fixes (mtip32xx use-after-free, rbd snap_count validation and strscpy conversion, nbd socket lockdep reclassify, virtio-blk zone report clamp, floppy) and a batch of MAINTAINERS email/list updates (Coly, Li, Yu, Christoph Böhmwalder)

 - Other little fixes and cleanups all over

* tag 'for-7.2/block-20260615' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (117 commits)
  MAINTAINERS: Update Coly Li's email address
  block: check bio split for unaligned bvec
  nbd: Reclassify sockets to avoid lockdep circular dependency
  block: add configurable error injection
  block: add a str_to_blk_op helper
  block: add a "tag" for block status codes
  block: add a macro to initialize the status table
  floppy: Drop unused pnp driver data
  block: propagate in_flight to whole disk on partition I/O
  virtio-blk: clamp zone report to the report buffer capacity
  block: optimize I/O merge hot path with unlikely() hints
  drivers/block/rbd: Use strscpy() to copy strings into arrays
  partitions: aix: bound the pp_count scan to the ppe array
  block: Enable lock context analysis
  block/mq-deadline: Make the lock context annotations compatible with Clang
  block/Kyber: Make the lock context annotations compatible with Clang
  block/blk-mq-debugfs: Improve lock context annotations
  block/blk-iocost: Inline iocg_lock() and iocg_unlock()
  block/blk-iocost: Split ioc_rqos_throttle()
  block/crypto: Annotate the crypto functions
  ...
Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig 8
-rw-r--r-- block/Makefile 3
-rw-r--r-- block/bdev.c 13
-rw-r--r-- block/bfq-cgroup.c 54
-rw-r--r-- block/bio.c 52
-rw-r--r-- block/blk-cgroup.c 98
-rw-r--r-- block/blk-cgroup.h 13
-rw-r--r-- block/blk-core.c 104
-rw-r--r-- block/blk-crypto-fallback.c 9
-rw-r--r-- block/blk-crypto-profile.c 2
-rw-r--r-- block/blk-crypto.c 3
-rw-r--r-- block/blk-iocost.c 306
-rw-r--r-- block/blk-iolatency.c 19
-rw-r--r-- block/blk-merge.c 17
-rw-r--r-- block/blk-mq-debugfs.c 24
-rw-r--r-- block/blk-mq-tag.c 6
-rw-r--r-- block/blk-mq.c 43
-rw-r--r-- block/blk-settings.c 2
-rw-r--r-- block/blk-sysfs.c 5
-rw-r--r-- block/blk-throttle.c 85
-rw-r--r-- block/blk-zoned.c 2
-rw-r--r-- block/blk.h 32
-rw-r--r-- block/bsg-lib.c 2
-rw-r--r-- block/error-injection.c 315
-rw-r--r-- block/error-injection.h 21
-rw-r--r-- block/fops.c 27
-rw-r--r-- block/genhd.c 4
-rw-r--r-- block/kyber-iosched.c 7
-rw-r--r-- block/mq-deadline.c 12
-rw-r--r-- block/partitions/acorn.c 5
-rw-r--r-- block/partitions/aix.c 9
-rw-r--r-- block/partitions/core.c 6
-rw-r--r-- block/partitions/of.c 5
33 files changed, 884 insertions, 429 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 15027963472d..70e4a66d941f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -221,6 +221,14 @@ config BLOCK_HOLDER_DEPRECATED
config BLK_MQ_STACKING
bool
+config BLK_ERROR_INJECTION
+ bool "Enable block layer error injection"
+ select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+ help
+ Enable inserting arbitrary block errors through a debugfs interface.
+
+ See Documentation/block/error-injection.rst for details.
+
source "block/Kconfig.iosched"
endif # BLOCK
diff --git a/block/Makefile b/block/Makefile
index 7dce2e44276c..e7bd320e3d69 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,6 +3,8 @@
# Makefile for the kernel block layer
#
+CONTEXT_ANALYSIS := y
+
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
@@ -11,6 +13,7 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
+obj-$(CONFIG_BLK_ERROR_INJECTION) += error-injection.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/bdev.c b/block/bdev.c
index bb0ffa3bb4df..85ce57bd2ae4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -446,15 +446,10 @@ EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
- int err;
-
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_ACCOUNT|SLAB_PANIC),
init_once);
- err = register_filesystem(&bd_type);
- if (err)
- panic("Cannot register bdev pseudo-fs");
blockdev_mnt = kern_mount(&bd_type);
if (IS_ERR(blockdev_mnt))
panic("Cannot create bdev pseudo-fs");
@@ -1250,7 +1245,13 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise)
bdev->bd_holder_ops->mark_dead(bdev, surprise);
else {
mutex_unlock(&bdev->bd_holder_lock);
- sync_blockdev(bdev);
+ /*
+ * On surprise removal the device is already gone; syncing is
+ * futile and can hang forever waiting on I/O that will never
+ * complete. Match fs_bdev_mark_dead(), which also skips it.
+ */
+ if (!surprise)
+ sync_blockdev(bdev);
}
invalidate_bdev(bdev);
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index ac83b0668764..0bd0332b3d78 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -300,6 +300,25 @@ static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+ blkg_rwstat_exit(&stats->bytes);
+ blkg_rwstat_exit(&stats->ios);
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ bfq_stat_exit(&stats->time);
+ bfq_stat_exit(&stats->avg_queue_size_sum);
+ bfq_stat_exit(&stats->avg_queue_size_samples);
+ bfq_stat_exit(&stats->dequeue);
+ bfq_stat_exit(&stats->group_wait_time);
+ bfq_stat_exit(&stats->idle_time);
+ bfq_stat_exit(&stats->empty_time);
+#endif
+}
+
struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
struct bfq_entity *group_entity = bfqq->entity.parent;
@@ -321,8 +340,10 @@ static void bfqg_get(struct bfq_group *bfqg)
static void bfqg_put(struct bfq_group *bfqg)
{
- if (refcount_dec_and_test(&bfqg->ref))
+ if (refcount_dec_and_test(&bfqg->ref)) {
+ bfqg_stats_exit(&bfqg->stats);
kfree(bfqg);
+ }
}
static void bfqg_and_blkg_get(struct bfq_group *bfqg)
@@ -433,25 +454,6 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
entity->sched_data = &bfqg->sched_data;
}
-static void bfqg_stats_exit(struct bfqg_stats *stats)
-{
- blkg_rwstat_exit(&stats->bytes);
- blkg_rwstat_exit(&stats->ios);
-#ifdef CONFIG_BFQ_CGROUP_DEBUG
- blkg_rwstat_exit(&stats->merged);
- blkg_rwstat_exit(&stats->service_time);
- blkg_rwstat_exit(&stats->wait_time);
- blkg_rwstat_exit(&stats->queued);
- bfq_stat_exit(&stats->time);
- bfq_stat_exit(&stats->avg_queue_size_sum);
- bfq_stat_exit(&stats->avg_queue_size_samples);
- bfq_stat_exit(&stats->dequeue);
- bfq_stat_exit(&stats->group_wait_time);
- bfq_stat_exit(&stats->idle_time);
- bfq_stat_exit(&stats->empty_time);
-#endif
-}
-
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
if (blkg_rwstat_init(&stats->bytes, gfp) ||
@@ -552,7 +554,6 @@ static void bfq_pd_free(struct blkg_policy_data *pd)
{
struct bfq_group *bfqg = pd_to_bfqg(pd);
- bfqg_stats_exit(&bfqg->stats);
bfqg_put(bfqg);
}
@@ -1051,9 +1052,13 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ return ret;
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx);
if (ret)
- goto out;
+ goto close_bdev;
if (sscanf(ctx.body, "%llu", &v) == 1) {
/* require "default" on dfl */
@@ -1074,8 +1079,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
ret = 0;
}
+
out:
- blkg_conf_exit(&ctx);
+ blkg_conf_unprep(&ctx);
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
diff --git a/block/bio.c b/block/bio.c
index 5f10900b3f42..811a96796202 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -635,15 +635,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
}
EXPORT_SYMBOL(bio_kmalloc);
-void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
+void zero_fill_bio(struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
- __bio_for_each_segment(bv, bio, iter, start)
+ bio_for_each_segment(bv, bio, iter)
memzero_bvec(&bv);
}
-EXPORT_SYMBOL(zero_fill_bio_iter);
+EXPORT_SYMBOL(zero_fill_bio);
/**
* bio_truncate - truncate the bio to small size of @new_size
@@ -1300,7 +1300,7 @@ static void bio_free_folios(struct bio *bio)
int i;
bio_for_each_bvec_all(bv, bio, i) {
- struct folio *folio = page_folio(bv->bv_page);
+ struct folio *folio = bvec_folio(bv);
if (!is_zero_folio(folio))
folio_put(folio);
@@ -1409,7 +1409,7 @@ int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen,
static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
{
- struct folio *folio = page_folio(bv->bv_page);
+ struct folio *folio = bvec_folio(bv);
size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
bv->bv_offset / PAGE_SIZE + 1;
@@ -1443,7 +1443,7 @@ static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
}
- folio_put(page_folio(bio->bi_io_vec[0].bv_page));
+ folio_put(bvec_folio(&bio->bi_io_vec[0]));
}
/**
@@ -1578,26 +1578,6 @@ void __bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(__bio_advance);
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
- struct bio *src, struct bvec_iter *src_iter)
-{
- while (src_iter->bi_size && dst_iter->bi_size) {
- struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
- struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
- unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
- void *src_buf = bvec_kmap_local(&src_bv);
- void *dst_buf = bvec_kmap_local(&dst_bv);
-
- memcpy(dst_buf, src_buf, bytes);
-
- kunmap_local(dst_buf);
- kunmap_local(src_buf);
-
- bio_advance_iter_single(src, src_iter, bytes);
- bio_advance_iter_single(dst, dst_iter, bytes);
- }
-}
-EXPORT_SYMBOL(bio_copy_data_iter);
/**
* bio_copy_data - copy contents of data buffers from one bio to another
@@ -1612,7 +1592,21 @@ void bio_copy_data(struct bio *dst, struct bio *src)
struct bvec_iter src_iter = src->bi_iter;
struct bvec_iter dst_iter = dst->bi_iter;
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ while (src_iter.bi_size && dst_iter.bi_size) {
+ struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf = bvec_kmap_local(&src_bv);
+ void *dst_buf = bvec_kmap_local(&dst_bv);
+
+ memcpy(dst_buf, src_buf, bytes);
+
+ kunmap_local(dst_buf);
+ kunmap_local(src_buf);
+
+ bio_advance_iter_single(src, &src_iter, bytes);
+ bio_advance_iter_single(dst, &dst_iter, bytes);
+ }
}
EXPORT_SYMBOL(bio_copy_data);
@@ -1659,7 +1653,6 @@ void bio_set_pages_dirty(struct bio *bio)
folio_unlock(fi.folio);
}
}
-EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1718,7 +1711,6 @@ defer:
spin_unlock_irqrestore(&bio_dirty_lock, flags);
schedule_work(&bio_dirty_work);
}
-EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
static inline bool bio_remaining_done(struct bio *bio)
{
@@ -1884,7 +1876,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
* create memory pools for biovec's in a bio_set.
* use the global biovec slabs created for general use.
*/
-int biovec_init_pool(mempool_t *pool, int pool_entries)
+static int biovec_init_pool(mempool_t *pool, int pool_entries)
{
struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bc63bd220865..3093c1c03902 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -755,7 +755,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
*
* Initialize @ctx which can be used to parse blkg config input string @input.
* Once initialized, @ctx can be used with blkg_conf_open_bdev() and
- * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
+ * blkg_conf_prep().
*/
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
@@ -771,10 +771,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_init);
* @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
* set to point past the device node prefix.
*
- * This function may be called multiple times on @ctx and the extra calls become
- * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
- * explicitly if bdev access is needed without resolving the blkcg / policy part
- * of @ctx->input. Returns -errno on error.
+ * Returns: -errno on error.
*/
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
@@ -783,8 +780,8 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
struct block_device *bdev;
int key_len;
- if (ctx->bdev)
- return 0;
+ if (WARN_ON_ONCE(ctx->bdev))
+ return -EINVAL;
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
return -EINVAL;
@@ -813,38 +810,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
ctx->bdev = bdev;
return 0;
}
-/*
- * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
- * ensures the correct locking order between freeze queue and q->rq_qos_mutex.
- *
- * This function returns negative error on failure. On success it returns
- * memflags which must be saved and later passed to blkg_conf_exit_frozen
- * for restoring the memalloc scope.
- */
-unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
-{
- int ret;
- unsigned long memflags;
-
- if (ctx->bdev)
- return -EINVAL;
-
- ret = blkg_conf_open_bdev(ctx);
- if (ret < 0)
- return ret;
- /*
- * At this point, we haven’t started protecting anything related to QoS,
- * so we release q->rq_qos_mutex here, which was first acquired in blkg_
- * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
- * the queue to maintain the correct locking order.
- */
- mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
- memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
- mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
-
- return memflags;
-}
+EXPORT_SYMBOL_GPL(blkg_conf_open_bdev);
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
@@ -857,22 +823,20 @@ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
* following MAJ:MIN, @ctx->bdev points to the target block device and
* @ctx->blkg to the blkg being configured.
*
- * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
+ * blkg_conf_open_bdev() must be called on @ctx beforehand. On success, this
* function returns with queue lock held and must be followed by
- * blkg_conf_exit().
+ * blkg_conf_unprep() and then blkg_conf_close_bdev().
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx)
- __acquires(&bdev->bd_queue->queue_lock)
{
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
int ret;
- ret = blkg_conf_open_bdev(ctx);
- if (ret)
- return ret;
+ if (WARN_ON_ONCE(!ctx->bdev))
+ return -EINVAL;
disk = ctx->bdev->bd_disk;
q = disk->queue;
@@ -970,43 +934,29 @@ fail_exit:
EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
- * blkg_conf_exit - clean up per-blkg config update
+ * blkg_conf_unprep - counterpart of blkg_conf_prep()
* @ctx: blkg_conf_ctx initialized with blkg_conf_init()
- *
- * Clean up after per-blkg config update. This function must be called on all
- * blkg_conf_ctx's initialized with blkg_conf_init().
*/
-void blkg_conf_exit(struct blkg_conf_ctx *ctx)
- __releases(&ctx->bdev->bd_queue->queue_lock)
- __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
{
- if (ctx->blkg) {
- spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
- ctx->blkg = NULL;
- }
-
- if (ctx->bdev) {
- mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
- blkdev_put_no_open(ctx->bdev);
- ctx->body = NULL;
- ctx->bdev = NULL;
- }
+ WARN_ON_ONCE(!ctx->blkg);
+ spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
+ ctx->blkg = NULL;
}
-EXPORT_SYMBOL_GPL(blkg_conf_exit);
+EXPORT_SYMBOL_GPL(blkg_conf_unprep);
-/*
- * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used
- * when blkg_conf_open_bdev_frozen is used to open the bdev.
+/**
+ * blkg_conf_close_bdev - counterpart of blkg_conf_open_bdev()
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
*/
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
{
- if (ctx->bdev) {
- struct request_queue *q = ctx->bdev->bd_queue;
-
- blkg_conf_exit(ctx);
- blk_mq_unfreeze_queue(q, memflags);
- }
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+ blkdev_put_no_open(ctx->bdev);
+ ctx->body = NULL;
+ ctx->bdev = NULL;
}
+EXPORT_SYMBOL_GPL(blkg_conf_close_bdev);
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1cce3294634d..f25fecb87c43 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -218,12 +218,15 @@ struct blkg_conf_ctx {
};
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
-int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
-unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
+ __cond_acquires(0, &ctx->bdev->bd_queue->rq_qos_mutex);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- struct blkg_conf_ctx *ctx);
-void blkg_conf_exit(struct blkg_conf_ctx *ctx);
-void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
+ struct blkg_conf_ctx *ctx)
+ __cond_acquires(0, &ctx->bdev->bd_disk->queue->queue_lock);
+void blkg_conf_unprep(struct blkg_conf_ctx *ctx)
+ __releases(ctx->bdev->bd_disk->queue->queue_lock);
+void blkg_conf_close_bdev(struct blkg_conf_ctx *ctx)
+ __releases(&ctx->bdev->bd_queue->rq_qos_mutex);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
diff --git a/block/blk-core.c b/block/blk-core.c
index 17450058ea6d..73a41df98c9a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -50,6 +50,7 @@
#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-ioprio.h"
+#include "error-injection.h"
struct dentry *blk_debugfs_root;
@@ -132,39 +133,56 @@ inline const char *blk_op_str(enum req_op op)
}
EXPORT_SYMBOL_GPL(blk_op_str);
+enum req_op str_to_blk_op(const char *op)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(blk_op_name); i++)
+ if (blk_op_name[i] && !strcmp(blk_op_name[i], op))
+ return (enum req_op)i;
+ return REQ_OP_LAST;
+}
+
+#define ENT(_tag, _errno, _desc) \
+[BLK_STS_##_tag] = { \
+ .errno = _errno, \
+ .tag = __stringify(_tag), \
+ .name = _desc, \
+}
static const struct {
int errno;
+ const char *tag;
const char *name;
} blk_errors[] = {
- [BLK_STS_OK] = { 0, "" },
- [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
- [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
- [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
- [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
- [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
- [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" },
- [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
- [BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
- [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
- [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
- [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
- [BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
+ ENT(OK, 0, ""),
+ ENT(NOTSUPP, -EOPNOTSUPP, "operation not supported"),
+ ENT(TIMEOUT, -ETIMEDOUT, "timeout"),
+ ENT(NOSPC, -ENOSPC, "critical space allocation"),
+ ENT(TRANSPORT, -ENOLINK, "recoverable transport"),
+ ENT(TARGET, -EREMOTEIO, "critical target"),
+ ENT(RESV_CONFLICT, -EBADE, "reservation conflict"),
+ ENT(MEDIUM, -ENODATA, "critical medium"),
+ ENT(PROTECTION, -EILSEQ, "protection"),
+ ENT(RESOURCE, -ENOMEM, "kernel resource"),
+ ENT(DEV_RESOURCE, -EBUSY, "device resource"),
+ ENT(AGAIN, -EAGAIN, "nonblocking retry"),
+ ENT(OFFLINE, -ENODEV, "device offline"),
/* device mapper special case, should not leak out: */
- [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
+ ENT(DM_REQUEUE, -EREMCHG, "dm internal retry"),
/* zone device specific errors */
- [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
- [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
+ ENT(ZONE_OPEN_RESOURCE, -ETOOMANYREFS, "open zones exceeded"),
+ ENT(ZONE_ACTIVE_RESOURCE, -EOVERFLOW, "active zones exceeded"),
/* Command duration limit device-side timeout */
- [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" },
-
- [BLK_STS_INVAL] = { -EINVAL, "invalid" },
+ ENT(DURATION_LIMIT, -ETIME, "duration limit exceeded"),
+ ENT(INVAL, -EINVAL, "invalid"),
/* everything else not covered above: */
- [BLK_STS_IOERR] = { -EIO, "I/O" },
+ ENT(IOERR, -EIO, "I/O"),
};
+#undef ENT
blk_status_t errno_to_blk_status(int errno)
{
@@ -197,7 +215,32 @@ const char *blk_status_to_str(blk_status_t status)
return "<null>";
return blk_errors[idx].name;
}
-EXPORT_SYMBOL_GPL(blk_status_to_str);
+
+const char *blk_status_to_tag(blk_status_t status)
+{
+ int idx = (__force int)status;
+
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors) || !blk_errors[idx].tag))
+ return "<null>";
+ return blk_errors[idx].tag;
+}
+
+blk_status_t tag_to_blk_status(const char *tag)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
+ if (blk_errors[i].tag &&
+ !strcmp(blk_errors[i].tag, tag))
+ return (__force blk_status_t)i;
+ }
+
+ /*
+ * Return BLK_STS_OK for mismatches as this function is intended to
+ * parse error status values.
+ */
+ return BLK_STS_OK;
+}
/**
* blk_sync_queue - cancel any pending callbacks on a queue
@@ -637,12 +680,10 @@ static void __submit_bio(struct bio *bio)
struct gendisk *disk = bio->bi_bdev->bd_disk;
if ((bio->bi_opf & REQ_POLLED) &&
- !(disk->queue->limits.features & BLK_FEAT_POLL)) {
- bio->bi_status = BLK_STS_NOTSUPP;
- bio_endio(bio);
- } else {
+ !(disk->queue->limits.features & BLK_FEAT_POLL))
+ bio_endio_status(bio, BLK_STS_NOTSUPP);
+ else
disk->fops->submit_bio(bio);
- }
blk_queue_exit(disk->queue);
}
@@ -727,6 +768,9 @@ static void __submit_bio_noacct_mq(struct bio *bio)
void submit_bio_noacct_nocheck(struct bio *bio, bool split)
{
+ if (unlikely(blk_error_inject(bio)))
+ return;
+
blk_cgroup_bio_start(bio);
if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
@@ -887,8 +931,7 @@ void submit_bio_noacct(struct bio *bio)
not_supported:
status = BLK_STS_NOTSUPP;
end_io:
- bio->bi_status = status;
- bio_endio(bio);
+ bio_endio_status(bio, status);
}
EXPORT_SYMBOL(submit_bio_noacct);
@@ -1042,7 +1085,7 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
{
part_stat_lock();
update_io_ticks(bdev, start_time, false);
- part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
+ bdev_inc_in_flight(bdev, op);
part_stat_unlock();
return start_time;
@@ -1073,7 +1116,7 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
part_stat_inc(bdev, ios[sgrp]);
part_stat_add(bdev, sectors[sgrp], sectors);
part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
+ bdev_dec_in_flight(bdev, op);
part_stat_unlock();
}
EXPORT_SYMBOL(bdev_end_io_acct);
@@ -1270,7 +1313,6 @@ void blk_io_schedule(void)
else
io_schedule();
}
-EXPORT_SYMBOL_GPL(blk_io_schedule);
int __init blk_dev_init(void)
{
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index ab6924fba280..2a5c52ab74b4 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -362,8 +362,7 @@ static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio)
status = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (status != BLK_STS_OK) {
- src_bio->bi_status = status;
- bio_endio(src_bio);
+ bio_endio_status(src_bio, status);
return;
}
__blk_crypto_fallback_encrypt_bio(src_bio,
@@ -438,8 +437,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
}
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
- bio->bi_status = status;
- bio_endio(bio);
+ bio_endio_status(bio, status);
}
/**
@@ -500,8 +498,7 @@ bool blk_crypto_fallback_bio_prep(struct bio *bio)
if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
&bc->bc_key->crypto_cfg)) {
- bio->bi_status = BLK_STS_NOTSUPP;
- bio_endio(bio);
+ bio_endio_status(bio, BLK_STS_NOTSUPP);
return false;
}
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 4ac74443687a..cf447ba4a66e 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -43,6 +43,7 @@ struct blk_crypto_keyslot {
};
static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+ __acquires(&profile->lock)
{
/*
* Calling into the driver requires profile->lock held and the device
@@ -55,6 +56,7 @@ static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
}
static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+ __releases(&profile->lock)
{
up_write(&profile->lock);
if (profile->dev)
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 856d3c5b1fa0..165c9d2cce07 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -267,8 +267,7 @@ bool __blk_crypto_submit_bio(struct bio *bio)
if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) {
pr_warn_once("%pg: crypto API fallback disabled; failing request.\n",
bdev);
- bio->bi_status = BLK_STS_NOTSUPP;
- bio_endio(bio);
+ bio_endio_status(bio, BLK_STS_NOTSUPP);
return false;
}
return blk_crypto_fallback_bio_prep(bio);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 0cca88a366dc..563cc7dcf348 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -727,26 +727,6 @@ static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
put_cpu_ptr(gcs);
}
-static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
-{
- if (lock_ioc) {
- spin_lock_irqsave(&iocg->ioc->lock, *flags);
- spin_lock(&iocg->waitq.lock);
- } else {
- spin_lock_irqsave(&iocg->waitq.lock, *flags);
- }
-}
-
-static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
-{
- if (unlock_ioc) {
- spin_unlock(&iocg->waitq.lock);
- spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
- } else {
- spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
- }
-}
-
#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>
@@ -1589,9 +1569,17 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
ioc_now(iocg->ioc, &now);
- iocg_lock(iocg, pay_debt, &flags);
- iocg_kick_waitq(iocg, pay_debt, &now);
- iocg_unlock(iocg, pay_debt, &flags);
+ if (pay_debt) {
+ spin_lock_irqsave(&iocg->ioc->lock, flags);
+ spin_lock(&iocg->waitq.lock);
+ iocg_kick_waitq(iocg, pay_debt, &now);
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&iocg->ioc->lock, flags);
+ } else {
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ iocg_kick_waitq(iocg, pay_debt, &now);
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+ }
return HRTIMER_NORESTART;
}
@@ -2614,6 +2602,88 @@ static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
return cost;
}
+enum over_budget_action {
+ action_retry,
+ action_commit,
+ action_wait,
+ action_return,
+};
+
+static enum over_budget_action
+iocg_handle_over_budget(struct rq_qos *rqos, struct ioc_gq *iocg,
+ struct bio *bio, struct ioc_now *now,
+ struct iocg_wait *wait, bool use_debt, bool ioc_locked,
+ u64 abs_cost, u64 cost)
+{
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /*
+ * @iocg must stay activated for debt and waitq handling. Deactivation
+ * is synchronized against both ioc->lock and waitq.lock and we won't
+ * get deactivated as long as we're waiting or have debt, so we're good
+ * if we're activated here. In the unlikely cases that we aren't, just
+ * issue the IO.
+ */
+ if (unlikely(list_empty(&iocg->active_list)))
+ return action_commit;
+
+ /*
+ * We're over budget. If @bio has to be issued regardless, remember
+ * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
+ * off the debt before waking more IOs.
+ *
+ * This way, the debt is continuously paid off each period with the
+ * actual budget available to the cgroup. If we just wound vtime, we
+ * would incorrectly use the current hw_inuse for the entire amount
+ * which, for example, can lead to the cgroup staying blocked for a
+ * long time even with substantially raised hw_inuse.
+ *
+ * An iocg with vdebt should stay online so that the timer can keep
+ * deducting its vdebt and [de]activate use_delay mechanism
+ * accordingly. We don't want to race against the timer trying to
+ * clear them and leave @iocg inactive w/ dangling use_delay heavily
+ * penalizing the cgroup and its descendants.
+ */
+ if (use_debt) {
+ iocg_incur_debt(iocg, abs_cost, now);
+ if (iocg_kick_delay(iocg, now))
+ blkcg_schedule_throttle(rqos->disk,
+ (bio->bi_opf & REQ_SWAP) ==
+ REQ_SWAP);
+ return action_return;
+ }
+
+ /* guarantee that iocgs w/ waiters have maximum inuse */
+ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
+ if (!ioc_locked)
+ return action_retry;
+ lockdep_assert_held(&iocg->ioc->lock);
+ propagate_weights(iocg, iocg->active, iocg->active, true, now);
+ }
+
+ /*
+ * Append self to the waitq and schedule the wakeup timer if we're
+ * the first waiter. The timer duration is calculated based on the
+ * current vrate. vtime and hweight changes can make it too short
+ * or too long. Each wait entry records the absolute cost it's
+ * waiting for to allow re-evaluation using a custom wait entry.
+ *
+ * If too short, the timer simply reschedules itself. If too long,
+ * the period timer will notice and trigger wakeups.
+ *
+ * All waiters are on iocg->waitq and the wait states are
+ * synchronized using waitq.lock.
+ */
+ init_wait_func(&wait->wait, iocg_wake_fn);
+ wait->bio = bio;
+ wait->abs_cost = abs_cost;
+ wait->committed = false; /* will be set true by waker */
+
+ __add_wait_queue_entry_tail(&iocg->waitq, &wait->wait);
+ iocg_kick_waitq(iocg, ioc_locked, now);
+ return action_wait;
+}
+
static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;
@@ -2623,6 +2693,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
struct iocg_wait wait;
u64 abs_cost, cost, vtime;
bool use_debt, ioc_locked;
+ enum over_budget_action action;
unsigned long flags;
/* bypass IOs if disabled, still initializing, or for root cgroup */
@@ -2662,81 +2733,34 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
retry_lock:
- iocg_lock(iocg, ioc_locked, &flags);
-
- /*
- * @iocg must stay activated for debt and waitq handling. Deactivation
- * is synchronized against both ioc->lock and waitq.lock and we won't
- * get deactivated as long as we're waiting or has debt, so we're good
- * if we're activated here. In the unlikely cases that we aren't, just
- * issue the IO.
- */
- if (unlikely(list_empty(&iocg->active_list))) {
- iocg_unlock(iocg, ioc_locked, &flags);
+ if (ioc_locked) {
+ spin_lock_irqsave(&iocg->ioc->lock, flags);
+ spin_lock(&iocg->waitq.lock);
+ action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait,
+ use_debt, ioc_locked, abs_cost,
+ cost);
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&iocg->ioc->lock, flags);
+ } else {
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ action = iocg_handle_over_budget(rqos, iocg, bio, &now, &wait,
+ use_debt, ioc_locked, abs_cost,
+ cost);
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+ }
+ switch (action) {
+ case action_retry:
+ ioc_locked = true;
+ goto retry_lock;
+ case action_commit:
iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
- }
-
- /*
- * We're over budget. If @bio has to be issued regardless, remember
- * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
- * off the debt before waking more IOs.
- *
- * This way, the debt is continuously paid off each period with the
- * actual budget available to the cgroup. If we just wound vtime, we
- * would incorrectly use the current hw_inuse for the entire amount
- * which, for example, can lead to the cgroup staying blocked for a
- * long time even with substantially raised hw_inuse.
- *
- * An iocg with vdebt should stay online so that the timer can keep
- * deducting its vdebt and [de]activate use_delay mechanism
- * accordingly. We don't want to race against the timer trying to
- * clear them and leave @iocg inactive w/ dangling use_delay heavily
- * penalizing the cgroup and its descendants.
- */
- if (use_debt) {
- iocg_incur_debt(iocg, abs_cost, &now);
- if (iocg_kick_delay(iocg, &now))
- blkcg_schedule_throttle(rqos->disk,
- (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
- iocg_unlock(iocg, ioc_locked, &flags);
+ case action_return:
return;
+ case action_wait:
+ break;
}
- /* guarantee that iocgs w/ waiters have maximum inuse */
- if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
- if (!ioc_locked) {
- iocg_unlock(iocg, false, &flags);
- ioc_locked = true;
- goto retry_lock;
- }
- propagate_weights(iocg, iocg->active, iocg->active, true,
- &now);
- }
-
- /*
- * Append self to the waitq and schedule the wakeup timer if we're
- * the first waiter. The timer duration is calculated based on the
- * current vrate. vtime and hweight changes can make it too short
- * or too long. Each wait entry records the absolute cost it's
- * waiting for to allow re-evaluation using a custom wait entry.
- *
- * If too short, the timer simply reschedules itself. If too long,
- * the period timer will notice and trigger wakeups.
- *
- * All waiters are on iocg->waitq and the wait states are
- * synchronized using waitq.lock.
- */
- init_wait_func(&wait.wait, iocg_wake_fn);
- wait.bio = bio;
- wait.abs_cost = abs_cost;
- wait.committed = false; /* will be set true by waker */
-
- __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
- iocg_kick_waitq(iocg, ioc_locked, &now);
-
- iocg_unlock(iocg, ioc_locked, &flags);
-
while (true) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (wait.committed)
@@ -3140,19 +3164,25 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ return ret;
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
if (ret)
- goto err;
+ goto close_bdev;
iocg = blkg_to_iocg(ctx.blkg);
+ ret = -EINVAL;
+
if (!strncmp(ctx.body, "default", 7)) {
v = 0;
} else {
if (!sscanf(ctx.body, "%u", &v))
- goto einval;
+ goto unprep;
if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
- goto einval;
+ goto unprep;
}
spin_lock(&iocg->ioc->lock);
@@ -3161,14 +3191,15 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
weight_updated(iocg, &now);
spin_unlock(&iocg->ioc->lock);
- blkg_conf_exit(&ctx);
- return nbytes;
+ ret = 0;
-einval:
- ret = -EINVAL;
-err:
- blkg_conf_exit(&ctx);
- return ret;
+unprep:
+ blkg_conf_unprep(&ctx);
+
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
+
+ return ret ?: nbytes;
}
static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
@@ -3226,34 +3257,43 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct blkg_conf_ctx ctx;
+ struct request_queue *q;
struct gendisk *disk;
struct ioc *ioc;
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
- unsigned long memflags;
+ unsigned int memflags;
int ret;
blkg_conf_init(&ctx, input);
- memflags = blkg_conf_open_bdev_frozen(&ctx);
- if (IS_ERR_VALUE(memflags)) {
- ret = memflags;
- goto err;
- }
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ return ret;
+ /*
+ * At this point, we haven't started protecting anything related to QoS,
+ * so we release q->rq_qos_mutex here, which was first acquired in
+ * blkg_conf_open_bdev(). Later, we re-acquire q->rq_qos_mutex after
+ * freezing the queue to maintain the correct locking order.
+ */
+ mutex_unlock(&ctx.bdev->bd_queue->rq_qos_mutex);
+
+ memflags = blk_mq_freeze_queue(ctx.bdev->bd_queue);
+ mutex_lock(&ctx.bdev->bd_queue->rq_qos_mutex);
body = ctx.body;
disk = ctx.bdev->bd_disk;
if (!queue_is_mq(disk->queue)) {
ret = -EOPNOTSUPP;
- goto err;
+ goto close_bdev;
}
ioc = q_to_ioc(disk->queue);
if (!ioc) {
ret = blk_iocost_init(disk);
if (ret)
- goto err;
+ goto close_bdev;
ioc = q_to_ioc(disk->queue);
}
@@ -3357,15 +3397,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
blk_mq_unquiesce_queue(disk->queue);
- blkg_conf_exit_frozen(&ctx, memflags);
- return nbytes;
+close_bdev:
+ q = ctx.bdev->bd_queue;
+ blkg_conf_close_bdev(&ctx);
+ blk_mq_unfreeze_queue(q, memflags);
+ return ret ?: nbytes;
+
einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(disk->queue);
ret = -EINVAL;
-err:
- blkg_conf_exit_frozen(&ctx, memflags);
- return ret;
+ goto close_bdev;
}
static u64 ioc_cost_model_prfill(struct seq_file *sf,
@@ -3430,20 +3472,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto err;
+ return ret;
body = ctx.body;
q = bdev_get_queue(ctx.bdev);
if (!queue_is_mq(q)) {
ret = -EOPNOTSUPP;
- goto err;
+ goto close_bdev;
}
ioc = q_to_ioc(q);
if (!ioc) {
ret = blk_iocost_init(ctx.bdev->bd_disk);
if (ret)
- goto err;
+ goto close_bdev;
ioc = q_to_ioc(q);
}
@@ -3454,6 +3496,8 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
+ ret = -EINVAL;
+
while ((p = strsep(&body, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
char buf[32];
@@ -3471,20 +3515,20 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
else if (!strcmp(buf, "user"))
user = true;
else
- goto einval;
+ goto unlock;
continue;
case COST_MODEL:
match_strlcpy(buf, &args[0], sizeof(buf));
if (strcmp(buf, "linear"))
- goto einval;
+ goto unlock;
continue;
}
tok = match_token(p, i_lcoef_tokens, args);
if (tok == NR_I_LCOEFS)
- goto einval;
+ goto unlock;
if (match_u64(&args[0], &v))
- goto einval;
+ goto unlock;
u[tok] = v;
user = true;
}
@@ -3496,24 +3540,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc->user_cost_model = false;
}
ioc_refresh_params(ioc, true);
- spin_unlock_irq(&ioc->lock);
-
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q, memflags);
- blkg_conf_exit(&ctx);
- return nbytes;
+ ret = 0;
-einval:
+unlock:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q, memflags);
- ret = -EINVAL;
-err:
- blkg_conf_exit(&ctx);
- return ret;
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
+ return ret ?: nbytes;
}
static struct cftype ioc_files[] = {
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 53e8dd2dfa8a..1aaee6fb0f59 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -840,7 +840,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out;
+ return ret;
/*
* blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -850,11 +850,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
if (!iolat_rq_qos(ctx.bdev->bd_queue))
ret = blk_iolatency_init(ctx.bdev->bd_disk);
if (ret)
- goto out;
+ goto close_bdev;
ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
if (ret)
- goto out;
+ goto close_bdev;
iolat = blkg_to_lat(ctx.blkg);
p = ctx.body;
@@ -865,7 +865,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
char val[21]; /* 18446744073709551616 */
if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
- goto out;
+ goto unprep;
if (!strcmp(key, "target")) {
u64 v;
@@ -875,9 +875,9 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
else if (sscanf(val, "%llu", &v) == 1)
lat_val = v * NSEC_PER_USEC;
else
- goto out;
+ goto unprep;
} else {
- goto out;
+ goto unprep;
}
}
@@ -889,8 +889,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
if (oldval != iolat->min_lat_nsec)
iolatency_clear_scaling(blkg);
ret = 0;
-out:
- blkg_conf_exit(&ctx);
+
+unprep:
+ blkg_conf_unprep(&ctx);
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fcf09325b22e..ab1161ca69f1 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -122,8 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
if (IS_ERR(split)) {
- bio->bi_status = errno_to_blk_status(PTR_ERR(split));
- bio_endio(bio);
+ bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split)));
return NULL;
}
@@ -143,8 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
{
if (unlikely(split_sectors < 0)) {
- bio->bi_status = errno_to_blk_status(split_sectors);
- bio_endio(bio);
+ bio_endio_status(bio, errno_to_blk_status(split_sectors));
return NULL;
}
@@ -547,7 +545,7 @@ static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
if (!blk_cgroup_mergeable(req, bio))
goto no_merge;
- if (blk_integrity_merge_bio(req->q, req, bio) == false)
+ if (unlikely(!blk_integrity_merge_bio(req->q, req, bio)))
goto no_merge;
/* discard request merge won't add new segment */
@@ -649,7 +647,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (!blk_cgroup_mergeable(req, next->bio))
return 0;
- if (blk_integrity_merge_rq(q, req, next) == false)
+ if (unlikely(!blk_integrity_merge_rq(q, req, next)))
return 0;
if (!bio_crypt_ctx_merge_rq(req, next))
@@ -723,8 +721,7 @@ static void blk_account_io_merge_request(struct request *req)
if (req->rq_flags & RQF_IO_STAT) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_local_dec(req->part,
- in_flight[op_is_write(req_op(req))]);
+ bdev_dec_in_flight(req->part, req_op(req));
part_stat_unlock();
}
}
@@ -905,7 +902,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!blk_cgroup_mergeable(rq, bio))
return false;
- if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
+ if (unlikely(!blk_integrity_merge_bio(rq->q, rq, bio)))
return false;
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
@@ -915,7 +912,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
if (rq->bio->bi_ioprio != bio->bi_ioprio)
return false;
- if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
+ if (unlikely(!blk_atomic_write_mergeable_rq_bio(rq, bio)))
return false;
return true;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..6754d8f9449c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -19,8 +19,10 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
return 0;
}
+#define TO_REQUEST_QUEUE(m) ((struct request_queue *)m->private)
+
static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
- __acquires(&q->requeue_lock)
+ __acquires(&TO_REQUEST_QUEUE(m)->requeue_lock)
{
struct request_queue *q = m->private;
@@ -36,13 +38,15 @@ static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
}
static void queue_requeue_list_stop(struct seq_file *m, void *v)
- __releases(&q->requeue_lock)
+ __releases(&TO_REQUEST_QUEUE(m)->requeue_lock)
{
struct request_queue *q = m->private;
spin_unlock_irq(&q->requeue_lock);
}
+#undef TO_REQUEST_QUEUE
+
static const struct seq_operations queue_requeue_list_seq_ops = {
.start = queue_requeue_list_start,
.next = queue_requeue_list_next,
@@ -297,8 +301,10 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
}
EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show);
+#define TO_HCTX(m) ((struct blk_mq_hw_ctx *)m->private)
+
static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
- __acquires(&hctx->lock)
+ __acquires(&TO_HCTX(m)->lock)
{
struct blk_mq_hw_ctx *hctx = m->private;
@@ -314,13 +320,15 @@ static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
}
static void hctx_dispatch_stop(struct seq_file *m, void *v)
- __releases(&hctx->lock)
+ __releases(&TO_HCTX(m)->lock)
{
struct blk_mq_hw_ctx *hctx = m->private;
spin_unlock(&hctx->lock);
}
+#undef TO_HCTX
+
static const struct seq_operations hctx_dispatch_seq_ops = {
.start = hctx_dispatch_start,
.next = hctx_dispatch_next,
@@ -484,9 +492,11 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
return 0;
}
+#define TO_CTX(m) ((struct blk_mq_ctx *)m->private)
+
#define CTX_RQ_SEQ_OPS(name, type) \
static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
- __acquires(&ctx->lock) \
+ __acquires(&TO_CTX(m)->lock) \
{ \
struct blk_mq_ctx *ctx = m->private; \
\
@@ -503,7 +513,7 @@ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \
} \
\
static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \
- __releases(&ctx->lock) \
+ __releases(&TO_CTX(m)->lock) \
{ \
struct blk_mq_ctx *ctx = m->private; \
\
@@ -521,6 +531,8 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
+#undef TO_CTX
+
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..35deee5bbc73 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
#include <linux/kmemleak.h>
#include <linux/delay.h>
+#include <trace/events/block.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
@@ -181,6 +182,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != BLK_MQ_NO_TAG)
break;
+ /* Log the starvation event before altering task state */
+ trace_block_rq_tag_wait(data->q, data->hctx,
+ data->rq_flags & RQF_SCHED_TAGS,
+ data->flags);
+
sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a24175441380..88cb5acc4f39 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1082,49 +1082,18 @@ static inline void blk_account_io_done(struct request *req, u64 now)
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_local_dec(req->part,
- in_flight[op_is_write(req_op(req))]);
+ bdev_dec_in_flight(req->part, req_op(req));
part_stat_unlock();
}
}
-static inline bool blk_rq_passthrough_stats(struct request *req)
-{
- struct bio *bio = req->bio;
-
- if (!blk_queue_passthrough_stat(req->q))
- return false;
-
- /* Requests without a bio do not transfer data. */
- if (!bio)
- return false;
-
- /*
- * Stats are accumulated in the bdev, so must have one attached to a
- * bio to track stats. Most drivers do not set the bdev for passthrough
- * requests, but nvme is one that will set it.
- */
- if (!bio->bi_bdev)
- return false;
-
- /*
- * We don't know what a passthrough command does, but we know the
- * payload size and data direction. Ensuring the size is aligned to the
- * block size filters out most commands with payloads that don't
- * represent sector access.
- */
- if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
- return false;
- return true;
-}
-
static inline void blk_account_io_start(struct request *req)
{
trace_block_io_start(req);
if (!blk_queue_io_stat(req->q))
return;
- if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
+ if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req, req->q))
return;
req->rq_flags |= RQF_IO_STAT;
@@ -1143,7 +1112,7 @@ static inline void blk_account_io_start(struct request *req)
part_stat_lock();
update_io_ticks(req->part, jiffies, false);
- part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
+ bdev_inc_in_flight(req->part, req_op(req));
part_stat_unlock();
}
@@ -3170,8 +3139,7 @@ void blk_mq_submit_bio(struct bio *bio)
}
if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
- bio->bi_status = BLK_STS_NOTSUPP;
- bio_endio(bio);
+ bio_endio_status(bio, BLK_STS_NOTSUPP);
goto queue_exit;
}
@@ -3215,8 +3183,7 @@ new_request:
ret = blk_crypto_rq_get_keyslot(rq);
if (ret != BLK_STS_OK) {
- bio->bi_status = ret;
- bio_endio(bio);
+ bio_endio_status(bio, ret);
blk_mq_free_request(rq);
return;
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 78c83817b9d3..8274631290db 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -795,6 +795,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->features &= ~BLK_FEAT_NOWAIT;
if (!(b->features & BLK_FEAT_POLL))
t->features &= ~BLK_FEAT_POLL;
+ if (!(b->features & BLK_FEAT_PCI_P2PDMA))
+ t->features &= ~BLK_FEAT_PCI_P2PDMA;
t->flags |= (b->flags & BLK_FLAG_MISALIGNED);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f22c1f253eb3..520972676ab4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -19,6 +19,7 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
+#include "error-injection.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -933,6 +934,8 @@ static void blk_debugfs_remove(struct gendisk *disk)
blk_debugfs_lock_nomemsave(q);
blk_trace_shutdown(q);
+ if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION))
+ blk_error_injection_exit(disk);
debugfs_remove_recursive(q->debugfs_dir);
q->debugfs_dir = NULL;
q->sched_debugfs_dir = NULL;
@@ -963,6 +966,8 @@ int blk_register_queue(struct gendisk *disk)
memflags = blk_debugfs_lock(q);
q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
+ if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION))
+ blk_error_injection_init(disk);
if (queue_is_mq(q))
blk_mq_debugfs_register(q);
blk_debugfs_unlock(q, memflags);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index cabf91f0d0dc..47052ba21d1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1353,21 +1353,21 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out_finish;
+ return ret;
if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
ret = blk_throtl_init(ctx.bdev->bd_disk);
if (ret)
- goto out_finish;
+ goto close_bdev;
}
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
- goto out_finish;
+ goto close_bdev;
ret = -EINVAL;
if (sscanf(ctx.body, "%llu", &v) != 1)
- goto out_finish;
+ goto unprep;
if (!v)
v = U64_MAX;
@@ -1381,8 +1381,12 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
tg_conf_updated(tg, false);
ret = 0;
-out_finish:
- blkg_conf_exit(&ctx);
+
+unprep:
+ blkg_conf_unprep(&ctx);
+
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
@@ -1537,17 +1541,17 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
ret = blkg_conf_open_bdev(&ctx);
if (ret)
- goto out_finish;
+ return ret;
if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
ret = blk_throtl_init(ctx.bdev->bd_disk);
if (ret)
- goto out_finish;
+ goto close_bdev;
}
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
- goto out_finish;
+ goto close_bdev;
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
@@ -1573,11 +1577,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
p = tok;
strsep(&p, "=");
if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
- goto out_finish;
+ goto unprep;
ret = -ERANGE;
if (!val)
- goto out_finish;
+ goto unprep;
ret = -EINVAL;
if (!strcmp(tok, "rbps"))
@@ -1589,7 +1593,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
else
- goto out_finish;
+ goto unprep;
}
tg->bps[READ] = v[0];
@@ -1599,8 +1603,10 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg_conf_updated(tg, false);
ret = 0;
-out_finish:
- blkg_conf_exit(&ctx);
+unprep:
+ blkg_conf_unprep(&ctx);
+close_bdev:
+ blkg_conf_close_bdev(&ctx);
return ret ?: nbytes;
}
@@ -1649,7 +1655,7 @@ static void tg_flush_bios(struct throtl_grp *tg)
*/
tg_update_disptime(tg);
- throtl_schedule_pending_timer(sq, jiffies + 1);
+ throtl_schedule_next_dispatch(sq->parent_sq, true);
}
static void throtl_pd_offline(struct blkg_policy_data *pd)
@@ -1668,11 +1674,52 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_free_fn = throtl_pd_free,
};
+/*
+ * Detach every bio still queued on @tg's service queue and append it to
+ * @cancel_bios[READ] or @cancel_bios[WRITE] according to its direction.
+ *
+ * THROTL_TG_CANCELING is set on first entry, so a repeated call for the
+ * same group returns immediately. For each queued qnode both the iops and
+ * the bps bio lists are drained, the matching nr_queued_* counters are
+ * decremented, the qnode is unlinked and a reference on qn->tg's blkg is
+ * dropped. The per-device nr_queued[] count is reduced by the number of
+ * bios moved, and the group is finally dequeued via throtl_dequeue_tg().
+ *
+ * The bios are only collected here; completing them is left to the caller.
+ */
+static void tg_cancel_writeback_bios(struct throtl_grp *tg,
+ struct bio_list *cancel_bios)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ struct throtl_data *td = sq_to_td(sq);
+ int rw;
+
+ /* already being cancelled, nothing left to do */
+ if (tg->flags & THROTL_TG_CANCELING)
+ return;
+ tg->flags |= THROTL_TG_CANCELING;
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ struct throtl_qnode *qn, *tmp;
+ unsigned int nr_bios = 0;
+
+ /* _safe variant: each qnode is unlinked inside the loop */
+ list_for_each_entry_safe(qn, tmp, &sq->queued[rw], node) {
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&qn->bios_iops))) {
+ sq->nr_queued_iops[rw]--;
+ bio_list_add(&cancel_bios[rw], bio);
+ nr_bios++;
+ }
+ while ((bio = bio_list_pop(&qn->bios_bps))) {
+ sq->nr_queued_bps[rw]--;
+ bio_list_add(&cancel_bios[rw], bio);
+ nr_bios++;
+ }
+
+ list_del_init(&qn->node);
+ blkg_put(tg_to_blkg(qn->tg));
+ }
+
+ /* account for all bios removed in this direction at once */
+ td->nr_queued[rw] -= nr_bios;
+ }
+
+ throtl_dequeue_tg(tg);
+}
+
void blk_throtl_cancel_bios(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
+ struct bio_list cancel_bios[2] = { };
+ int rw;
if (!blk_throtl_activated(q))
return;
@@ -1693,10 +1740,16 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
* Cancel bios here to ensure no bios are inflight after
* del_gendisk.
*/
- tg_flush_bios(blkg_to_tg(blkg));
+ tg_cancel_writeback_bios(blkg_to_tg(blkg), cancel_bios);
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ struct bio *bio;
+ while ((bio = bio_list_pop(&cancel_bios[rw])))
+ bio_io_error(bio);
+ }
}
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6a221c180889..bea817f3de56 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1924,7 +1924,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
goto free_hash;
disk->zone_wplugs_wq =
- alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU,
pool_size, disk->disk_name);
if (!disk->zone_wplugs_wq)
goto destroy_pool;
diff --git a/block/blk.h b/block/blk.h
index b998a7761faf..25af8ac5ef0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
#include <linux/bio-integrity.h>
#include <linux/blk-crypto.h>
+#include <linux/part_stat.h>
#include <linux/lockdep.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
@@ -49,6 +50,11 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);
+const char *blk_status_to_str(blk_status_t status);
+const char *blk_status_to_tag(blk_status_t status);
+blk_status_t tag_to_blk_status(const char *tag);
+enum req_op str_to_blk_op(const char *op);
+
bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
bool blk_queue_start_drain(struct request_queue *q);
bool __blk_freeze_queue_start(struct request_queue *q,
@@ -402,6 +408,8 @@ static inline bool bio_may_need_split(struct bio *bio,
bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done)
return true;
+ if ((bv->bv_offset | bv->bv_len) & lim->dma_alignment)
+ return true;
return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size;
}
@@ -485,6 +493,26 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
q->last_merge = NULL;
}
+/*
+ * Increment the local in_flight counter for @op's direction (indexed by
+ * op_is_write()) on @bdev and, if @bdev is a partition, on the whole
+ * device as well.
+ */
+static inline void bdev_inc_in_flight(struct block_device *bdev,
+ enum req_op op)
+{
+ bool rw = op_is_write(op);
+
+ part_stat_local_inc(bdev, in_flight[rw]);
+ if (bdev_is_partition(bdev))
+ part_stat_local_inc(bdev_whole(bdev), in_flight[rw]);
+}
+
+/*
+ * Decrement the local in_flight counter for @op's direction (indexed by
+ * op_is_write()) on @bdev and, if @bdev is a partition, on the whole
+ * device as well. Counterpart of bdev_inc_in_flight().
+ */
+static inline void bdev_dec_in_flight(struct block_device *bdev,
+ enum req_op op)
+{
+ bool rw = op_is_write(op);
+
+ part_stat_local_dec(bdev, in_flight[rw]);
+ if (bdev_is_partition(bdev))
+ part_stat_local_dec(bdev_whole(bdev), in_flight[rw]);
+}
+
/*
* Internal io_context interface
*/
@@ -754,16 +782,19 @@ static inline void blk_unfreeze_release_lock(struct request_queue *q)
* reclaim from triggering block I/O.
*/
static inline void blk_debugfs_lock_nomemsave(struct request_queue *q)
+ __acquires(&q->debugfs_mutex)
{
mutex_lock(&q->debugfs_mutex);
}
static inline void blk_debugfs_unlock_nomemrestore(struct request_queue *q)
+ __releases(&q->debugfs_mutex)
{
mutex_unlock(&q->debugfs_mutex);
}
static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q)
+ __acquires(&q->debugfs_mutex)
{
unsigned int memflags = memalloc_noio_save();
@@ -773,6 +804,7 @@ static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q
static inline void blk_debugfs_unlock(struct request_queue *q,
unsigned int memflags)
+ __releases(&q->debugfs_mutex)
{
blk_debugfs_unlock_nomemrestore(q);
memalloc_noio_restore(memflags);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index fdb4b290ca68..895db30a7033 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -299,7 +299,7 @@ out:
/* called right after the request is allocated for the request_queue */
static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
+ unsigned int hctx_idx, int numa_node)
{
struct bsg_job *job = blk_mq_rq_to_pdu(req);
diff --git a/block/error-injection.c b/block/error-injection.c
new file mode 100644
index 000000000000..d24c90e9a25f
--- /dev/null
+++ b/block/error-injection.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026 Christoph Hellwig.
+ */
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include "blk.h"
+#include "error-injection.h"
+
+/* One error injection rule, linked on disk->error_injection_list. */
+struct blk_error_inject {
+ /* list linkage; added and removed with the RCU list helpers */
+ struct list_head entry;
+ /* first sector covered by the rule */
+ sector_t start;
+ /* last sector covered by the rule (inclusive) */
+ sector_t end;
+ /* only bios with this operation are matched */
+ enum req_op op;
+ /* status a matching bio is completed with */
+ blk_status_t status;
+
+ /* only inject every 1 / chance times */
+ unsigned int chance;
+};
+
+DEFINE_STATIC_KEY_FALSE(blk_error_injection_enabled);
+
+/*
+ * Check @bio against the error injection rules of its disk.
+ *
+ * The rule list is walked under rcu_read_lock(). A rule fires when its op
+ * equals bio_op(@bio), its sector range overlaps the bio and, for
+ * chance > 1, a random draw modulo chance is zero. The first rule that
+ * fires sets bio->bi_status and completes the bio.
+ *
+ * Returns true if the bio was completed here (the caller must not touch it
+ * any further), false if no rule fired.
+ */
+bool __blk_error_inject(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_error_inject *inj;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inj, &disk->error_injection_list, entry) {
+ if (bio_op(bio) != inj->op)
+ continue;
+ /*
+ * This never matches 0-sized bios like empty WRITEs with
+ * REQ_PREFLUSH or ZONE_RESET_ALL. While adding a special case
+ * for them would be trivial, that means any WRITE rule would
+ * trigger for flushes. So before we can make this work
+ * properly, we'll need to start using REQ_OP_FLUSH for pure
+ * flushes at the bio level like we already do in blk-mq.
+ */
+ if (bio->bi_iter.bi_sector > inj->end ||
+ bio_end_sector(bio) <= inj->start)
+ continue;
+ /* chance <= 1 always injects; otherwise roughly 1 in chance */
+ if (inj->chance > 1 && (get_random_u32() % inj->chance) != 0)
+ continue;
+
+ pr_info_ratelimited("%pg: injecting %s error for %s at sector %llu:%u\n",
+ disk->part0, blk_status_to_str(inj->status),
+ blk_op_str(inj->op), bio->bi_iter.bi_sector,
+ bio_sectors(bio));
+ bio->bi_status = inj->status;
+ /* inj is not used past this point; unlock before completing */
+ rcu_read_unlock();
+ bio_endio(bio);
+ return true;
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/*
+ * Allocate a new injection rule and add it to @disk.
+ *
+ * @op:         operation to match; REQ_OP_LAST is rejected
+ * @start:      first sector of the range
+ * @nr_sectors: length of the range; 0 means "up to U64_MAX"
+ * @status:     status to complete matching bios with; BLK_STS_OK is rejected
+ * @chance:     inject only every 1 / chance times
+ *
+ * Returns 0 on success, -EINVAL for an invalid op/status or a range that
+ * would overflow, -ENOMEM if the rule cannot be allocated and -ENODEV if
+ * the disk is not live.
+ */
+static int error_inject_add(struct gendisk *disk, enum req_op op,
+ sector_t start, u64 nr_sectors, blk_status_t status,
+ unsigned int chance)
+{
+ struct blk_error_inject *inj;
+ int error = -EINVAL;
+
+ if (op == REQ_OP_LAST)
+ return -EINVAL;
+ if (status == BLK_STS_OK)
+ return -EINVAL;
+
+ inj = kzalloc_obj(*inj);
+ if (!inj)
+ return -ENOMEM;
+
+ if (nr_sectors) {
+ /* reject ranges where start + nr_sectors would wrap */
+ if (U64_MAX - nr_sectors < start)
+ goto out_free_inj;
+ inj->end = start + nr_sectors - 1;
+ } else {
+ inj->end = U64_MAX;
+ }
+
+ inj->op = op;
+ inj->start = start;
+ inj->status = status;
+ inj->chance = chance;
+
+ pr_debug_ratelimited("%pg: adding %s injection for %s at sector %llu:%llu\n",
+ disk->part0, blk_status_to_str(status),
+ blk_op_str(op),
+ start, nr_sectors);
+
+ /*
+ * Add to the front of the list so that newer entries can partially
+ * override other entries. This also intentionally allows duplicate
+ * entries as there is no real reason to reject them.
+ */
+ mutex_lock(&disk->error_injection_lock);
+ if (!disk_live(disk)) {
+ mutex_unlock(&disk->error_injection_lock);
+ error = -ENODEV;
+ goto out_free_inj;
+ }
+ /* the static key is bumped once per disk, on its first rule */
+ if (list_empty(&disk->error_injection_list))
+ static_branch_inc(&blk_error_injection_enabled);
+ list_add_rcu(&inj->entry, &disk->error_injection_list);
+ set_bit(GD_ERROR_INJECT, &disk->state);
+ mutex_unlock(&disk->error_injection_lock);
+ return 0;
+
+out_free_inj:
+ kfree(inj);
+ return error;
+}
+
+/*
+ * Remove and free all error injection rules of @disk.
+ *
+ * error_inject_add() only bumps blk_error_injection_enabled when a disk's
+ * rule list goes from empty to non-empty, so the static key counts disks
+ * that currently have rules. Drop that reference only if this disk actually
+ * held one: otherwise a "removeall" on a rule-free disk, or
+ * blk_error_injection_exit() on a disk that never had rules, would
+ * underflow the key or take away the reference held by another disk.
+ */
+static void error_inject_removeall(struct gendisk *disk)
+{
+	struct blk_error_inject *inj;
+	bool had_rules;
+
+	mutex_lock(&disk->error_injection_lock);
+	had_rules = !list_empty(&disk->error_injection_list);
+	clear_bit(GD_ERROR_INJECT, &disk->state);
+	while ((inj = list_first_entry_or_null(&disk->error_injection_list,
+			struct blk_error_inject, entry))) {
+		/* readers may still be walking the list under RCU */
+		list_del_rcu(&inj->entry);
+		kfree_rcu_mightsleep(inj);
+	}
+	if (had_rules)
+		static_branch_dec(&blk_error_injection_enabled);
+	mutex_unlock(&disk->error_injection_lock);
+}
+
+/*
+ * Tokens understood by the error_injection debugfs file.
+ *
+ * The values are distinct bits because the parser ORs every token it sees
+ * into a mask. The low bits are the actions, the bits from 16 upwards are
+ * their parameters. Opt_invalid has no explicit value and simply follows
+ * Opt_chance.
+ */
+enum options {
+ Opt_add = (1u << 0),
+ Opt_removeall = (1u << 1),
+
+ Opt_op = (1u << 16),
+ Opt_start = (1u << 17),
+ Opt_nr_sectors = (1u << 18),
+ Opt_status = (1u << 19),
+ Opt_chance = (1u << 20),
+
+ Opt_invalid,
+};
+
+/* match_token() table mapping option strings to enum options values. */
+static const match_table_t opt_tokens = {
+ { Opt_add, "add", },
+ { Opt_removeall, "removeall", },
+ { Opt_op, "op=%s", },
+ { Opt_start, "start=%u" },
+ { Opt_nr_sectors, "nr_sectors=%u" },
+ { Opt_status, "status=%s" },
+ { Opt_chance, "chance=%u" },
+ /* NULL pattern terminates the table */
+ { Opt_invalid, NULL, },
+};
+
+/*
+ * Translate the "op=" argument in @args into a request operation.
+ *
+ * The result of str_to_blk_op() is stored in @op as is; an unknown name
+ * (REQ_OP_LAST) is only warned about here and still returns 0. The only
+ * error reported is -ENOMEM when the argument cannot be duplicated.
+ */
+static int match_op(substring_t *args, enum req_op *op)
+{
+	char *name = match_strdup(args);
+
+	if (!name)
+		return -ENOMEM;
+
+	*op = str_to_blk_op(name);
+	if (*op == REQ_OP_LAST)
+		pr_warn("invalid op '%s'\n", name);
+
+	kfree(name);
+	return 0;
+}
+
+/*
+ * Translate the "status=" argument in @args into a block status code.
+ *
+ * The result of tag_to_blk_status() is stored in @status as is; a zero
+ * result is only warned about here and still returns 0. The only error
+ * reported is -ENOMEM when the argument cannot be duplicated.
+ */
+static int match_status(substring_t *args, blk_status_t *status)
+{
+	char *name = match_strdup(args);
+
+	if (!name)
+		return -ENOMEM;
+
+	*status = tag_to_blk_status(name);
+	if (*status == BLK_STS_OK)
+		pr_warn("invalid status '%s'\n", name);
+
+	kfree(name);
+	return 0;
+}
+
+/*
+ * Parse one command written to the error_injection file and execute it.
+ *
+ * @options is split in place on ',' and '\n'; empty fields are skipped.
+ * Exactly one action ("add" or "removeall") must be given, a second action
+ * is rejected with -EINVAL. "removeall" does not accept any other option.
+ * For "add" the collected op/start/nr_sectors/status/chance values are
+ * handed to error_inject_add(), which validates op and status.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static ssize_t blk_error_injection_parse_options(struct gendisk *disk,
+ char *options)
+{
+ enum { Unset, Add, Removeall } action = Unset;
+ unsigned int option_mask = 0, chance = 1;
+ enum req_op op = REQ_OP_LAST;
+ u64 start = 0, nr_sectors = 0;
+ blk_status_t status = BLK_STS_OK;
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+
+ while ((p = strsep(&options, ",\n")) != NULL) {
+ int error = 0;
+ ssize_t token;
+
+ if (!*p)
+ continue;
+ token = match_token(p, opt_tokens, args);
+ /* remember every token seen, used for the removeall check below */
+ option_mask |= token;
+ switch (token) {
+ case Opt_add:
+ if (action != Unset)
+ return -EINVAL;
+ action = Add;
+ break;
+ case Opt_removeall:
+ if (action != Unset)
+ return -EINVAL;
+ action = Removeall;
+ break;
+ case Opt_op:
+ error = match_op(args, &op);
+ break;
+ case Opt_start:
+ error = match_u64(args, &start);
+ break;
+ case Opt_nr_sectors:
+ error = match_u64(args, &nr_sectors);
+ break;
+ case Opt_status:
+ error = match_status(args, &status);
+ break;
+ case Opt_chance:
+ error = match_uint(args, &chance);
+ /* chance is used as a modulus, so 0 is not allowed */
+ if (!error && chance == 0)
+ error = -EINVAL;
+ break;
+ default:
+ pr_warn("unknown parameter or missing value '%s'\n", p);
+ error = -EINVAL;
+ }
+ if (error)
+ return error;
+ }
+
+ switch (action) {
+ case Add:
+ return error_inject_add(disk, op, start, nr_sectors, status,
+ chance);
+ case Removeall:
+ /* removeall takes no parameters */
+ if (option_mask & ~Opt_removeall)
+ return -EINVAL;
+ error_inject_removeall(disk);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * debugfs ->write handler: copy the user buffer into a NUL-terminated
+ * kernel string and feed it to the option parser for the disk stored in
+ * the inode's i_private. Returns @count on success, a negative errno
+ * otherwise.
+ */
+static ssize_t blk_error_injection_write(struct file *file,
+		const char __user *ubuf, size_t count, loff_t *pos)
+{
+	struct gendisk *disk = file_inode(file)->i_private;
+	char *buf = memdup_user_nul(ubuf, count);
+	int ret;
+
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	ret = blk_error_injection_parse_options(disk, buf);
+	kfree(buf);
+
+	if (!ret)
+		return count;
+	return ret;
+}
+
+/*
+ * seq_file show callback: print one line per injection rule of the disk in
+ * the form "<start>:<end> status=<tag>,chance=<n>", walking the rule list
+ * under rcu_read_lock().
+ *
+ * NOTE(review): the rule's op is not part of the output, so rules that
+ * differ only in op print identically - confirm this is intended.
+ */
+static int blk_error_injection_show(struct seq_file *s, void *private)
+{
+ struct gendisk *disk = s->private;
+ struct blk_error_inject *inj;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inj, &disk->error_injection_list, entry) {
+ seq_printf(s, "%llu:%llu status=%s,chance=%u",
+ inj->start, inj->end,
+ blk_status_to_tag(inj->status), inj->chance);
+ seq_putc(s, '\n');
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/* ->open handler: set up a single-record seq_file bound to inode->i_private. */
+static int blk_error_injection_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, blk_error_injection_show, inode->i_private);
+}
+
+/* ->release handler: counterpart of the single_open() done at open time. */
+static int blk_error_injection_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+/*
+ * File operations of the "error_injection" debugfs file: reads list the
+ * current rules through seq_file, writes add or remove rules.
+ */
+static const struct file_operations blk_error_injection_fops = {
+ .owner = THIS_MODULE,
+ .write = blk_error_injection_write,
+ .read = seq_read,
+ .open = blk_error_injection_open,
+ .release = blk_error_injection_release,
+};
+
+/*
+ * Create the "error_injection" file (mode 0600) in the queue's debugfs
+ * directory, with @disk as its private data.
+ */
+void blk_error_injection_init(struct gendisk *disk)
+{
+ debugfs_create_file("error_injection", 0600, disk->queue->debugfs_dir,
+ disk, &blk_error_injection_fops);
+}
+
+/* Tear down error injection for @disk by dropping all of its rules. */
+void blk_error_injection_exit(struct gendisk *disk)
+{
+ error_inject_removeall(disk);
+}
diff --git a/block/error-injection.h b/block/error-injection.h
new file mode 100644
index 000000000000..9821d773abab
--- /dev/null
+++ b/block/error-injection.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_ERROR_INJECTION_H
+#define _BLK_ERROR_INJECTION_H 1
+
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_FALSE(blk_error_injection_enabled);
+
+void blk_error_injection_init(struct gendisk *disk);
+void blk_error_injection_exit(struct gendisk *disk);
+bool __blk_error_inject(struct bio *bio);
+static inline bool blk_error_inject(struct bio *bio)
+{ /* gate in front of __blk_error_inject(): returns false unless all three checks below pass */
+ if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION) && /* feature is compiled in */
+ static_branch_unlikely(&blk_error_injection_enabled) && /* static key is switched on */
+ test_bit(GD_ERROR_INJECT, &bio->bi_bdev->bd_disk->state)) /* flag is set on the bio's disk */
+ return __blk_error_inject(bio); /* NOTE(review): meaning of a true result is defined by __blk_error_inject(), not visible here */
+ return false;
+}
+
+#endif /* _BLK_ERROR_INJECTION_H */
diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..15783a6180de 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -218,8 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
if (unlikely(ret)) {
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
+ bio_endio_status(bio, BLK_STS_IOERR);
break;
}
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -499,36 +498,12 @@ static void blkdev_readahead(struct readahead_control *rac)
mpage_readahead(rac, blkdev_get_block);
}
-static int blkdev_write_begin(const struct kiocb *iocb,
- struct address_space *mapping, loff_t pos,
- unsigned len, struct folio **foliop,
- void **fsdata)
-{
- return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
-}
-
-static int blkdev_write_end(const struct kiocb *iocb,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- int ret;
- ret = block_write_end(pos, len, copied, folio);
-
- folio_unlock(folio);
- folio_put(folio);
-
- return ret;
-}
-
const struct address_space_operations def_blk_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = blkdev_read_folio,
.readahead = blkdev_readahead,
.writepages = blkdev_writepages,
- .write_begin = blkdev_write_begin,
- .write_end = blkdev_write_end,
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e9..f84b6a355b57 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1486,6 +1486,10 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
+#ifdef CONFIG_BLK_ERROR_INJECTION
+ mutex_init(&disk->error_injection_lock);
+ INIT_LIST_HEAD(&disk->error_injection_list);
+#endif
mutex_init(&disk->rqos_state_mutex);
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
return disk;
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b84163d1f851..971818bcdc9d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -882,6 +882,9 @@ static const struct elv_fs_entry kyber_sched_attrs[] = {
};
#undef KYBER_LAT_ATTR
+#define HCTX_FROM_SEQ_FILE(m) ((struct blk_mq_hw_ctx *)(m)->private) /* seq_file private pointer cast to the hw ctx */
+#define KYBER_HCTX_DATA(hctx) ((struct kyber_hctx_data *)(hctx)->sched_data) /* hctx->sched_data cast to kyber's per-hctx data; both macros name the lock in the __acquires/__releases annotations below */
+
#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
@@ -894,7 +897,7 @@ static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
} \
\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
- __acquires(&khd->lock) \
+ __acquires(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock) \
{ \
struct blk_mq_hw_ctx *hctx = m->private; \
struct kyber_hctx_data *khd = hctx->sched_data; \
@@ -913,7 +916,7 @@ static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
} \
\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
- __releases(&khd->lock) \
+ __releases(&KYBER_HCTX_DATA(HCTX_FROM_SEQ_FILE(m))->lock) \
{ \
struct blk_mq_hw_ctx *hctx = m->private; \
struct kyber_hctx_data *khd = hctx->sched_data; \
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 95917a88976f..824bfc17b2c6 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -794,11 +794,15 @@ static const struct elv_fs_entry deadline_attrs[] = {
__ATTR_NULL
};
+#define RQ_FROM_SEQ_FILE(m) ((struct request_queue *)(m)->private) /* seq_file private pointer cast to the request queue */
+#define DD_DATA_FROM_RQ(rq) \
+ ((struct deadline_data *)(rq)->elevator->elevator_data) /* elevator_data cast to deadline_data; both macros name the lock in the __acquires/__releases annotations below */
+
#ifdef CONFIG_BLK_DEBUG_FS
#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \
static void *deadline_##name##_fifo_start(struct seq_file *m, \
loff_t *pos) \
- __acquires(&dd->lock) \
+ __acquires(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock) \
{ \
struct request_queue *q = m->private; \
struct deadline_data *dd = q->elevator->elevator_data; \
@@ -819,7 +823,7 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
} \
\
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
- __releases(&dd->lock) \
+ __releases(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock) \
{ \
struct request_queue *q = m->private; \
struct deadline_data *dd = q->elevator->elevator_data; \
@@ -921,7 +925,7 @@ static int dd_owned_by_driver_show(void *data, struct seq_file *m)
}
static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
- __acquires(&dd->lock)
+ __acquires(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)
{
struct request_queue *q = m->private;
struct deadline_data *dd = q->elevator->elevator_data;
@@ -939,7 +943,7 @@ static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
}
static void deadline_dispatch_stop(struct seq_file *m, void *v)
- __releases(&dd->lock)
+ __releases(&DD_DATA_FROM_RQ(RQ_FROM_SEQ_FILE(m))->lock)
{
struct request_queue *q = m->private;
struct deadline_data *dd = q->elevator->elevator_data;
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
index 9f7389f174d0..067d6a27a3bd 100644
--- a/block/partitions/acorn.c
+++ b/block/partitions/acorn.c
@@ -9,6 +9,7 @@
*/
#include <linux/buffer_head.h>
#include <linux/adfs_fs.h>
+#include <linux/minmax.h>
#include "check.h"
@@ -80,7 +81,7 @@ static int riscix_partition(struct parsed_partitions *state,
if (rr->magic == RISCIX_MAGIC) {
- unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+ unsigned long size = min(nr_sects, 2);
int part;
seq_buf_puts(&state->pp_buf, " <");
@@ -124,7 +125,7 @@ static int linux_partition(struct parsed_partitions *state,
{
Sector sect;
struct linux_part *linuxp;
- unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+ unsigned long size = min(nr_sects, 2);
seq_buf_puts(&state->pp_buf, " [Linux]");
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 29b8f4cebb63..f3c4174e003e 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -226,6 +226,15 @@ int aix_partition(struct parsed_partitions *state)
int next_lp_ix = 1;
int lp_ix;
+ /*
+ * pvd was read into a fixed-size struct pvd whose ppe[] array
+ * holds ARRAY_SIZE(pvd->ppe) entries. pp_count is an
+ * unvalidated on-disk __be16, so clamp the scan to the array
+ * size to avoid walking past the allocation.
+ */
+ if (numpps > ARRAY_SIZE(pvd->ppe))
+ numpps = ARRAY_SIZE(pvd->ppe);
+
for (i = 0; i < numpps; i += 1) {
struct ppe *p = pvd->ppe + i;
unsigned int lv_ix;
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 5d5332ce586b..b5c59b79ca7c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -124,7 +124,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
state = allocate_partitions(hd);
if (!state)
return NULL;
- state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL);
+ state->pp_buf.buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!state->pp_buf.buffer) {
free_partitions(state);
return NULL;
@@ -154,7 +154,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
if (res > 0) {
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
- free_page((unsigned long)state->pp_buf.buffer);
+ kfree(state->pp_buf.buffer);
return state;
}
if (state->access_beyond_eod)
@@ -170,7 +170,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
}
- free_page((unsigned long)state->pp_buf.buffer);
+ kfree(state->pp_buf.buffer);
free_partitions(state);
return ERR_PTR(res);
}
diff --git a/block/partitions/of.c b/block/partitions/of.c
index c22b60661098..53664ea06b65 100644
--- a/block/partitions/of.c
+++ b/block/partitions/of.c
@@ -74,8 +74,10 @@ int of_partition(struct parsed_partitions *state)
struct device_node *partitions_np = of_node_get(ddev->of_node);
if (!partitions_np ||
- !of_device_is_compatible(partitions_np, "fixed-partitions"))
+ !of_device_is_compatible(partitions_np, "fixed-partitions")) {
+ of_node_put(partitions_np);
return 0;
+ }
slot = 1;
/* Validate parition offset and size */
@@ -104,5 +106,6 @@ int of_partition(struct parsed_partitions *state)
seq_buf_puts(&state->pp_buf, "\n");
+ of_node_put(partitions_np);
return 1;
}