From 90138237a56282f99d71130ab7c68903a2eba5a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:20 +0200 Subject: block: remove the unused blk_queue_state enum Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12b9dbcc980e..9e367509bea1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -262,11 +262,6 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, /* reset timer and try again */ }; -enum blk_queue_state { - Queue_down, - Queue_up, -}; - #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ -- cgit v1.2.3 From 713e4e11088875bf7aee3df81b167c8b2f6c5d65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:21 +0200 Subject: block: remove the cmd_size field from struct request_queue Entirely unused. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9e367509bea1..6737e484280d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -544,8 +544,6 @@ struct request_queue { bool mq_sysfs_init_done; - size_t cmd_size; - #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -- cgit v1.2.3 From 9778ac77c2027827ffdbb33d3e936b3a0ae9f0f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:22 +0200 Subject: block: remove the struct blk_queue_ctx forward declaration This type doesn't exist at all, so no need to forward declare it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6737e484280d..4bcbb1ae2d66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -253,8 +253,6 @@ static inline unsigned short req_get_ioprio(struct request *req) #include -struct blk_queue_ctx; - struct bio_vec; enum blk_eh_timer_return { -- cgit v1.2.3 From 2e9bc3465ac54d282b855b073409c2c3a7d1ae00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:23 +0200 Subject: block: move elevator.h to block/ Except for the features passed to blk_queue_required_elevator_features, elevator.h is only needed internally to the block layer. Move the ELEVATOR_F_* definitions to blkdev.h, and then move elevator.h to block/, dropping all the spurious includes outside of that.
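With the flags now living in blkdev.h, callers need nothing beyond that header. A minimal sketch, assuming a hypothetical driver init path (the function and flag are exactly as declared in the hunk below):

    #include <linux/blkdev.h>

    /* hypothetical driver setup: require an I/O scheduler that preserves
     * the sequential write order mandated by zoned devices */
    static void example_init_queue(struct request_queue *q)
    {
            blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
    }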
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-13-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4bcbb1ae2d66..d815238f61ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -251,8 +251,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -#include - struct bio_vec; enum blk_eh_timer_return { @@ -1124,6 +1122,15 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); + +/* + * Elevator features for blk_queue_required_elevator_features: + */ +/* Supports zoned block devices sequential write constraint */ +#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) +/* Supports scheduling on multiple hardware queues */ +#define ELEVATOR_F_MQ_AWARE (1U << 1) + extern void blk_queue_required_elevator_features(struct request_queue *q, unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, -- cgit v1.2.3 From 3ab0bc78e96bd03a5096e4801550926d81b3e19d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:24 +0200 Subject: block: drop unused includes in <linux/blkdev.h> Drop various includes not actually used in blkdev.h itself. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d815238f61ed..46a703394f7f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -3,8 +3,6 @@ #define _LINUX_BLKDEV_H #include -#include -#include #include #include #include @@ -12,17 +10,12 @@ #include #include #include -#include -#include #include -#include #include -#include #include #include #include #include -#include #include struct module; -- cgit v1.2.3 From badf7f64378796d460c79eb0f182fa7282eb65d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:26 +0200 Subject: block: move a few merge helpers out of <linux/blkdev.h> These are block-layer internal helpers, so move them to block/blk.h and block/blk-merge.c. Also update a comment a bit to use better grammar.
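The moved helpers remain available to block/ internals only. An illustrative internal call site, assuming the block/blk.h declarations (blk_rq_merge_ok() is the existing merge-validity check in block/blk-merge.c; the wrapper is hypothetical):

    /* block/-internal sketch: may this bio be merged into rq at all? */
    static bool example_bio_may_merge(struct request *rq, struct bio *bio)
    {
            return rq_mergeable(rq) && blk_rq_merge_ok(rq, bio);
    }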
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-16-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 64 -------------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46a703394f7f..be534040ca9c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -745,37 +745,6 @@ static inline bool rq_is_sync(struct request *rq) return op_is_sync(rq->cmd_flags); } -static inline bool rq_mergeable(struct request *rq) -{ - if (blk_rq_is_passthrough(rq)) - return false; - - if (req_op(rq) == REQ_OP_FLUSH) - return false; - - if (req_op(rq) == REQ_OP_WRITE_ZEROES) - return false; - - if (req_op(rq) == REQ_OP_ZONE_APPEND) - return false; - - if (rq->cmd_flags & REQ_NOMERGE_FLAGS) - return false; - if (rq->rq_flags & RQF_NOMERGE_FLAGS) - return false; - - return true; -} - -static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) -{ - if (bio_page(a) == bio_page(b) && - bio_offset(a) == bio_offset(b)) - return true; - - return false; -} - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) @@ -1030,23 +999,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, return min(q->limits.max_sectors, chunk_sectors); } -static inline unsigned int blk_rq_get_max_sectors(struct request *rq, - sector_t offset) -{ - struct request_queue *q = rq->q; - - if (blk_rq_is_passthrough(rq)) - return q->limits.max_hw_sectors; - - if (!q->limits.chunk_sectors || - req_op(rq) == REQ_OP_DISCARD || - req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); - - return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); -} - static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; @@ -1490,22 +1442,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector return offset << SECTOR_SHIFT; } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. - */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From fe45e630a1035aea94c29016f2598bbde149bbe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:27 +0200 Subject: block: move integrity handling out of <linux/blkdev.h> Split the integrity/metadata handling definitions out into a new header.
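Code outside the block core that still needs these definitions must now pull in the new header explicitly. A minimal sketch, assuming the split-out header is <linux/blk-integrity.h> as merged upstream (the wrapper function is hypothetical):

    #include <linux/blk-integrity.h>

    /* does this disk have an integrity profile registered? */
    static bool example_disk_has_pi(struct gendisk *disk)
    {
            return blk_get_integrity(disk) != NULL; /* NULL without a profile */
    }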
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 183 ------------------------------------------------- 1 file changed, 183 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index be534040ca9c..56e60e5c09d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1555,189 +1555,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, - BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, -}; - -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; -}; - -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; - integrity_complete_fn *complete_fn; - const char *name; -}; - -extern void blk_integrity_register(struct gendisk *, struct blk_integrity *); -extern void blk_integrity_unregister(struct gendisk *); -extern int blk_integrity_compare(struct gendisk *, struct gendisk *); -extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, - struct scatterlist *); -extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); - -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->profile) - return NULL; - - return bi; -} - -static inline -struct blk_integrity *bdev_get_integrity(struct block_device *bdev) -{ - return blk_get_integrity(bdev->bd_disk); -} - -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - -static inline bool blk_integrity_rq(struct request *rq) -{ - return rq->cmd_flags & REQ_INTEGRITY; -} - -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - -static inline unsigned short -queue_max_integrity_segments(const struct request_queue *q) -{ - return q->limits.max_integrity_segments; -} - -/** - * bio_integrity_intervals - Return number of integrity intervals for a bio - * @bi: blk_integrity profile for device - * @sectors: Size of the bio in 512-byte sectors - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the data integrity - * interval size of the storage device. Convert the block layer sectors - * to the appropriate number of integrity intervals. - */ -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return sectors >> (bi->interval_exp - 9); -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_intervals(bi, sectors) * bi->tuple_size; -} - -/* - * Return the first bvec that contains integrity data. 
Only drivers that are - * limited to a single integrity segment should use this helper. - */ -static inline struct bio_vec *rq_integrity_vec(struct request *rq) -{ - if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) - return NULL; - return rq->bio->bi_integrity->bip_vec; -} - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -struct bio; -struct block_device; -struct gendisk; -struct blk_integrity; - -static inline int blk_integrity_rq(struct request *rq) -{ - return 0; -} -static inline int blk_rq_count_integrity_sg(struct request_queue *q, - struct bio *b) -{ - return 0; -} -static inline int blk_rq_map_integrity_sg(struct request_queue *q, - struct bio *b, - struct scatterlist *s) -{ - return 0; -} -static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) -{ - return NULL; -} -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) -{ - return NULL; -} -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return false; -} -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} -static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) -{ - return 0; -} - -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return 0; -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return 0; -} - -static inline struct bio_vec *rq_integrity_vec(struct request *rq) -{ - return NULL; -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_BLK_INLINE_ENCRYPTION bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); -- cgit v1.2.3 From 24b83deb29b7f06a5573b65f2ce96f5482755d43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:28 +0200 Subject: block: move struct request to blk-mq.h struct request is only used by blk-mq drivers, so move it and all related declarations to blk-mq.h. 
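Drivers now get struct request and its accessors from blk-mq.h. A small sketch using two of the moved helpers (the tracing function is hypothetical):

    #include <linux/blk-mq.h>       /* new home of struct request */

    static void example_trace_rq(struct request *rq)
    {
            pr_debug("%s at sector %llu\n",
                     rq_data_dir(rq) == WRITE ? "write" : "read",
                     (unsigned long long)blk_rq_pos(rq));
    }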
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-18-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 469 +------------------------------------------------ 1 file changed, 1 insertion(+), 468 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 56e60e5c09d0..0e960d74615e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -32,9 +31,6 @@ struct blk_queue_stats; struct blk_stat_callback; struct blk_keyslot_manager; -#define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 /* Default maximum */ - /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -47,213 +43,12 @@ struct blk_keyslot_manager; */ #define BLKCG_MAX_POLS 6 -typedef void (rq_end_io_fn)(struct request *, blk_status_t); - -/* - * request flags */ -typedef __u32 __bitwise req_flags_t; - -/* drive already may have started this one */ -#define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* may not be passed by ioscheduler */ -#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) -/* request for flush sequence */ -#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) -/* merge of different types, fail separately */ -#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) -/* track inflight for MQ */ -#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) -/* don't call prep for this one */ -#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* vaguely specified driver internal error. Ignored by the block layer */ -#define RQF_FAILED ((__force req_flags_t)(1 << 10)) -/* don't warn about errors */ -#define RQF_QUIET ((__force req_flags_t)(1 << 11)) -/* elevator private data attached */ -#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) -/* account into disk and partition IO statistics */ -#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* runtime pm request */ -#define RQF_PM ((__force req_flags_t)(1 << 15)) -/* on IO scheduler merge hash */ -#define RQF_HASHED ((__force req_flags_t)(1 << 16)) -/* track IO completion time */ -#define RQF_STATS ((__force req_flags_t)(1 << 17)) -/* Look at ->special_vec for the actual data payload instead of the - bio chain. */ -#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The per-zone write lock is held for this request */ -#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) -/* already slept for hybrid poll */ -#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) -/* ->timeout has been called, don't expire again */ -#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) - -/* flags that prevent us from merging requests: */ -#define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) - -/* - * Request state for blk-mq. - */ -enum mq_rq_state { - MQ_RQ_IDLE = 0, - MQ_RQ_IN_FLIGHT = 1, - MQ_RQ_COMPLETE = 2, -}; - -/* - * Try to put the fields that are referenced together in the same cacheline. - * - * If you modify this structure, make sure to update blk_rq_init() and - * especially blk_mq_rq_ctx_init() to take care of the added fields. 
- */ -struct request { - struct request_queue *q; - struct blk_mq_ctx *mq_ctx; - struct blk_mq_hw_ctx *mq_hctx; - - unsigned int cmd_flags; /* op and common flags */ - req_flags_t rq_flags; - - int tag; - int internal_tag; - - /* the following two fields are internal, NEVER access directly */ - unsigned int __data_len; /* total data len */ - sector_t __sector; /* sector cursor */ - - struct bio *bio; - struct bio *biotail; - - struct list_head queuelist; - - /* - * The hash is used inside the scheduler, and killed once the - * request reaches the dispatch list. The ipi_list is only used - * to queue the request for softirq completion, which is long - * after the request has been unhashed (and even removed from - * the dispatch list). - */ - union { - struct hlist_node hash; /* merge hash */ - struct llist_node ipi_list; - }; - - /* - * The rb_node is only used inside the io scheduler, requests - * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. - */ - union { - struct rb_node rb_node; /* sort/lookup */ - struct bio_vec special_vec; - void *completion_data; - int error_count; /* for legacy drivers, don't use */ - }; - - /* - * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. Flush requests are - * never put on the IO scheduler. So let the flush fields share - * space with the elevator data. - */ - union { - struct { - struct io_cq *icq; - void *priv[2]; - } elv; - - struct { - unsigned int seq; - struct list_head list; - rq_end_io_fn *saved_end_io; - } flush; - }; - - struct gendisk *rq_disk; - struct block_device *part; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - /* Time that the first bio started allocating this request. */ - u64 alloc_time_ns; -#endif - /* Time that this request was allocated for this IO. */ - u64 start_time_ns; - /* Time that I/O was submitted to the device. */ - u64 io_start_time_ns; - -#ifdef CONFIG_BLK_WBT - unsigned short wbt_flags; -#endif - /* - * rq sectors used for blk stats. It has the same value - * with blk_rq_sectors(rq), except that it never be zeroed - * by completion. - */ - unsigned short stats_sectors; - - /* - * Number of scatter-gather DMA addr+len pairs after - * physical address coalescing is performed. - */ - unsigned short nr_phys_segments; - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - unsigned short nr_integrity_segments; -#endif - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - struct bio_crypt_ctx *crypt_ctx; - struct blk_ksm_keyslot *crypt_keyslot; -#endif - - unsigned short write_hint; - unsigned short ioprio; - - enum mq_rq_state state; - refcount_t ref; - - unsigned int timeout; - unsigned long deadline; - - union { - struct __call_single_data csd; - u64 fifo_time; - }; - - /* - * completion callback. - */ - rq_end_io_fn *end_io; - void *end_io_data; -}; - static inline bool blk_op_is_passthrough(unsigned int op) { op &= REQ_OP_MASK; return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } -static inline bool blk_rq_is_passthrough(struct request *rq) -{ - return blk_op_is_passthrough(req_op(rq)); -} - -static inline unsigned short req_get_ioprio(struct request *req) -{ - return req->ioprio; -} - -struct bio_vec; - -enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ -}; - -#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ - /* * Zoned block device models (zoned limit). 
* @@ -620,11 +415,6 @@ extern void blk_clear_pm_only(struct request_queue *q); #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) -#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) - -#define rq_dma_dir(rq) \ - (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) - #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) @@ -740,11 +530,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q) } #endif /* CONFIG_BLK_DEV_ZONED */ -static inline bool rq_is_sync(struct request *rq) -{ - return op_is_sync(rq->cmd_flags); -} - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) @@ -759,83 +544,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) -struct rq_map_data { - struct page **pages; - int page_order; - int nr_entries; - unsigned long offset; - int null_mapped; - int from_user; -}; - -struct req_iterator { - struct bvec_iter iter; - struct bio *bio; -}; - /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) -#define __rq_for_each_bio(_bio, rq) \ - if ((rq->bio)) \ - for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) - -#define rq_for_each_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_segment(bvl, _iter.bio, _iter.iter) -#define rq_for_each_bvec(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_bvec(bvl, _iter.bio, _iter.iter) - -#define rq_iter_last(bvec, _iter) \ - (_iter.bio->bi_next == NULL && \ - bio_iter_last(bvec, _iter.iter)) - -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -extern void rq_flush_dcache_pages(struct request *rq); -#else -static inline void rq_flush_dcache_pages(struct request *rq) -{ -} -#endif extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); blk_qc_t submit_bio_noacct(struct bio *bio); -extern void blk_rq_init(struct request_queue *q, struct request *rq); -extern void blk_put_request(struct request *); -extern struct request *blk_get_request(struct request_queue *, unsigned int op, - blk_mq_req_flags_t flags); + extern int blk_lld_busy(struct request_queue *q); -extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data); -extern void blk_rq_unprep_clone(struct request *rq); -extern blk_status_t blk_insert_cloned_request(struct request_queue *q, - struct request *rq); -int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -extern int blk_rq_map_user(struct request_queue *, struct request *, - struct rq_map_data *, void __user *, unsigned long, - gfp_t); -extern int blk_rq_unmap_user(struct bio *); -extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); -extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, const struct iov_iter *, - gfp_t); 
-extern void blk_execute_rq_nowait(struct gendisk *, - struct request *, int, rq_end_io_fn *); - -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op); @@ -867,47 +589,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) #define SECTOR_MASK (PAGE_SECTORS - 1) -/* - * blk_rq_pos() : the current sector - * blk_rq_bytes() : bytes left in the entire request - * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_err_bytes() : bytes left till the next error boundary - * blk_rq_sectors() : sectors left in the entire request - * blk_rq_cur_sectors() : sectors left in the current segment - * blk_rq_stats_sectors() : sectors of the entire request used for stats - */ -static inline sector_t blk_rq_pos(const struct request *rq) -{ - return rq->__sector; -} - -static inline unsigned int blk_rq_bytes(const struct request *rq) -{ - return rq->__data_len; -} - -static inline int blk_rq_cur_bytes(const struct request *rq) -{ - return rq->bio ? bio_cur_bytes(rq->bio) : 0; -} - -extern unsigned int blk_rq_err_bytes(const struct request *rq); - -static inline unsigned int blk_rq_sectors(const struct request *rq) -{ - return blk_rq_bytes(rq) >> SECTOR_SHIFT; -} - -static inline unsigned int blk_rq_cur_sectors(const struct request *rq) -{ - return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; -} - -static inline unsigned int blk_rq_stats_sectors(const struct request *rq) -{ - return rq->stats_sectors; -} - #ifdef CONFIG_BLK_DEV_ZONED /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ @@ -924,42 +605,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio) return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev), bio->bi_iter.bi_sector); } - -static inline unsigned int blk_rq_zone_no(struct request *rq) -{ - return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); -} - -static inline unsigned int blk_rq_zone_is_seq(struct request *rq) -{ - return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); -} #endif /* CONFIG_BLK_DEV_ZONED */ -/* - * Some commands like WRITE SAME have a payload or data transfer size which - * is different from the size of the request. Any driver that supports such - * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to - * calculate the data transfer size. - */ -static inline unsigned int blk_rq_payload_bytes(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return rq->special_vec.bv_len; - return blk_rq_bytes(rq); -} - -/* - * Return the first full biovec in the request. The caller needs to check that - * there are any bvecs before calling this helper. - */ -static inline struct bio_vec req_bvec(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return rq->special_vec; - return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); -} - static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { @@ -999,30 +646,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, return min(q->limits.max_sectors, chunk_sectors); } -static inline unsigned int blk_rq_count_bios(struct request *rq) -{ - unsigned int nr_bios = 0; - struct bio *bio; - - __rq_for_each_bio(bio, rq) - nr_bios++; - - return nr_bios; -} - -void blk_steal_bios(struct bio_list *list, struct request *rq); - -/* - * Request completion related functions. 
- * - * blk_update_request() completes given number of bytes and updates - * the request without completing it. - */ -extern bool blk_update_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes); - -extern void blk_abort_request(struct request *); - /* * Access functions for manipulating queue properties */ @@ -1081,42 +704,6 @@ extern void blk_queue_required_elevator_features(struct request_queue *q, extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); -/* - * Number of physical segments as sent to the device. - * - * Normally this is the number of discontiguous data segments sent by the - * submitter. But for data-less command like discard we might have no - * actual data segments submitted, but the driver might have to add it's - * own special payload. In that case we still return 1 here so that this - * special payload will be mapped. - */ -static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return 1; - return rq->nr_phys_segments; -} - -/* - * Number of discard segments (or ranges) the driver needs to fill in. - * Each discard bio merged into a request is counted as one segment. - */ -static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) -{ - return max_t(unsigned short, rq->nr_phys_segments, 1); -} - -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) -{ - struct scatterlist *last_sg = NULL; - - return __blk_rq_map_sg(q, rq, sglist, &last_sg); -} -extern void blk_dump_rq_flags(struct request *, char *); - bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); @@ -1613,60 +1200,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -#ifdef CONFIG_BLK_DEV_ZONED -bool blk_req_needs_zone_write_lock(struct request *rq); -bool blk_req_zone_write_trylock(struct request *rq); -void __blk_req_zone_write_lock(struct request *rq); -void __blk_req_zone_write_unlock(struct request *rq); - -static inline void blk_req_zone_write_lock(struct request *rq) -{ - if (blk_req_needs_zone_write_lock(rq)) - __blk_req_zone_write_lock(rq); -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) - __blk_req_zone_write_unlock(rq); -} - -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return rq->q->seq_zones_wlock && - test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - if (!blk_req_needs_zone_write_lock(rq)) - return true; - return !blk_req_zone_is_write_locked(rq); -} -#else -static inline bool blk_req_needs_zone_write_lock(struct request *rq) -{ - return false; -} - -static inline void blk_req_zone_write_lock(struct request *rq) -{ -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ -} -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return false; -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - return true; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline void blk_wake_io_task(struct task_struct *waiter) 
{ /* -- cgit v1.2.3 From e155b0c238b20f0a866f4334d292656665836c8a Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 5 Oct 2021 18:23:37 +0800 Subject: blk-mq: Use shared tags for shared sbitmap support Currently we use separate sbitmap pairs and active_queues atomic_t for shared sbitmap support. However, a full set of static requests is used per HW queue, which is quite wasteful, considering that the total number of requests usable at any given time across all HW queues is limited by the shared sbitmap depth. As such, it is considerably more memory efficient in the case of shared sbitmap to allocate a set of static rqs per tag set or request queue, and not per HW queue. So replace the sbitmap pairs and active_queues atomic_t with shared tags per tagset and request queue, which will hold a set of shared static rqs. Since there is now no valid HW queue index to be passed to the blk_mq_ops .init and .exit_request callbacks, pass an invalid index token. This changes the semantics of the APIs, such that the callback would need to validate the HW queue index before using it. Currently no user of shared sbitmap actually uses the HW queue index (as would be expected). Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1633429419-228500-13-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0e960d74615e..cf92c13eb80e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -238,8 +238,7 @@ struct request_queue { atomic_t nr_active_requests_shared_sbitmap; - struct sbitmap_queue sched_bitmap_tags; - struct sbitmap_queue sched_breserved_tags; + struct blk_mq_tags *shared_sbitmap_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From 079a2e3e862548087041a1873bbffceb41a72a33 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 5 Oct 2021 18:23:39 +0800 Subject: blk-mq: Change shared sbitmap naming to shared tags Now that shared sbitmap support really means shared tags, rename symbols to match that. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1633429419-228500-15-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf92c13eb80e..b19172db7eef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -236,9 +236,9 @@ struct request_queue { struct timer_list timeout; struct work_struct timeout_work; - atomic_t nr_active_requests_shared_sbitmap; + atomic_t nr_active_requests_shared_tags; - struct blk_mq_tags *shared_sbitmap_tags; + struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From ba0ffdd8ce48ad7f7e85191cd29f9674caca3745 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Oct 2021 12:01:07 -0600 Subject: block: bump max plugged deferred size from 16 to 32 Particularly for NVMe with efficient deferred submission for many requests, there are nice benefits to be seen by bumping the default max plug count from 16 to 32. This is especially true for virtualized setups, where the submit part is more expensive, but it can be noticed even on native hardware. Reduce the multiple queue factor from 4 to 2, since we're changing the default size.
While changing it, move the defines into the block layer private header. These aren't values that anyone outside of the block layer uses, or should use. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b19172db7eef..472b4ab007c6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -727,8 +727,6 @@ struct blk_plug { bool multiple_queues; bool nowait; }; -#define BLK_MAX_REQUEST_COUNT 16 -#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); -- cgit v1.2.3 From 47c122e35d7e43b14129ceb9ed3a7e67599978fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Oct 2021 06:34:11 -0600 Subject: block: pre-allocate requests if plug is started and is a batch The caller typically has a good (or even exact) idea of how many requests it needs to submit. We can make the request/tag allocation a lot more efficient if we just allocate N requests/tags upfront when we queue the first bio from the batch. Provide a new plug start helper that allows the caller to specify how many IOs are expected. This sets plug->nr_ios, and we can use that for smarter request allocation. The plug provides a holding spot for requests, and request allocation will check it before calling into the normal request allocation path. When blk_finish_plug() is called, check if there are unused requests and free them. This should not happen in normal operations. The exception is if we get merging, then we may be left with requests that need freeing when done. This raises the per-core performance on my setup from ~5.8M to ~6.1M IOPS. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 472b4ab007c6..17705c970d7e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -722,10 +722,17 @@ extern void blk_set_queue_dying(struct request_queue *); */ struct blk_plug { struct list_head mq_list; /* blk-mq requests */ - struct list_head cb_list; /* md requires an unplug callback */ + + /* if ios_left is > 1, we can batch tag/rq allocations */ + struct request *cached_rq; + unsigned short nr_ios; + unsigned short rq_count; + bool multiple_queues; bool nowait; + + struct list_head cb_list; /* md requires an unplug callback */ }; struct blk_plug_cb; @@ -738,6 +745,7 @@ struct blk_plug_cb { extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); +extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); extern void blk_flush_plug_list(struct blk_plug *, bool); @@ -772,6 +780,11 @@ long nr_blockdev_pages(void); struct blk_plug { }; +static inline void blk_start_plug_nr_ios(struct blk_plug *plug, + unsigned short nr_ios) +{ +} + static inline void blk_start_plug(struct blk_plug *plug) { } -- cgit v1.2.3 From ef99b2d37666b7a600baab9e1c4944436652b0a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:19 +0200 Subject: block: replace the spin argument to blk_iopoll with a flags argument Switch the boolean spin argument of blk_poll to a set of flags instead. This will allow controlling polling behavior in a more fine-grained way.
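A hedged sketch of a caller under the new convention (q, cookie and done are whatever state the submission path already tracked; the return-value handling is illustrative):

    /* keep polling until our submission is found; 0 means no flags */
    while (!READ_ONCE(done)) {
            if (blk_poll(q, cookie, 0) < 0)
                    break;
    }

    /* or: look at the hardware exactly once and return */
    blk_poll(q, cookie, BLK_POLL_ONESHOT);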
Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-10-hch@lst.de [axboe: adapt to changed io_uring iopoll] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 17705c970d7e..e177346bc020 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -564,7 +564,9 @@ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); +/* only poll the hardware once, don't continue until a completion was found */ +#define BLK_POLL_ONESHOT (1 << 0) +int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { -- cgit v1.2.3 From d729cf9acb9311956c8a37113dcfa0160a2d9665 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:20 +0200 Subject: io_uring: don't sleep when polling for I/O There is no point in sleeping for the expected I/O completion timeout in the io_uring async polling model as we never poll for a specific I/O. Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e177346bc020..2b80c98fc373 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,6 +566,8 @@ blk_status_t errno_to_blk_status(int errno); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) +/* do not sleep to wait for the expected completion time */ +#define BLK_POLL_NOSLEEP (1 << 1) int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); -- cgit v1.2.3 From 3e08773c3841e9db7a520908cc2b136a77d275ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:24 +0200 Subject: block: switch polling to be bio based Replace the blk_poll interface that requires the caller to keep a queue and cookie from the submissions with polling based on the bio. Polling for the bio itself leads to a few advantages: - the cookie construction can be made entirely private in blk-mq.c - the caller does not need to remember the request_queue and cookie separately and thus sidesteps their lifetime issues - keeping the device and the cookie inside the bio makes it trivial to support polling of BIOs remapped by stacking drivers - a lot of code to propagate the cookie back up the submission path can be removed entirely.
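A rough sketch of the resulting synchronous wait loop, modeled on the simple direct-I/O path (using bi_private as the completion marker mirrors __blkdev_direct_IO_simple(); the extra io_comp_batch argument to bio_poll() only arrives later in this series):

    submit_bio(&bio);
    for (;;) {
            set_current_state(TASK_UNINTERRUPTIBLE);
            if (!READ_ONCE(bio.bi_private))   /* cleared by bi_end_io */
                    break;
            if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, 0))
                    blk_io_schedule();
    }
    __set_current_state(TASK_RUNNING);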
Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2b80c98fc373..2a8689e949b4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ struct request; struct sg_io_hdr; struct blkcg_gq; struct blk_flush_queue; +struct kiocb; struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; @@ -550,7 +551,7 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -blk_qc_t submit_bio_noacct(struct bio *bio); +void submit_bio_noacct(struct bio *bio); extern int blk_lld_busy(struct request_queue *q); extern void blk_queue_split(struct bio **); @@ -568,7 +569,8 @@ blk_status_t errno_to_blk_status(int errno); #define BLK_POLL_ONESHOT (1 << 0) /* do not sleep to wait for the expected completion time */ #define BLK_POLL_NOSLEEP (1 << 1) -int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); +int bio_poll(struct bio *bio, unsigned int flags); +int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -1176,7 +1178,7 @@ static inline void blk_ksm_unregister(struct request_queue *q) { } struct block_device_operations { - blk_qc_t (*submit_bio) (struct bio *bio); + void (*submit_bio)(struct bio *bio); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); -- cgit v1.2.3 From 17220ca5ce9606c1b015c4316fca18734c2df0bb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 14 Oct 2021 15:03:26 +0100 Subject: block: cache request queue in bdev There are tons of places where we need to get a request_queue while only having a bdev, which turns into bdev->bd_disk->queue. There are probably a hundred such places considering inline helpers, and enough of them are in hot paths. Cache the queue pointer in struct block_device and make use of it in bdev_get_queue(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a3bfaecdd28956f03629d0ca5c63ebc096e1c809.1634219547.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2a8689e949b4..d5b21fc8f49e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -574,7 +574,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue; /* this is never NULL */ + return bdev->bd_queue; /* this is never NULL */ } -- cgit v1.2.3 From 013a7f95438144f4ab39a1017a0bff2765d2551a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Oct 2021 07:58:52 -0600 Subject: block: provide helpers for rq_list manipulation Instead of open-coding the list additions, traversal, and removal, provide a basic set of helpers.
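A usage sketch of the helpers (rq_a, rq_b and example_complete() are hypothetical; the macros themselves appear verbatim in the hunk below):

    struct request *rq_list = NULL, *rq;

    rq_list_add(&rq_list, rq_a);      /* LIFO: rq_b will pop first */
    rq_list_add(&rq_list, rq_b);

    rq_list_for_each(&rq_list, rq)
            pr_debug("tag %d queued\n", rq->tag);

    while ((rq = rq_list_pop(&rq_list)))
            example_complete(rq);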
Suggested-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5b21fc8f49e..b0a322172965 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1298,4 +1298,33 @@ int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); +#define rq_list_add(listptr, rq) do { \ + (rq)->rq_next = *(listptr); \ + *(listptr) = rq; \ +} while (0) + +#define rq_list_pop(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) { \ + __req = *(listptr); \ + *(listptr) = __req->rq_next; \ + } \ + __req; \ +}) + +#define rq_list_peek(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) \ + __req = *(listptr); \ + __req; \ +}) + +#define rq_list_for_each(listptr, pos) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \ + +#define rq_list_next(rq) (rq)->rq_next +#define rq_list_empty(list) ((list) == (struct request *) NULL) + #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 5a72e899ceb465d731c413d57c6c12cdbf88303c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Oct 2021 09:24:29 -0600 Subject: block: add a struct io_comp_batch argument to fops->iopoll() struct io_comp_batch contains a list head and a completion handler, which will allow batches of IO to be completed more efficiently. For now there are no functional changes in this patch; we just define the io_comp_batch structure and add the argument to the file_operations iopoll handler. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b0a322172965..fd9771a1da09 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -569,8 +569,9 @@ blk_status_t errno_to_blk_status(int errno); #define BLK_POLL_ONESHOT (1 << 0) /* do not sleep to wait for the expected completion time */ #define BLK_POLL_NOSLEEP (1 << 1) -int bio_poll(struct bio *bio, unsigned int flags); -int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -1298,6 +1299,14 @@ int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); +struct io_comp_batch { + struct request *req_list; + bool need_ts; + void (*complete)(struct io_comp_batch *); +}; + +#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } + #define rq_list_add(listptr, rq) do { \ -- cgit v1.2.3 From 99457db8b40c66941072a383a5ab4e36bc53fd3d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Oct 2021 12:11:01 +0200 Subject: block: move the SECTOR_SIZE related definitions to blk_types.h Ensure these are always available for inlines in the various block layer headers.
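The moved constants are unchanged in value and use; a trivial sketch (the helper is hypothetical):

    /* SECTOR_SHIFT == 9 and SECTOR_SIZE == 512, exactly as before */
    static inline sector_t example_bytes_to_sectors(loff_t bytes)
    {
            return bytes >> SECTOR_SHIFT;
    }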
Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20211018101130.1838532-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fd9771a1da09..abe721591e80 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -578,23 +578,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_queue; /* this is never NULL */ } -/* - * The basic unit of block I/O is a sector. It is used in a number of contexts - * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 - * bytes. Variables of type sector_t represent an offset or size that is a - * multiple of 512 bytes. Hence these two constants. - */ -#ifndef SECTOR_SHIFT -#define SECTOR_SHIFT 9 -#endif -#ifndef SECTOR_SIZE -#define SECTOR_SIZE (1 << SECTOR_SHIFT) -#endif - -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - #ifdef CONFIG_BLK_DEV_ZONED /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ -- cgit v1.2.3 From bc490f81731e181b07b8d7577425c06ae91692c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Oct 2021 10:12:12 -0600 Subject: block: change plugging to use a singly linked list Use a singly linked list for the blk_plug. This saves 8 bytes in the blk_plug struct, and makes for faster list manipulations than doubly linked lists. As we don't use the doubly linked lists for anything, singly linked is just fine. This yields a bump in default (merging enabled) performance from 7.0 to 7.1M IOPS, and ~7.5M IOPS with merging disabled. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fd9771a1da09..4027112b9851 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -728,7 +728,7 @@ extern void blk_set_queue_dying(struct request_queue *); * schedule() where blk_schedule_flush_plug() is called. */ struct blk_plug { - struct list_head mq_list; /* blk-mq requests */ + struct request *mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; @@ -777,8 +777,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) struct blk_plug *plug = tsk->plug; return plug && - (!list_empty(&plug->mq_list) || - !list_empty(&plug->cb_list)); + (plug->mq_list || !list_empty(&plug->cb_list)); } int blkdev_issue_flush(struct block_device *bdev); -- cgit v1.2.3 From dc5fc361d891e089dfd9c0a975dc78041036b906 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Oct 2021 06:02:30 -0600 Subject: block: attempt direct issue of plug list If we have just one queue type in the plug list, then we can extend our direct issue to cover a full plug list as well. This allows sending a batch of requests for direct issue, which is more efficient than doing one-at-a-time kind of issue. 
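Conceptually the flush path now gates on the plug state roughly like this (paraphrased rather than verbatim kernel code; blk_mq_plug_issue_direct() is the helper this patch adds in blk-mq.c):

    if (!plug->multiple_queues && !plug->has_elevator) {
            /* one queue type, no scheduler: issue the whole list directly */
            blk_mq_plug_issue_direct(plug, from_schedule);
            return;
    }
    /* otherwise fall back to per-queue dispatch */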
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4027112b9851..f13091d3d476 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -737,6 +737,7 @@ struct blk_plug { unsigned short rq_count; bool multiple_queues; + bool has_elevator; bool nowait; struct list_head cb_list; /* md requires an unplug callback */ -- cgit v1.2.3 From e70feb8b3e6886c525c88943b5f1508d02f5a683 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 14 Oct 2021 16:17:10 +0800 Subject: blk-mq: support concurrent queue quiesce/unquiesce blk_mq_quiesce_queue() has been used fairly widely now, but so far we don't support concurrent/nested quiesce. The biggest issue is that unquiesce can happen unexpectedly when quiesce/unquiesce are run concurrently from more than one context. This patch introduces q->mq_quiesce_depth to deal with concurrent quiesce, and we only unquiesce the queue when it is the last/outermost one of all contexts. Several kernel panic issues have been reported[1][2][3] when running a stress quiesce test, and this patch has been verified against these reports. [1] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m1fc52431fad7f33b1ffc3f12c4450e4238540787 [2] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m10ad90afeb9c8cc318334190a7c24c8b5c5e0722 [3] https://listman.redhat.com/archives/dm-devel/2021-September/msg00189.html Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f13091d3d476..2b22fa36e568 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -315,6 +315,8 @@ struct request_queue { */ struct mutex mq_freeze_lock; + int quiesce_depth; + struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set bio_split; -- cgit v1.2.3 From 008f75a20e7072d0840ec323c39b42206f3fa8a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Oct 2021 16:41:19 +0200 Subject: block: cleanup the flush plug helpers Consolidate the various helpers into a single blk_flush_plug helper that takes a blk_plug and the from_scheduler bool and switch all callsites to call it directly. Checks that the plug is non-NULL must be performed by the caller, something that most already do anyway. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211020144119.142582-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2b22fa36e568..c7b1e9355123 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -725,9 +725,8 @@ extern void blk_set_queue_dying(struct request_queue *); * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list - or when attempting a merge, because blk_schedule_flush_list() will only flush - the plug list when the task sleeps by itself. For details, please see - schedule() where blk_schedule_flush_plug() is called. + or when attempting a merge.
For details, please see schedule() where + * blk_flush_plug() is called. */ struct blk_plug { struct request *mq_list; /* blk-mq requests */ @@ -757,23 +756,8 @@ extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); -extern void blk_flush_plug_list(struct blk_plug *, bool); -static inline void blk_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - if (plug) - blk_flush_plug_list(plug, false); -} - -static inline void blk_schedule_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - if (plug) - blk_flush_plug_list(plug, true); -} +void blk_flush_plug(struct blk_plug *plug, bool from_schedule); static inline bool blk_needs_flush_plug(struct task_struct *tsk) { @@ -802,15 +786,10 @@ static inline void blk_finish_plug(struct blk_plug *plug) { } -static inline void blk_flush_plug(struct task_struct *task) -{ -} - -static inline void blk_schedule_flush_plug(struct task_struct *task) +static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } - static inline bool blk_needs_flush_plug(struct task_struct *tsk) { return false; -- cgit v1.2.3 From cb77cb5abe1f4fae4a33b735606aae22f9eaa1c7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Oct 2021 11:04:52 -0700 Subject: blk-crypto: rename blk_keyslot_manager to blk_crypto_profile blk_keyslot_manager is misnamed because it doesn't necessarily manage keyslots. It actually does several different things: - Contains the crypto capabilities of the device. - Provides functions to control the inline encryption hardware. Originally these were just for programming/evicting keyslots; however, new functionality (hardware-wrapped keys) will require new functions here which are unrelated to keyslots. Moreover, device-mapper devices already (ab)use "keyslot_evict" to pass key eviction requests to their underlying devices even though device-mapper devices don't have any keyslots themselves (so it really should be "evict_key", not "keyslot_evict"). - Sometimes (but not always!) it manages keyslots. Originally it always did, but device-mapper devices don't have keyslots themselves, so they use a "passthrough keyslot manager" which doesn't actually manage keyslots. This hack works, but the terminology is unnatural. Also, some hardware doesn't have keyslots and thus also uses a "passthrough keyslot manager" (support for such hardware is yet to be upstreamed, but it will happen eventually). Let's stop having keyslot managers which don't actually manage keyslots. Instead, rename blk_keyslot_manager to blk_crypto_profile. This is a fairly big change, since for consistency it also has to update keyslot manager-related function names, variable names, and comments -- not just the actual struct name. However it's still a fairly straightforward change, as it doesn't change any actual functionality. Acked-by: Ulf Hansson # For MMC Reviewed-by: Mike Snitzer Reviewed-by: Martin K. 
Petersen Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20211018180453.40441-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7b1e9355123..f72ccb2829db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,7 @@ struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; -struct blk_keyslot_manager; +struct blk_crypto_profile; /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -224,8 +224,7 @@ struct request_queue { unsigned int dma_alignment; #ifdef CONFIG_BLK_INLINE_ENCRYPTION - /* Inline crypto capabilities */ - struct blk_keyslot_manager *ksm; + struct blk_crypto_profile *crypto_profile; #endif unsigned int rq_timeout; @@ -1142,19 +1141,20 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo #ifdef CONFIG_BLK_INLINE_ENCRYPTION -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q); -void blk_ksm_unregister(struct request_queue *q); +void blk_crypto_unregister(struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, - struct request_queue *q) +static inline bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) { return true; } -static inline void blk_ksm_unregister(struct request_queue *q) { } +static inline void blk_crypto_unregister(struct request_queue *q) { } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ -- cgit v1.2.3 From 9208d414975895f69e9aca49153060ddd31b18d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 08:06:01 +0200 Subject: block: add a ->get_unique_id method Add a method to query unique IDs from block devices. It will be used to remove code that deeply pokes into SCSI internals in the NFS server. The implementation in the sd driver itself is also much nicer as it can use the cached VPD page instead of always sending a command as the current NFS code does. For now the interface is kept very minimal but could be easily extended when other users like a block-layer sysfs interface for unique IDs show up.
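For illustration (an editorial sketch, not part of this patch), a driver that only has an EUI-64 identifier might wire up the method roughly like this; struct foo_device and its eui64 field are hypothetical stand-ins:

	static int foo_get_unique_id(struct gendisk *disk, u8 id[16],
				     enum blk_unique_id id_type)
	{
		struct foo_device *foo = disk->private_data;

		if (id_type != BLK_UID_EUI64)
			return -EINVAL;		/* designator type not supported */
		memcpy(id, foo->eui64, 8);	/* an EUI-64 identifier is 8 bytes */
		return 8;			/* length of the identifier */
	}

	static const struct block_device_operations foo_fops = {
		.owner		= THIS_MODULE,
		.get_unique_id	= foo_get_unique_id,
	};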
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20211021060607.264371-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72ccb2829db..0d5826066e16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1158,6 +1158,14 @@ static inline void blk_crypto_unregister(struct request_queue *q) { } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ +enum blk_unique_id { + /* these match the Designator Types specified in SPC */ + BLK_UID_T10 = 1, + BLK_UID_EUI64 = 2, + BLK_UID_NAA = 3, +}; + +#define NFL4_UFLG_MASK 0x0000003F struct block_device_operations { void (*submit_bio)(struct bio *bio); @@ -1176,6 +1184,9 @@ struct block_device_operations { int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); char *(*devnode)(struct gendisk *disk, umode_t *mode); + /* returns the length of the identifier or a negative errno: */ + int (*get_unique_id)(struct gendisk *disk, u8 id[16], + enum blk_unique_id id_type); struct module *owner; const struct pr_ops *pr_ops; -- cgit v1.2.3 From 4845012eb5b4e56cadb5f484cb55dd4fd9d1df80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 08:06:07 +0200 Subject: block: remove QUEUE_FLAG_SCSI_PASSTHROUGH Export scsi_device_from_queue for use with pktcdvd and use that instead of the otherwise unused QUEUE_FLAG_SCSI_PASSTHROUGH queue flag. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20211021060607.264371-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0d5826066e16..1ad30f85d30e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -357,7 +357,6 @@ struct request_queue { #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ @@ -391,8 +390,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -#define blk_queue_scsi_passthrough(q) \ - test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) #ifdef CONFIG_BLK_RQ_ALLOC_TIME -- cgit v1.2.3 From 70164eb6ccb76ab679b016b4b60123bf4ec6c162 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:25 +0200 Subject: block: remove __sync_blockdev Instead offer a new sync_blockdev_nowait helper for the !wait case. This new helper is exported as it will grow modular callers in a bit. 
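A minimal sketch of such a modular caller, assuming only the semantics stated above (foo_flush_async() is hypothetical, not from this series):

	/* Kick off writeback for the whole device without waiting for it. */
	static void foo_flush_async(struct block_device *bdev)
	{
		if (sync_blockdev_nowait(bdev))
			pr_warn("foo: failed to start writeback\n");
	}

A blocking flush, by contrast, keeps using plain sync_blockdev(bdev).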
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72ccb2829db..4b5a6bbdacd0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1266,6 +1266,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_nowait(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1274,6 +1275,10 @@ static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline int sync_blockdev_nowait(struct block_device *bdev) +{ + return 0; +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 1e03a36bdff4709c1bbf0f57f60ae3f776d51adf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:30 +0200 Subject: block: simplify the block device syncing code Get rid of the indirections and just provide a sync_bdevs helper for the generic sync code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4b5a6bbdacd0..09fdf7018b7f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1267,6 +1267,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); +void sync_bdevs(bool wait); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1279,6 +1280,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } +static inline void sync_bdevs(bool wait) +{ +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From a2247f19ee1c5ad75ef095cdfb909a3244b88aa8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 27 Oct 2021 11:22:19 +0900 Subject: block: Add independent access ranges support The Concurrent Positioning Ranges VPD page (for SCSI) and data log page (for ATA) contain parameters describing the set of contiguous LBAs that can be served independently by a single LUN multi-actuator hard-disk. Similarly, a logically defined block device composed of multiple disks can in some cases execute requests directed at different sector ranges in parallel. A dm-linear device aggregating 2 block devices together is an example. This patch implements support for exposing a block device's independent access ranges to the user through sysfs to allow optimizing device accesses to increase performance. To describe the set of independent sector ranges of a device (actuators of a multi-actuator HDD or table entries of a dm-linear device), the type struct blk_independent_access_ranges is introduced. This structure describes the sector ranges using an array of struct blk_independent_access_range structures. This range structure defines the start sector and number of sectors of the access range. The ranges in the array cannot overlap and must contain all sectors within the device capacity.
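As a rough driver-side sketch (using the two helpers described below; the even split of the capacity is purely illustrative, not from this patch), a dual-actuator disk driver could populate and register its two ranges like this:

	struct blk_independent_access_ranges *iars;
	sector_t half = get_capacity(disk) / 2;

	iars = disk_alloc_independent_access_ranges(disk, 2);
	if (iars) {
		/* actuator 0: first half of the capacity */
		iars->ia_range[0].sector = 0;
		iars->ia_range[0].nr_sectors = half;
		/* actuator 1: the remainder, leaving no sector holes */
		iars->ia_range[1].sector = half;
		iars->ia_range[1].nr_sectors = get_capacity(disk) - half;
		disk_set_independent_access_ranges(disk, iars);
	}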
The function disk_set_independent_access_ranges() allows a device driver to signal to the block layer that a device has multiple independent access ranges. In this case, a struct blk_independent_access_ranges is attached to the device request queue by the function disk_set_independent_access_ranges(). The function disk_alloc_independent_access_ranges() is provided for drivers to allocate this structure. struct blk_independent_access_ranges contains kobjects (struct kobject) to expose to the user through sysfs the set of independent access ranges supported by a device. When the device is initialized, sysfs registration of the ranges information is done from blk_register_queue() using the block layer internal function disk_register_independent_access_ranges(). If a driver calls disk_set_independent_access_ranges() for a registered queue, e.g. when a device is revalidated, disk_set_independent_access_ranges() will execute disk_register_independent_access_ranges() to update the sysfs attribute files. The sysfs file structure created starts from the independent_access_ranges sub-directory and contains the start sector and number of sectors of each range, with the information for each range grouped in numbered sub-directories. E.g. for a dual actuator HDD, the user sees: $ tree /sys/block/sdk/queue/independent_access_ranges/ /sys/block/sdk/queue/independent_access_ranges/ |-- 0 | |-- nr_sectors | `-- sector `-- 1 |-- nr_sectors `-- sector For a regular device with a single access range, the independent_access_ranges sysfs directory does not exist. Device revalidation may lead to changes to this structure and to the attribute values. When manipulated, the queue sysfs_lock and sysfs_dir_lock mutexes are held for atomicity, similarly to how the blk-mq and elevator sysfs queue sub-directories are protected. The code related to the management of independent access ranges is added in the new file block/blk-ia-ranges.c. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20211027022223.183838-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72ccb2829db..6d95a4b36cfa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -150,6 +150,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, #endif /* CONFIG_BLK_DEV_ZONED */ +/* + * Independent access ranges: struct blk_independent_access_range describes + * a range of contiguous sectors that can be accessed using device command + * execution resources that are independent from the resources used for + * other access ranges. This is typically found with single-LUN multi-actuator + * HDDs where each access range is served by a different set of heads. + * The set of independent ranges supported by the device is defined using + * struct blk_independent_access_ranges. The independent ranges must not overlap + * and must include all sectors within the disk capacity (no sector holes + * allowed). + * For a device with multiple ranges, requests targeting sectors in different + * ranges can be executed in parallel. A request can straddle an access range + * boundary. 
+ */ +struct blk_independent_access_range { + struct kobject kobj; + struct request_queue *queue; + sector_t sector; + sector_t nr_sectors; +}; + +struct blk_independent_access_ranges { + struct kobject kobj; + bool sysfs_registered; + unsigned int nr_ia_ranges; + struct blk_independent_access_range ia_range[]; +}; + struct request_queue { struct request *last_merge; struct elevator_queue *elevator; @@ -331,6 +359,12 @@ struct request_queue { #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; + + /* + * Independent sector access ranges. This is always NULL for + * devices that do not have multiple independent access ranges. + */ + struct blk_independent_access_ranges *ia_ranges; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ @@ -698,6 +732,11 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +struct blk_independent_access_ranges * +disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); +void disk_set_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars); + /* * Elevator features for blk_queue_required_elevator_features: */ -- cgit v1.2.3 From 570b1cac477643cbf01a45fa5d018430a1fddbce Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 26 Oct 2021 22:40:12 +0800 Subject: block: Add a helper to validate the block size There is some duplicated code to validate the block size in block drivers. This limitation actually comes from the block layer, so this patch adds a new block layer helper for that. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20211026144015.188-2-xieyongji@bytedance.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d95a4b36cfa..d2d627e2c782 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -44,6 +44,14 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 +static inline int blk_validate_block_size(unsigned int bsize) +{ + if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + return -EINVAL; + + return 0; +} + static inline bool blk_op_is_passthrough(unsigned int op) { op &= REQ_OP_MASK; -- cgit v1.2.3
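For illustration, a driver that used to open-code this check could now validate a user-supplied logical block size as follows (foo_set_block_size() and struct foo_device are hypothetical):

	static int foo_set_block_size(struct foo_device *foo, u32 bsize)
	{
		/* rejects sizes outside [512, PAGE_SIZE] and non-powers-of-two */
		int err = blk_validate_block_size(bsize);

		if (err)
			return err;
		foo->block_size = bsize;
		return 0;
	}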