From 52f019d43c229afd65dc11c8c1b05b6436bf6765 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 11:42:51 +0100 Subject: block: add a hard-readonly flag to struct gendisk Commit 20bd1d026aac ("scsi: sd: Keep disk read-only when re-reading partition") addressed a long-standing problem with user read-only policy being overridden as a result of a device-initiated revalidate. The commit has since been reverted due to a regression that left some USB devices read-only indefinitely. To fix the underlying problems with revalidate we need to keep track of hardware state and user policy separately. The gendisk has been updated to reflect the current hardware state set by the device driver. This is done to allow returning the device to the hardware state once the user clears the BLKROSET flag. The resulting semantics are as follows: - If BLKROSET sets a given partition read-only, that partition will remain read-only even if the underlying storage stack initiates a revalidate. However, the BLKRRPART ioctl will cause the partition table to be dropped and any user policy on partitions will be lost. - If BLKROSET has not been set, both the whole disk device and any partitions will reflect the current write-protect state of the underlying device. Based on a patch from Martin K. Petersen . Reported-by: Oleksii Kurochko Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201221 Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 809aaa32d53c..a62ccbfac54b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -163,6 +163,7 @@ struct gendisk { int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 +#define GD_READ_ONLY 1 struct kobject *slave_dir; struct timer_rand_state *random; @@ -249,11 +250,12 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) extern void del_gendisk(struct gendisk *gp); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -extern void set_disk_ro(struct gendisk *disk, int flag); +void set_disk_ro(struct gendisk *disk, bool read_only); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0->bd_read_only; + return disk->part0->bd_read_only || + test_bit(GD_READ_ONLY, &disk->state); } extern void disk_block_events(struct gendisk *disk); -- cgit v1.2.3 From 309dca309fc39a9e3c31b916393b74bd174fd74e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:34 +0100 Subject: block: store a block_device pointer in struct bio Replace the gendisk pointer in struct bio with a pointer to the newly improved struct block device. From that the gendisk can be trivially accessed with an extra indirection, but it also allows to directly look up all information related to partition remapping. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 18 ++++++++---------- include/linux/blk-mq.h | 4 ++-- include/linux/blk_types.h | 3 +-- include/linux/blkdev.h | 5 +++-- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1edda614f7ce..12af7aa5db37 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,24 +483,22 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); -#define bio_set_dev(bio, bdev) \ -do { \ - if ((bio)->bi_disk != (bdev)->bd_disk) \ - bio_clear_flag(bio, BIO_THROTTLED);\ - (bio)->bi_disk = (bdev)->bd_disk; \ - (bio)->bi_partno = (bdev)->bd_partno; \ - bio_associate_blkg(bio); \ +#define bio_set_dev(bio, bdev) \ +do { \ + if ((bio)->bi_bdev != (bdev)) \ + bio_clear_flag(bio, BIO_THROTTLED); \ + (bio)->bi_bdev = (bdev); \ + bio_associate_blkg(bio); \ } while (0) #define bio_copy_dev(dst, src) \ do { \ - (dst)->bi_disk = (src)->bi_disk; \ - (dst)->bi_partno = (src)->bi_partno; \ + (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ - disk_devt((bio)->bi_disk) + disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d705b174d346..6b410dab48ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -602,8 +602,8 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; + if (bio->bi_bdev) + rq->rq_disk = bio->bi_bdev->bd_disk; } blk_qc_t blk_mq_submit_bio(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 866f74261b3b..8ebd8be3e050 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -222,7 +222,7 @@ static inline void bio_issue_init(struct bio_issue *issue, */ struct bio { struct bio *bi_next; /* request queue link */ - struct gendisk *bi_disk; + struct block_device *bi_bdev; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. @@ -231,7 +231,6 @@ struct bio { unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; - u8 bi_partno; atomic_t __bi_remaining; struct bvec_iter bi_iter; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94ee3089e01..b55bd534b2e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1967,7 +1967,8 @@ void part_end_io_acct(struct block_device *part, struct bio *bio, */ static inline unsigned long bio_start_io_acct(struct bio *bio) { - return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); + return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio), + bio_op(bio)); } /** @@ -1977,7 +1978,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio) */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); + return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time); } int bdev_read_only(struct block_device *bdev); -- cgit v1.2.3 From 30c5d3456c272f0de0d7e7eb9fc355fa64a5f649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:36 +0100 Subject: block: do not reassig ->bi_bdev when partition remapping There is no good reason to reassign ->bi_bdev when remapping the partition-relative block number to the device wide one, as all the information required by the drivers comes from the gendisk anyway. Keeping the original ->bi_bdev alive will allow to greatly simplify the partition-away I/O accounting. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 12af7aa5db37..2f1155eabaff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -485,6 +485,7 @@ extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ do { \ + bio_clear_flag(bio, BIO_REMAPPED); \ if ((bio)->bi_bdev != (bdev)) \ bio_clear_flag(bio, BIO_THROTTLED); \ (bio)->bi_bdev = (bdev); \ @@ -493,6 +494,7 @@ do { \ #define bio_copy_dev(dst, src) \ do { \ + bio_clear_flag(dst, BIO_REMAPPED); \ (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8ebd8be3e050..1bc6f6a01070 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -303,6 +303,7 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_REMAPPED, BIO_FLAG_LAST }; -- cgit v1.2.3 From 99dfc43ecbf67f12a06512918aaba61d55863efc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:37 +0100 Subject: block: use ->bi_bdev for bio based I/O accounting Rework the I/O accounting for bio based drivers to use ->bi_bdev. This means all drivers can now simply use bio_start_io_acct to start accounting, and it will take partitions into account automatically. To end I/O account either bio_end_io_acct can be used if the driver never remaps I/O to a different device, or bio_end_io_acct_remapped if the driver did remap the I/O. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b55bd534b2e1..4526b9ef8edb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1954,22 +1954,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long part_start_io_acct(struct gendisk *disk, - struct block_device **part, struct bio *bio); -void part_end_io_acct(struct block_device *part, struct bio *bio, - unsigned long start_time); - -/** - * bio_start_io_acct - start I/O accounting for bio based drivers - * @bio: bio to start account for - * - * Returns the start time that should be passed back to bio_end_io_acct(). - */ -static inline unsigned long bio_start_io_acct(struct bio *bio) -{ - return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio), - bio_op(bio)); -} +unsigned long bio_start_io_acct(struct bio *bio); +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev); /** * bio_end_io_acct - end I/O accounting for bio based drivers @@ -1978,7 +1965,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio) */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time); + return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } int bdev_read_only(struct block_device *bdev); -- cgit v1.2.3 From bc359d03c7ec1bf3b86d03bafaf6bbb21e6414fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:39 +0100 Subject: block: add a disk_uevent helper Add a helper to call kobject_uevent for the disk and all partitions, and unexport the disk_part_iter_* helpers that are now only used in the core block code. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a62ccbfac54b..670eaef0e876 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -213,6 +213,8 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +void disk_uevent(struct gendisk *disk, enum kobject_action action); + /* * Smarter partition iterator without context limits. */ -- cgit v1.2.3 From 0470dd9d5f103e7f1d5ba8f755f687c3106c7df1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:40 +0100 Subject: block: remove DISK_PITER_REVERSE There is good reason to iterate backwards when deleting all partitions in del_gendisk, just like we don't in blk_drop_partitions. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 670eaef0e876..51609133c9a3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -218,7 +218,6 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action); /* * Smarter partition iterator without context limits. */ -#define DISK_PITER_REVERSE (1 << 0) /* iterate in the reverse direction */ #define DISK_PITER_INCL_EMPTY (1 << 1) /* include 0-sized parts */ #define DISK_PITER_INCL_PART0 (1 << 2) /* include partition 0 */ #define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */ -- cgit v1.2.3 From a33df75c6328bf40078b35f2040d8e54d574c357 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:41 +0100 Subject: block: use an xarray for disk->part_tbl Now that no fast path lookups in the partition table are left, there is no point in micro-optimizing the data structure for it. Just use a bog standard xarray. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 51609133c9a3..f364619092cc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -32,6 +32,7 @@ extern struct class block_class; #include #include #include +#include #define PARTITION_META_INFO_VOLNAMELTH 64 /* @@ -116,13 +117,6 @@ enum { DISK_EVENT_FLAG_UEVENT = 1 << 1, }; -struct disk_part_tbl { - struct rcu_head rcu_head; - int len; - struct block_device __rcu *last_lookup; - struct block_device __rcu *part[]; -}; - struct disk_events; struct badblocks; @@ -148,12 +142,7 @@ struct gendisk { unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ - /* Array of pointers to partitions indexed by partno. - * Protected with matching bdev lock but stat and other - * non-critical accesses use RCU. Always access through - * helpers. - */ - struct disk_part_tbl __rcu *part_tbl; + struct xarray part_tbl; struct block_device *part0; const struct block_device_operations *fops; @@ -225,7 +214,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action); struct disk_part_iter { struct gendisk *disk; struct block_device *part; - int idx; + unsigned long idx; unsigned int flags; }; @@ -233,7 +222,6 @@ extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); struct block_device *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); -extern bool disk_has_partitions(struct gendisk *disk); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, -- cgit v1.2.3 From 5ac83c644f5fb924f0b2c09102ab82fc788f8411 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Jan 2021 17:47:16 +0100 Subject: Revert "blk-mq, elevator: Count requests per hctx to improve performance" This reverts commit b445547ec1bbd3e7bf4b1c142550942f70527d95. Since both mq-deadline and BFQ completely ignore hctx they are passed to their dispatch function and dispatch whatever request they deem fit checking whether any request for a particular hctx is queued is just pointless since we'll very likely get a request from a different hctx anyway. In the following commit we'll deal with lock contention in these IO schedulers in presence of multiple HW queues in a different way. Signed-off-by: Jan Kara Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6b410dab48ee..aabbf6830ffc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,10 +140,6 @@ struct blk_mq_hw_ctx { * shared across request queues. */ atomic_t nr_active; - /** - * @elevator_queued: Number of queued requests on hctx. - */ - atomic_t elevator_queued; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; -- cgit v1.2.3 From b6e68ee82585f2ee890b0a897a6aacbf49a467bb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Jan 2021 17:47:17 +0100 Subject: blk-mq: Improve performance of non-mq IO schedulers with multiple HW queues Currently when non-mq aware IO scheduler (BFQ, mq-deadline) is used for a queue with multiple HW queues, the performance it rather bad. The problem is that these IO schedulers use queue-wide locking and their dispatch function does not respect the hctx it is passed in and returns any request it finds appropriate. Thus locality of request access is broken and dispatch from multiple CPUs just contends on IO scheduler locks. For these IO schedulers there's little point in dispatching from multiple CPUs. Instead dispatch always only from a single CPU to limit contention. Below is a comparison of dbench runs on XFS filesystem where the storage is a raid card with 64 HW queues and to it attached a single rotating disk. BFQ is used as IO scheduler: clients MQ SQ MQ-Patched Amean 1 39.12 (0.00%) 43.29 * -10.67%* 36.09 * 7.74%* Amean 2 128.58 (0.00%) 101.30 * 21.22%* 96.14 * 25.23%* Amean 4 577.42 (0.00%) 494.47 * 14.37%* 508.49 * 11.94%* Amean 8 610.95 (0.00%) 363.86 * 40.44%* 362.12 * 40.73%* Amean 16 391.78 (0.00%) 261.49 * 33.25%* 282.94 * 27.78%* Amean 32 324.64 (0.00%) 267.71 * 17.54%* 233.00 * 28.23%* Amean 64 295.04 (0.00%) 253.02 * 14.24%* 242.37 * 17.85%* Amean 512 10281.61 (0.00%) 10211.16 * 0.69%* 10447.53 * -1.61%* Numbers are times so lower is better. MQ is stock 5.10-rc6 kernel. SQ is the same kernel with megaraid_sas.host_tagset_enable=0 so that the card advertises just a single HW queue. MQ-Patched is a kernel with this patch applied. You can see multiple hardware queues heavily hurt performance in combination with BFQ. The patch restores the performance. Signed-off-by: Jan Kara Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bacc40a0bdf3..1fe8e105b83b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -172,6 +172,8 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); /* Supports zoned block devices sequential write constraint */ #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) +/* Supports scheduling on multiple hardware queues */ +#define ELEVATOR_F_MQ_AWARE (1U << 1) #endif /* CONFIG_BLOCK */ #endif -- cgit v1.2.3 From 9f180e315a93cde559ac1c9c4c5ce980aa681c1c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:54 +0800 Subject: block: don't allocate inline bvecs if this bioset needn't bvecs The inline bvecs won't be used if user needn't bvecs by not passing BIOSET_NEED_BVECS, so don't allocate bvecs in this situation. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Tested-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 2f1155eabaff..5d8977aafa19 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -703,6 +703,7 @@ struct bio_set { mempool_t bvec_integrity_pool; #endif + unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details -- cgit v1.2.3 From eec716a1c18c796a69db0be5e2a6f282ba5bccd6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 11 Jan 2021 11:05:56 +0800 Subject: block: move three bvec helpers declaration into private helper bvec_alloc(), bvec_free() and bvec_nr_vecs() are only used inside block layer core functions, no need to declare them in public header. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5d8977aafa19..e135b500df5d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -478,9 +478,6 @@ static inline void zero_fill_bio(struct bio *bio) zero_fill_bio_iter(bio, bio->bi_iter); } -extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); -extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); -extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ -- cgit v1.2.3 From 8eeed0b554b9fda61be05b17cbb0b89ea2cbbb65 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 25 Jan 2021 05:49:57 +0100 Subject: block: remove unnecessary argument from blk_execute_rq_nowait The 'q' is not used since commit a1ce35fa4985 ("block: remove dead elevator code"), also update the comment of the function. And more importantly it never really was needed to start with given that we can trivial derive it from struct request. Cc: target-devel@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: linux-ide@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: linux-nfs@vger.kernel.org Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4526b9ef8edb..623a61239429 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -950,7 +950,7 @@ extern int blk_rq_map_user_iov(struct request_queue *, struct request *, gfp_t); extern void blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); -extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, +extern void blk_execute_rq_nowait(struct gendisk *, struct request *, int, rq_end_io_fn *); /* Helper to convert REQ_OP_XXX to its string format XXX */ -- cgit v1.2.3 From 684da7628d93bbdcfba9081b917d99f29ad04c23 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 25 Jan 2021 05:49:58 +0100 Subject: block: remove unnecessary argument from blk_execute_rq We can remove 'q' from blk_execute_rq as well after the previous change in blk_execute_rq_nowait. And more importantly it never really was needed to start with given that we can trivial derive it from struct request. Cc: linux-scsi@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Cc: linux-ide@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvme@lists.infradead.org Cc: linux-nfs@vger.kernel.org Acked-by: Ulf Hansson # for mmc Signed-off-by: Guoqing Jiang Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 623a61239429..20f3706b6b2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -948,8 +948,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); +extern void blk_execute_rq(struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct gendisk *, struct request *, int, rq_end_io_fn *); -- cgit v1.2.3 From 3e1a88ec96259282b9a8b45c3f1fda7a3ff4f6ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:02 +0000 Subject: bio: add a helper calculating nr segments to alloc Add a helper function calculating the number of bvec segments we need to allocate to construct a bio. It doesn't change anything functionally, but will be used to not duplicate special cases in the future. Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index e135b500df5d..9ddb19801a03 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -10,6 +10,7 @@ #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include +#include #define BIO_DEBUG @@ -441,6 +442,15 @@ static inline void bio_wouldblock_error(struct bio *bio) bio_endio(bio); } +/* + * Calculate number of bvec segments that should be allocated to fit data + * pointed by @iter. + */ +static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) +{ + return iov_iter_npages(iter, max_segs); +} + struct request_queue; extern int submit_bio_wait(struct bio *bio); -- cgit v1.2.3 From c42bca92be928ce7dece5fc04cf68d0e37ee6718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 9 Jan 2021 16:03:03 +0000 Subject: bio: don't copy bvec for direct IO The block layer spends quite a while in blkdev_direct_IO() to copy and initialise bio's bvec. However, if we've already got a bvec in the input iterator it might be reused in some cases, i.e. when new ITER_BVEC_FLAG_FIXED flag is set. Simple tests show considerable performance boost, and it also reduces memory footprint. Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ddb19801a03..676870b2c88d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -444,10 +444,13 @@ static inline void bio_wouldblock_error(struct bio *bio) /* * Calculate number of bvec segments that should be allocated to fit data - * pointed by @iter. + * pointed by @iter. If @iter is backed by bvec it's going to be reused + * instead of allocating a new one. */ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) { + if (iov_iter_is_bvec(iter)) + return 0; return iov_iter_npages(iter, max_segs); } -- cgit v1.2.3 From 2c2b9fd6b496b3616e9b9537ea0258b3040914f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 9 Jan 2021 12:13:32 +0100 Subject: block: unexport truncate_bdev_range truncate_bdev_range is only used in always built-in block layer code, so remove the export and the !CONFIG_BLOCK stub. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 20f3706b6b2e..2491e17b61c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,21 +1999,16 @@ void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } -static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, - loff_t lstart, loff_t lend) -{ - return 0; -} static inline int sync_blockdev(struct block_device *bdev) { return 0; -- cgit v1.2.3 From 3175199ab0ac8c874ec25c6bf169f74888917435 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:34 +0100 Subject: block: split bio_kmalloc from bio_alloc_bioset bio_kmalloc shares almost no logic with the bio_set based fast path in bio_alloc_bioset. Split it into an entirely separate implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 676870b2c88d..c74857cf1252 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,6 +408,7 @@ extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -420,11 +421,6 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); -- cgit v1.2.3 From c6bf3f0e25f4c0f0ecce6cf8d1c589bd9d74d3cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:35 +0100 Subject: block: use an on-stack bio in blkdev_issue_flush There is no point in allocating memory for a synchronous flush. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2491e17b61c4..0dea268bd61b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1288,7 +1288,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) !list_empty(&plug->cb_list)); } -int blkdev_issue_flush(struct block_device *, gfp_t); +int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { @@ -1316,7 +1316,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } -static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; } -- cgit v1.2.3 From 48d15436fde6feebcded7bd0fdc8ea4a9181b8fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:47 +0100 Subject: mm: remove get_swap_bio Just reuse the block_device and sector from the swap_info structure, just as used by the SWP_SYNCHRONOUS path. Also remove the checks for NULL returns from bio_alloc as that can't happen for sleeping allocations. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 596bc2f4d9b0..3f1f7ae0fbe9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -468,7 +468,6 @@ extern int free_swap_and_cache(swp_entry_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); -extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); -- cgit v1.2.3 From 6ac0b71537e1c14e7532408fe4aae553aa314237 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:20 +0100 Subject: block: move struct biovec_slab to bio.c struct biovec_slab is only used inside of bio.c, so move it there. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index c74857cf1252..4a84207dd996 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -720,12 +720,6 @@ struct bio_set { struct workqueue_struct *rescue_workqueue; }; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; -- cgit v1.2.3 From 0f2e6ab851ae146c468bc5151c302c6e2473f70a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:24 +0100 Subject: block: turn the nr_iovecs argument to bio_alloc* into an unsigned short The bi_max_vecs and bi_vcnt fields are defined as unsigned short, so don't allow passing larger values in. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 4a84207dd996..9ceeb8ecdb7f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -407,8 +407,9 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, + struct bio_set *bs); +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); @@ -416,7 +417,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -- cgit v1.2.3 From 7a800a20ae6329e803c5c646b20811a6ae9ca136 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 2 Feb 2021 18:19:29 +0100 Subject: block: use bi_max_vecs to find the bvec pool Instead of encoding of the bvec pool using magic bio flags, just use a helper to find the pool based on the max_vecs value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - include/linux/blk_types.h | 29 +---------------------------- 2 files changed, 1 insertion(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ceeb8ecdb7f..3cbbaf76906e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -329,7 +329,6 @@ struct bio_integrity_payload { struct bvec_iter bip_iter; - unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1bc6f6a01070..db026b6ec15a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -227,7 +227,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, etc and bvec pool number */ + unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; @@ -307,33 +307,6 @@ enum { BIO_FLAG_LAST }; -/* See BVEC_POOL_OFFSET below before adding new flags */ - -/* - * We support 6 different bvec pools, the last one is magic in that it - * is backed by a mempool. - */ -#define BVEC_POOL_NR 6 -#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) - -/* - * Top 3 bits of bio flags indicate the pool the bvecs came from. We add - * 1 to the actual index so that 0 indicates that there are no bvecs to be - * freed. - */ -#define BVEC_POOL_BITS (3) -#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) -#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) -#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) -# error "BVEC_POOL_BITS is too small" -#endif - -/* - * Flags starting here get preserved by bio_reset() - this includes - * only BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS BVEC_POOL_OFFSET - typedef __u32 __bitwise blk_mq_req_flags_t; /* -- cgit v1.2.3 From a805a4fa4fa376bbc145762bb8b09caa2fa8af48 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 28 Jan 2021 13:47:30 +0900 Subject: block: introduce zone_write_granularity limit Per ZBC and ZAC specifications, host-managed SMR hard-disks mandate that all writes into sequential write required zones be aligned to the device physical block size. However, NVMe ZNS does not have this constraint and allows write operations into sequential zones to be aligned to the device logical block size. This inconsistency does not help with software portability across device types. To solve this, introduce the zone_write_granularity queue limit to indicate the alignment constraint, in bytes, of write operations into zones of a zoned block device. This new limit is exported as a read-only sysfs queue attribute and the helper blk_queue_zone_write_granularity() introduced for drivers to set this limit. The function blk_queue_set_zoned() is modified to set this new limit to the device logical block size by default. NVMe ZNS devices as well as zoned nullb devices use this default value as is. The scsi disk driver is modified to execute the blk_queue_zone_write_granularity() helper to set the zone write granularity of host-managed SMR disks to the disk physical block size. The accessor functions queue_zone_write_granularity() and bdev_zone_write_granularity() are also introduced. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0dea268bd61b..9149f4a5adb3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -337,6 +337,7 @@ struct queue_limits { unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; + unsigned int zone_write_granularity; unsigned short max_segments; unsigned short max_integrity_segments; @@ -1160,6 +1161,8 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); extern void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); void blk_queue_update_readahead(struct request_queue *q); @@ -1473,6 +1476,18 @@ static inline int bdev_io_opt(struct block_device *bdev) return queue_io_opt(bdev_get_queue(bdev)); } +static inline unsigned int +queue_zone_write_granularity(const struct request_queue *q) +{ + return q->limits.zone_write_granularity; +} + +static inline unsigned int +bdev_zone_write_granularity(struct block_device *bdev) +{ + return queue_zone_write_granularity(bdev_get_queue(bdev)); +} + static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) -- cgit v1.2.3