From 4593fdbe7a2f44d5e64c627c715dd0bcec9bdf14 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 27 Sep 2015 02:09:20 +0900 Subject: blk-mq: fix sysfs registration/unregistration race There is a race between cpu hotplug handling and adding/deleting gendisk for blk-mq, where both are trying to register and unregister the same sysfs entries. null_add_dev --> blk_mq_init_queue --> blk_mq_init_allocated_queue --> add to 'all_q_list' (*) --> add_disk --> blk_register_queue --> blk_mq_register_disk (++) null_del_dev --> del_gendisk --> blk_unregister_queue --> blk_mq_unregister_disk (--) --> blk_cleanup_queue --> blk_mq_free_queue --> del from 'all_q_list' (*) blk_mq_queue_reinit --> blk_mq_sysfs_unregister (-) --> blk_mq_sysfs_register (+) While the request queue is added to 'all_q_list' (*), blk_mq_queue_reinit() can be called for the queue anytime by CPU hotplug callback. But blk_mq_sysfs_unregister (-) and blk_mq_sysfs_register (+) in blk_mq_queue_reinit must not be called before blk_mq_register_disk (++) and after blk_mq_unregister_disk (--) is finished. Because '/sys/block/*/mq/' is not exists. There has already been BLK_MQ_F_SYSFS_UP flag in hctx->flags which can be used to track these sysfs stuff, but it is only fixing this issue partially. In order to fix it completely, we just need per-queue flag instead of per-hctx flag with appropriate locking. So this introduces q->mq_sysfs_init_done which is properly protected with all_q_mutex. Also, we need to ensure that blk_mq_map_swqueue() is called with all_q_mutex is held. Since hctx->nr_ctx is reset temporarily and updated in blk_mq_map_swqueue(), so we should avoid blk_mq_register_hctx() seeing the temporary hctx->nr_ctx value in CPU hotplug handling or adding/deleting gendisk . Signed-off-by: Akinobu Mita Reviewed-by: Ming Lei Cc: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99da9ebc7377..19c2e947d4d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -456,6 +456,8 @@ struct request_queue { struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set *bio_split; + + bool mq_sysfs_init_done; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From aff34e192e4eeacfb8b5ffc68e10a240f2c0c6d7 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:27 -0400 Subject: block: Move integrity kobject to struct gendisk The integrity kobject purely exists to support the integrity subdirectory in sysfs and doesn't really have anything to do with the blk_integrity data structure. Move the kobject to struct gendisk where it belongs. Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 19c2e947d4d1..830f9c07d4bb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1472,8 +1472,6 @@ struct blk_integrity { unsigned short tag_size; const char *name; - - struct kobject kobj; }; extern bool blk_integrity_is_initialized(struct gendisk *); -- cgit v1.2.3 From 0f8087ecdeac921fc4920f1328f55c15080bc6aa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:33 -0400 Subject: block: Consolidate static integrity profile properties We previously made a complete copy of a device's data integrity profile even though several of the fields inside the blk_integrity struct are pointers to fixed template entries in t10-pi.c. Split the static and per-device portions so that we can reference the template directly. Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Cc: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 830f9c07d4bb..f36c6476f1c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,16 +1462,18 @@ struct blk_integrity_iter { typedef int (integrity_processing_fn) (struct blk_integrity_iter *); -struct blk_integrity { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - - unsigned short flags; - unsigned short tuple_size; - unsigned short interval; - unsigned short tag_size; +struct blk_integrity_profile { + integrity_processing_fn *generate_fn; + integrity_processing_fn *verify_fn; + const char *name; +}; - const char *name; +struct blk_integrity { + struct blk_integrity_profile *profile; + unsigned short flags; + unsigned short tuple_size; + unsigned short interval; + unsigned short tag_size; }; extern bool blk_integrity_is_initialized(struct gendisk *); -- cgit v1.2.3 From a48f041d91bf1aee599fa2adb53b780ed20c2ee5 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:38 -0400 Subject: block: Reduce the size of struct blk_integrity The per-device properties in the blk_integrity structure were previously unsigned short. However, most of the values fit inside a char. The only exception is the data interval size and we can work around that by storing it as a power of two. This cuts the size of the dynamic portion of blk_integrity in half. Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f36c6476f1c7..4f1968f15e30 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1470,10 +1470,10 @@ struct blk_integrity_profile { struct blk_integrity { struct blk_integrity_profile *profile; - unsigned short flags; - unsigned short tuple_size; - unsigned short interval; - unsigned short tag_size; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; extern bool blk_integrity_is_initialized(struct gendisk *); -- cgit v1.2.3 From 25520d55cdb6ee289abc68f553d364d22478ff54 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 21 Oct 2015 13:19:49 -0400 Subject: block: Inline blk_integrity in struct gendisk Up until now the_integrity profile has been dynamically allocated and attached to struct gendisk after the disk has been made active. This causes problems because NVMe devices need to register the profile prior to the partition table being read due to a mandatory metadata buffer requirement. In addition, DM goes through hoops to deal with preallocating, but not initializing integrity profiles. Since the integrity profile is small (4 bytes + a pointer), Christoph suggested moving it to struct gendisk proper. This requires several changes: - Moving the blk_integrity definition to genhd.h. - Inlining blk_integrity in struct gendisk. - Removing the dynamic allocation code. - Adding helper functions which allow gendisk to set up and tear down the integrity sysfs dir when a disk is added/deleted. - Adding a blk_integrity_revalidate() callback for updating the stable pages bdi setting. - The calls that depend on whether a device has an integrity profile or not now key off of the bi->profile pointer. - Simplifying the integrity support routines in DM (Mike Snitzer). Signed-off-by: Martin K. Petersen Reported-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Mike Snitzer Cc: Dan Williams Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4f1968f15e30..60669c20190f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1468,16 +1468,7 @@ struct blk_integrity_profile { const char *name; }; -struct blk_integrity { - struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; -}; - -extern bool blk_integrity_is_initialized(struct gendisk *); -extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_register(struct gendisk *, struct blk_integrity *); extern void blk_integrity_unregister(struct gendisk *); extern int blk_integrity_compare(struct gendisk *, struct gendisk *); extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, @@ -1488,15 +1479,20 @@ extern bool blk_integrity_merge_rq(struct request_queue *, struct request *, extern bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); -static inline -struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { - return bdev->bd_disk->integrity; + struct blk_integrity *bi = &disk->integrity; + + if (!bi->profile) + return NULL; + + return bi; } -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +static inline +struct blk_integrity *bdev_get_integrity(struct block_device *bdev) { - return disk->integrity; + return blk_get_integrity(bdev->bd_disk); } static inline bool blk_integrity_rq(struct request *rq) @@ -1570,10 +1566,9 @@ static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) { return 0; } -static inline int blk_integrity_register(struct gendisk *d, +static inline void blk_integrity_register(struct gendisk *d, struct blk_integrity *b) { - return 0; } static inline void blk_integrity_unregister(struct gendisk *d) { @@ -1598,10 +1593,7 @@ static inline bool blk_integrity_merge_bio(struct request_queue *rq, { return true; } -static inline bool blk_integrity_is_initialized(struct gendisk *g) -{ - return 0; -} + static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { -- cgit v1.2.3 From 3ef28e83ab15799742e55fd13243a5f678b04242 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Oct 2015 13:20:12 -0400 Subject: block: generic request_queue reference counting Allow pmem, and other synchronous/bio-based block drivers, to fallback on a per-cpu reference count managed by the core for tracking queue live/dead state. The existing per-cpu reference count for the blk_mq case is promoted to be used in all block i/o scenarios. This involves initializing it by default, waiting for it to drop to zero at exit, and holding a live reference over the invocation of q->make_request_fn() in generic_make_request(). The blk_mq code continues to take its own reference per blk_mq request and retains the ability to freeze the queue, but the check that the queue is frozen is moved to generic_make_request(). This fixes crash signatures like the following: BUG: unable to handle kernel paging request at ffff880140000000 [..] Call Trace: [] ? copy_user_handle_tail+0x5f/0x70 [] pmem_do_bvec.isra.11+0x70/0xf0 [nd_pmem] [] pmem_make_request+0xd1/0x200 [nd_pmem] [] ? mempool_alloc+0x72/0x1a0 [] generic_make_request+0xd6/0x110 [] submit_bio+0x76/0x170 [] submit_bh_wbc+0x12f/0x160 [] submit_bh+0x12/0x20 [] jbd2_write_superblock+0x8d/0x170 [] jbd2_mark_journal_empty+0x5d/0x90 [] jbd2_journal_destroy+0x24b/0x270 [] ? put_pwq_unlocked+0x2a/0x30 [] ? destroy_workqueue+0x225/0x250 [] ext4_put_super+0x64/0x360 [] generic_shutdown_super+0x6a/0xf0 Cc: Jens Axboe Cc: Keith Busch Cc: Ross Zwisler Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Tested-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60669c20190f..3e0465257d68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -450,7 +450,7 @@ struct request_queue { #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; - struct percpu_ref mq_usage_counter; + struct percpu_ref q_usage_counter; struct list_head all_q_node; struct blk_mq_tag_set *tag_set; -- cgit v1.2.3 From ac6fc48c9fb7d3220ec4e0be0c29bb314ea75f9f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Oct 2015 13:20:18 -0400 Subject: block: move blk_integrity to request_queue A trace like the following proceeds a crash in bio_integrity_process() when it goes to use an already freed blk_integrity profile. BUG: unable to handle kernel paging request at ffff8800d31b10d8 IP: [] 0xffff8800d31b10d8 PGD 2f65067 PUD 21fffd067 PMD 80000000d30001e3 Oops: 0011 [#1] SMP Dumping ftrace buffer: --------------------------------- ndctl-2222 2.... 44526245us : disk_release: pmem1s systemd--2223 4.... 44573945us : bio_integrity_endio: pmem1s <...>-409 4.... 44574005us : bio_integrity_process: pmem1s --------------------------------- [..] Call Trace: [] ? bio_integrity_process+0x159/0x2d0 [] bio_integrity_verify_fn+0x36/0x60 [] process_one_work+0x1cc/0x4e0 Given that a request_queue is pinned while i/o is in flight and that a gendisk is allowed to have a shorter lifetime, move blk_integrity to request_queue to satisfy requests arriving after the gendisk has been torn down. Cc: Christoph Hellwig Cc: Martin K. Petersen [martin: fix the CONFIG_BLK_DEV_INTEGRITY=n case] Tested-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3e0465257d68..cf57884db4b7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -369,6 +369,10 @@ struct request_queue { */ struct kobject mq_kobj; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity integrity; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #ifdef CONFIG_PM struct device *dev; int rpm_status; @@ -1481,7 +1485,7 @@ extern bool blk_integrity_merge_bio(struct request_queue *, struct request *, static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { - struct blk_integrity *bi = &disk->integrity; + struct blk_integrity *bi = &disk->queue->integrity; if (!bi->profile) return NULL; -- cgit v1.2.3 From bbd3e064362e5057cc4799ba2e4d68c7593e490b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Oct 2015 14:10:48 +0200 Subject: block: add an API for Persistent Reservations This commits adds a driver API and ioctls for controlling Persistent Reservations s/genericly/generically/ at the block layer. Persistent Reservations are supported by SCSI and NVMe and allow controlling who gets access to a device in a shared storage setup. Note that we add a pr_ops structure to struct block_device_operations instead of adding the members directly to avoid bloating all instances of devices that will never support Persistent Reservations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 19c2e947d4d1..fe25da05e823 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -35,6 +35,7 @@ struct sg_io_hdr; struct bsg_job; struct blkcg_gq; struct blk_flush_queue; +struct pr_ops; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -1633,6 +1634,7 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); struct module *owner; + const struct pr_ops *pr_ops; }; extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, -- cgit v1.2.3 From dece16353ef47d8d33f5302bc158072a9d65e26f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Nov 2015 10:41:16 -0700 Subject: block: change ->make_request_fn() and users to return a queue cookie No functional changes in this patch, but it prepares us for returning a more useful cookie related to the IO that was queued up. Signed-off-by: Jens Axboe Acked-by: Christoph Hellwig Acked-by: Keith Busch --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d045ca8487af..5ee0f5243025 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -209,7 +209,7 @@ static inline unsigned short req_get_ioprio(struct request *req) struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); -typedef void (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -761,7 +761,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void generic_make_request(struct bio *bio); +extern blk_qc_t generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 05229beeddf7e75e2e616ddaad4b70e7fca9528d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Nov 2015 10:44:55 -0700 Subject: block: add block polling support Add basic support for polling for specific IO to complete. This uses the cookie that blk-mq passes back, which enables the block layer to pass this cookie to the driver to spin for a specific request. This will be combined with request latency tracking, so we can make qualified decisions about when to poll and when not to. For now, for benchmark purposes, we add a sysfs file that controls whether polling is enabled or not. Signed-off-by: Jens Axboe Acked-by: Christoph Hellwig Acked-by: Keith Busch --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5ee0f5243025..3fe27f8d91f0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -487,6 +487,7 @@ struct request_queue { #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -814,6 +815,8 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); + static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_disk->queue; /* this is never NULL */ -- cgit v1.2.3 From 2e6edc95382cc36423aff18a237173ad62d5ab52 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 19 Nov 2015 13:29:28 -0800 Subject: block: protect rw_page against device teardown Fix use after free crashes like the following: general protection fault: 0000 [#1] SMP Call Trace: [] ? pmem_do_bvec.isra.12+0xa6/0xf0 [nd_pmem] [] pmem_rw_page+0x42/0x80 [nd_pmem] [] bdev_read_page+0x50/0x60 [] do_mpage_readpage+0x510/0x770 [] ? I_BDEV+0x20/0x20 [] ? lru_cache_add+0x1c/0x50 [] mpage_readpages+0x107/0x170 [] ? I_BDEV+0x20/0x20 [] ? I_BDEV+0x20/0x20 [] blkdev_readpages+0x1d/0x20 [] __do_page_cache_readahead+0x28f/0x310 [] ? __do_page_cache_readahead+0x169/0x310 [] ? pagecache_get_page+0x2d/0x1d0 [] filemap_fault+0x396/0x530 [] __do_fault+0x4e/0xf0 [] handle_mm_fault+0x11bd/0x1b50 Cc: Cc: Jens Axboe Cc: Alexander Viro Reported-by: kbuild test robot Acked-by: Matthew Wilcox [willy: symmetry fixups] Signed-off-by: Dan Williams --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3fe27f8d91f0..c0d2b7927c1f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,6 +794,8 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); +extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit v1.2.3