From 6cc77e9cb08041627fe1d32ac3a743249deb8167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Dec 2017 15:43:38 +0900 Subject: block: introduce zoned block devices zone write locking Components relying only on the request_queue structure for accessing block devices (e.g. I/O schedulers) have a limited knowledge of the device characteristics. In particular, the device capacity cannot be easily discovered, which for a zoned block device also results in the inability to easily know the number of zones of the device (the zone size is indicated by the chunk_sectors field of the queue limits). Introduce the nr_zones field to the request_queue structure to simplify access to this information. Also, add the bitmap seq_zones_bitmap which indicates which zones of the device are sequential zones (write preferred or write required) and the bitmap seq_zones_wlock which indicates if a zone is write locked, that is, if a write request targeting a zone was dispatched to the device. These fields are initialized by the low level block device driver (sd.c for ZBC/ZAC disks). They are not initialized by stacking drivers (device mappers) handling zoned block devices (e.g. dm-linear). Using this, I/O schedulers can introduce zone write locking to control request dispatching to a zoned block device and avoid write request reordering by limiting to at most a single write request per zone outside of the scheduler at any time. Based on previous patches from Damien Le Moal. Signed-off-by: Christoph Hellwig [Damien] * Fixed comments and indentation in blkdev.h * Changed helper functions * Fixed this commit message Signed-off-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..46e606f5b44b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -546,6 +548,22 @@ struct request_queue { struct queue_limits limits; + /* + * Zoned block device information for request dispatch control. + * nr_zones is the total number of zones of the device. This is always + * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones + * bits which indicates if a zone is conventional (bit clear) or + * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones + * bits which indicates if a zone is write locked, that is, if a write + * request targeting the zone was dispatched. All three fields are + * initialized by the low level device driver (e.g. scsi/sd.c). + * Stacking drivers (device mappers) may or may not initialize + * these fields. + */ + unsigned int nr_zones; + unsigned long *seq_zones_bitmap; + unsigned long *seq_zones_wlock; + /* * sg stuff */ @@ -790,6 +808,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? 
q->limits.chunk_sectors : 0; } +static inline unsigned int blk_queue_nr_zones(struct request_queue *q) +{ + return q->nr_zones; +} + +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q)) + return 0; + return sector >> ilog2(q->limits.chunk_sectors); +} + +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) + return false; + return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); +} + static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); @@ -1029,6 +1068,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. 
Any driver that supports such @@ -1578,7 +1627,15 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) if (q) return blk_queue_zone_sectors(q); + return 0; +} +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_nr_zones(q); return 0; } @@ -1954,6 +2011,60 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); + +#ifdef CONFIG_BLK_DEV_ZONED +bool blk_req_needs_zone_write_lock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + #else /* CONFIG_BLOCK */ struct block_device; -- cgit 
v1.2.3 From 1d9bd5161ba32db5665a617edc8b0723880f543e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:48 -0800 Subject: blk-mq: replace timeout synchronization with a RCU and generation based scheme Currently, blk-mq timeout path synchronizes against the usual issue/completion path using a complex scheme involving atomic bitflags, REQ_ATOM_*, memory barriers and subtle memory coherence rules. Unfortunately, it contains quite a few holes. There's a complex dancing around REQ_ATOM_STARTED and REQ_ATOM_COMPLETE between issue/completion and timeout paths; however, they don't have a synchronization point across request recycle instances and it isn't clear what the barriers add. blk_mq_check_expired() can easily read STARTED from N-2'th iteration, deadline from N-1'th, blk_mark_rq_complete() against Nth instance. In fact, it's pretty easy to make blk_mq_check_expired() terminate a later instance of a request. If we induce 5 sec delay before time_after_eq() test in blk_mq_check_expired(), shorten the timeout to 2s, and issue back-to-back large IOs, blk-mq starts timing out requests spuriously pretty quickly. Nothing actually timed out. It just made the call on a recycle instance of a request and then terminated a later instance long after the original instance finished. The scenario isn't theoretical either. This patch replaces the broken synchronization mechanism with a RCU and generation number based one. 1. Each request has a u64 generation + state value, which can be updated only by the request owner. Whenever a request becomes in-flight, the generation number gets bumped up too. This provides the basis for the timeout path to distinguish different recycle instances of the request. Also, marking a request in-flight and setting its deadline are protected with a seqcount so that the timeout path can fetch both values coherently. 2. The timeout path fetches the generation, state and deadline. 
If the verdict is timeout, it records the generation into a dedicated request abortion field and does RCU wait. 3. The completion path is also protected by RCU (from the previous patch) and checks whether the current generation number and state match the abortion field. If so, it skips completion. 4. The timeout path, after RCU wait, scans requests again and terminates the ones whose generation and state still match the ones requested for abortion. By now, the timeout path knows that either the generation number and state changed if it lost the race or the completion will yield to it and can safely timeout the request. While it's more lines of code, it's conceptually simpler, doesn't depend on direct use of subtle memory ordering or coherence, and hopefully doesn't terminate the wrong instance. While this change makes REQ_ATOM_COMPLETE synchronization unnecessary between issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't removed yet as it's still used in other places. Future patches will move all state tracking to the new mechanism and remove all bitops in the hot paths. Note that this patch adds a comment explaining a race condition in BLK_EH_RESET_TIMER path. The race has always been there and this patch doesn't change it. It's just documenting the existing race. v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao. - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter. - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter. v3: - Fixed possible extended seqcount / u64_stats_sync read looping spotted by Peter. - MQ_RQ_IDLE was incorrectly being set in complete_request instead of free_request. Fixed. v4: - Rebased on top of hctx_lock() refactoring patch. - Added comment explaining the use of hctx_lock() in completion path. v5: - Added comments requested by Bart. - Note the addition of BLK_EH_RESET_TIMER race condition in the commit message. 
Signed-off-by: Tejun Heo Cc: "jianchao.wang" Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46e606f5b44b..ae563d01b29d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,8 @@ #include #include #include +#include +#include struct module; struct scsi_ioctl_command; @@ -230,6 +232,27 @@ struct request { unsigned short write_hint; + /* + * On blk-mq, the lower bits of ->gstate (generation number and + * state) carry the MQ_RQ_* state value and the upper bits the + * generation number which is monotonically incremented and used to + * distinguish the reuse instances. + * + * ->gstate_seq allows updates to ->gstate and other fields + * (currently ->deadline) during request start to be read + * atomically from the timeout path, so that it can operate on a + * coherent set of information. + */ + seqcount_t gstate_seq; + u64 gstate; + + /* + * ->aborted_gstate is used by the timeout to claim a specific + * recycle instance of this request. See blk_mq_timeout_work(). + */ + struct u64_stats_sync aborted_gstate_sync; + u64 aborted_gstate; + unsigned long deadline; struct list_head timeout_list; -- cgit v1.2.3 From 634f9e4631a88025d3b90c1884e9a1b6a13d01d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 08:29:51 -0800 Subject: blk-mq: remove REQ_ATOM_COMPLETE usages from blk-mq After the recent updates to use generation number and state based synchronization, blk-mq no longer depends on REQ_ATOM_COMPLETE except to avoid firing the same timeout multiple times. Remove all REQ_ATOM_COMPLETE usages and use a new rq_flags flag RQF_MQ_TIMEOUT_EXPIRED to avoid firing the same timeout multiple times. This removes atomic bitops from hot paths too. v2: Removed blk_clear_rq_complete() from blk_mq_rq_timed_out(). 
v3: Added RQF_MQ_TIMEOUT_EXPIRED flag. Signed-off-by: Tejun Heo Cc: "jianchao.wang" Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ae563d01b29d..007a7cf1f262 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -125,6 +125,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* timeout is expired */ +#define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 76a86f9d027b342b8759a4b2f9f7fe046e284220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:30:56 -0700 Subject: block: remove REQ_ATOM_POLL_SLEPT We don't need this to be an atomic flag, it can be a regular flag. We either end up on the same CPU for the polling, in which case the state is sane, or we did the sleep which would imply the needed barrier to ensure we see the right state. 
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 007a7cf1f262..ba31674d8581 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* timeout is expired */ #define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 0a72e7f44964b9ada3e5c15820372e9cb119bf80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jan 2018 14:23:42 -0700 Subject: block: add accessors for setting/querying request deadline We reduce the resolution of request expiry, but since we're already using jiffies for this where resolution depends on the kernel configuration and since the timeout resolution is coarse anyway, that should be fine. 
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ba31674d8581..aa6698cf483c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -257,7 +257,9 @@ struct request { struct u64_stats_sync aborted_gstate_sync; u64 aborted_gstate; - unsigned long deadline; + /* access through blk_rq_set_deadline, blk_rq_deadline */ + unsigned long __deadline; + struct list_head timeout_list; /* -- cgit v1.2.3 From e14575b3d457f5806d79b85886ef94d9c29e3b2a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:34:25 -0700 Subject: block: convert REQ_ATOM_COMPLETE to stealing rq->__deadline bit We only have one atomic flag left. Instead of using an entire unsigned long for that, steal the bottom bit of the deadline field that we already reserved. Remove ->atomic_flags, since it's now unused. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aa6698cf483c..d4b2f7bb18d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -156,8 +156,6 @@ struct request { int internal_tag; - unsigned long atomic_flags; - /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ int tag; -- cgit v1.2.3 From 7c3fb70f0341f9d924818e648906774921f4bcb3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2018 11:46:39 -0700 Subject: block: rearrange a few request fields for better cache layout Move completion related items (like the call single data) near the end of the struct, instead of mixing them in with the initial queueing related fields. Move queuelist below the bio structures. 
Then we have all queueing related bits in the first cache line. This yields a 1.5-2% increase in IOPS for a null_blk test, both for sync and for high thread count access. Sync test goes from 975K to 992K, 32-thread case from 20.8M to 21.2M IOPS. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4b2f7bb18d6..71a9371c8182 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t; * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { - struct list_head queuelist; - union { - call_single_data_t csd; - u64 fifo_time; - }; - struct request_queue *q; struct blk_mq_ctx *mq_ctx; @@ -164,6 +158,8 @@ struct request { struct bio *bio; struct bio *biotail; + struct list_head queuelist; + /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used @@ -211,19 +207,16 @@ struct request { struct hd_struct *part; unsigned long start_time; struct blk_issue_stat issue_stat; -#ifdef CONFIG_BLK_CGROUP - struct request_list *rl; /* rl this rq is alloced from */ - unsigned long long start_time_ns; - unsigned long long io_start_time_ns; /* when passed to hardware */ -#endif /* Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. 
*/ unsigned short nr_phys_segments; + #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif + unsigned short write_hint; unsigned short ioprio; unsigned int timeout; @@ -232,8 +225,6 @@ struct request { unsigned int extra_len; /* length of alignment and padding */ - unsigned short write_hint; - /* * On blk-mq, the lower bits of ->gstate (generation number and * state) carry the MQ_RQ_* state value and the upper bits the @@ -260,6 +251,11 @@ struct request { struct list_head timeout_list; + union { + call_single_data_t csd; + u64 fifo_time; + }; + /* * completion callback. */ @@ -268,6 +264,12 @@ struct request { /* for bidi */ struct request *next_rq; + +#ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ + unsigned long long start_time_ns; + unsigned long long io_start_time_ns; /* when passed to hardware */ +#endif }; static inline bool blk_rq_is_scsi(struct request *rq) -- cgit v1.2.3 From f5ced52aaa5494c1feb9f80252cb2a2cde0dace8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 Jan 2018 08:58:56 -0800 Subject: block: Remove kblockd_schedule_delayed_work{,_on}() The previous patch removed all users of these two functions. Hence also remove the functions themselves. 
Reviewed-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 71a9371c8182..afc43fb63c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1800,8 +1800,6 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3